apollo-crawler 0.1.24 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/apollo_crawler.rb +3 -0
- data/lib/apollo_crawler/agent/agents.rb +3 -0
- data/lib/apollo_crawler/agent/base_agent.rb +2 -0
- data/lib/apollo_crawler/agent/crawler_agent.rb +77 -0
- data/lib/apollo_crawler/agent/domainer_agent.rb +51 -0
- data/lib/apollo_crawler/agent/exchanges.rb +55 -0
- data/lib/apollo_crawler/agent/fetcher_agent.rb +62 -12
- data/lib/apollo_crawler/crawler/base_crawler.rb +2 -2
- data/lib/apollo_crawler/crawler/slashdot_crawler.rb +1 -8
- data/lib/apollo_crawler/fetcher/base_fetcher.rb +17 -4
- data/lib/apollo_crawler/lib.rb +3 -0
- data/lib/apollo_crawler/model/data_source.rb +37 -0
- data/lib/apollo_crawler/model/domain.rb +37 -0
- data/lib/apollo_crawler/model/models.rb +3 -1
- data/lib/apollo_crawler/model/queued_url.rb +2 -1
- data/lib/apollo_crawler/model/raw_document.rb +5 -1
- data/lib/apollo_crawler/planner/smart_planner.rb +76 -18
- data/lib/apollo_crawler/program/platform_program.rb +40 -8
- data/lib/apollo_crawler/scheduler/base_scheduler.rb +39 -0
- data/lib/apollo_crawler/scheduler/schedulers.rb +21 -0
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79e9ecdfed577a1ce13b74b24d6d5bc26bf75843
|
4
|
+
data.tar.gz: 6d93c6da6316d4666ddc5e434bab1caadc213ba3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 863d10a255722bd53c9ee998e2886fd86d04cf7808284323d33f7da1fe77fc99ac0874f1e2a61f20c8656e039bc3bac2d2a9cef911e9b6e6d12266e91636b3bc
|
7
|
+
data.tar.gz: db77a2d4606dcecbec1ae2e0d872cc41f6fea64012280e179b1147d8bbabfe7eef5446bf364978ee930f338cb367624e30a86e11fe0613e4499e03dcbc670e4b
|
data/lib/apollo_crawler.rb
CHANGED
@@ -57,5 +57,8 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
|
|
57
57
|
# Program
|
58
58
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/program/programs')
|
59
59
|
|
60
|
+
# Scheduler
|
61
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/scheduler/schedulers')
|
62
|
+
|
60
63
|
# Stores
|
61
64
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/store/stores')
|
@@ -18,5 +18,8 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
21
22
|
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
require File.join(File.dirname(__FILE__), 'domainer_agent')
|
24
|
+
require File.join(File.dirname(__FILE__), 'crawler_agent')
|
22
25
|
require File.join(File.dirname(__FILE__), 'fetcher_agent')
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
22
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
24
|
+
require File.join(File.dirname(__FILE__), '../crawler/crawlers')
|
25
|
+
|
26
|
+
require 'nokogiri'
|
27
|
+
|
28
|
+
module Apollo
|
29
|
+
module Agent
|
30
|
+
class CrawlerAgent < BaseAgent
|
31
|
+
attr_accessor :declarations
|
32
|
+
|
33
|
+
def initialize(amqp, opts={})
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "Initializing crawler agent..."
|
36
|
+
end
|
37
|
+
|
38
|
+
# Declarations
|
39
|
+
channel = amqp.create_channel
|
40
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
41
|
+
|
42
|
+
# Binding
|
43
|
+
declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
|
44
|
+
puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
|
45
|
+
|
46
|
+
msg = JSON.parse(payload)
|
47
|
+
|
48
|
+
request = msg['request']
|
49
|
+
response = msg['response']
|
50
|
+
|
51
|
+
# puts "PLANEEEER: #{msg.inspect}"
|
52
|
+
|
53
|
+
doc = Nokogiri::HTML(response['body'])
|
54
|
+
crawler = request['crawler_name'].constantize.new
|
55
|
+
data = crawler.extract_data(doc)
|
56
|
+
links = crawler.extract_links(doc)
|
57
|
+
|
58
|
+
# puts crawler.to_s
|
59
|
+
# puts res.inspect
|
60
|
+
|
61
|
+
if(metadata[:reply_to] != nil)
|
62
|
+
x = declarations[:exchanges][metadata[:reply_to]]
|
63
|
+
|
64
|
+
msg = {
|
65
|
+
:request => request,
|
66
|
+
:response => response,
|
67
|
+
:data => data,
|
68
|
+
:links => links
|
69
|
+
}
|
70
|
+
|
71
|
+
x.publish(msg.to_json)
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end # class CrawlerAgent
|
76
|
+
end # module Agent
|
77
|
+
end # module Apollo
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
22
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
24
|
+
require File.join(File.dirname(__FILE__), '../crawler/crawlers')
|
25
|
+
|
26
|
+
require 'nokogiri'
|
27
|
+
|
28
|
+
module Apollo
|
29
|
+
module Agent
|
30
|
+
class DomainerAgent < BaseAgent
|
31
|
+
attr_accessor :declarations
|
32
|
+
|
33
|
+
def initialize(amqp, opts={})
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "Initializing crawler agent..."
|
36
|
+
end
|
37
|
+
|
38
|
+
# Declarations
|
39
|
+
channel = amqp.create_channel
|
40
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
41
|
+
|
42
|
+
# Binding
|
43
|
+
declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
|
44
|
+
puts "DomainerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
|
45
|
+
|
46
|
+
msg = JSON.parse(payload)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end # class DomainerAgent
|
50
|
+
end # module Agent
|
51
|
+
end # module Apollo
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Agent
|
23
|
+
def self.declare_exchanges(channel, opts={})
|
24
|
+
if(opts[:verbose])
|
25
|
+
puts "Declaring AMQP Exchanges"
|
26
|
+
end
|
27
|
+
|
28
|
+
# Exchanges
|
29
|
+
exchanges = {}
|
30
|
+
exchanges["crawler"] = channel.direct("crawler", :auto_delete => false, :durable => true)
|
31
|
+
exchanges["domainer"] = channel.direct("domainer", :auto_delete => false, :durable => true)
|
32
|
+
exchanges["fetcher"] = channel.direct("fetcher", :auto_delete => false, :durable => true)
|
33
|
+
exchanges["planner.crawled"] = channel.direct("planner.crawled", :auto_delete => false, :durable => true)
|
34
|
+
exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
|
35
|
+
exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
|
36
|
+
|
37
|
+
# Queues
|
38
|
+
queues = {}
|
39
|
+
queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
|
40
|
+
queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
|
41
|
+
queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
|
42
|
+
queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
|
43
|
+
queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
|
44
|
+
queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
|
45
|
+
|
46
|
+
# Compose res
|
47
|
+
res = {
|
48
|
+
:exchanges => exchanges,
|
49
|
+
:queues => queues
|
50
|
+
}
|
51
|
+
|
52
|
+
return res
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -18,13 +18,22 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require File.join(File.dirname(__FILE__), 'exchanges')
|
21
22
|
require File.join(File.dirname(__FILE__), 'base_agent')
|
23
|
+
|
22
24
|
require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
|
23
25
|
|
26
|
+
require 'digest/sha1'
|
27
|
+
require 'thread/pool'
|
28
|
+
|
24
29
|
module Apollo
|
25
30
|
module Agent
|
26
31
|
class FetcherAgent < BaseAgent
|
32
|
+
THREAD_POOL_SIZE = 10
|
33
|
+
|
27
34
|
attr_accessor :fetcher
|
35
|
+
attr_accessor :declarations
|
36
|
+
attr_accessor :thread_pool
|
28
37
|
|
29
38
|
def initialize(amqp, opts={})
|
30
39
|
self.fetcher = Apollo::Fetcher::SmartFetcher.new
|
@@ -33,23 +42,64 @@ module Apollo
|
|
33
42
|
puts "Initializing fetcher agent..."
|
34
43
|
end
|
35
44
|
|
36
|
-
|
37
|
-
q = ch.queue("fetcher", :auto_delete => false, :durable => true)
|
38
|
-
x = ch.default_exchange
|
45
|
+
thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
|
39
46
|
|
40
|
-
|
41
|
-
|
47
|
+
# Declarations
|
48
|
+
channel = amqp.create_channel
|
49
|
+
declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
42
50
|
|
43
|
-
|
44
|
-
|
45
|
-
|
51
|
+
# Binding
|
52
|
+
declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
|
53
|
+
thread_pool.process {
|
46
54
|
queued_url = JSON.parse(payload)
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
55
|
+
url = queued_url["url"]
|
56
|
+
|
57
|
+
puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
|
58
|
+
|
59
|
+
doc = nil
|
60
|
+
begin
|
61
|
+
doc = Apollo::Fetcher::SmartFetcher::fetch(url)
|
62
|
+
rescue Exception => e
|
63
|
+
puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
64
|
+
end
|
65
|
+
|
66
|
+
doc = get_fetched_doc(queued_url, doc, metadata, opts)
|
67
|
+
|
68
|
+
# send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
|
69
|
+
|
70
|
+
if(metadata && metadata[:reply_to])
|
71
|
+
exchange_name = metadata[:reply_to]
|
72
|
+
|
73
|
+
if(exchange_name != nil)
|
74
|
+
msg = get_response_msg(queued_url, doc)
|
75
|
+
|
76
|
+
x = declarations[:exchanges][exchange_name]
|
77
|
+
x.publish(msg.to_json)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
}
|
51
81
|
end
|
52
82
|
end
|
83
|
+
|
84
|
+
def get_fetched_doc(queued_url, doc, metadata, opts={})
|
85
|
+
url = queued_url["url"]
|
86
|
+
|
87
|
+
res = Apollo::Model::RawDocument.new
|
88
|
+
res.headers = doc.headers
|
89
|
+
res.body = doc.body
|
90
|
+
res.sha_hash = Digest::SHA1.hexdigest(doc.body)
|
91
|
+
res.status = doc.status
|
92
|
+
res.url = url
|
93
|
+
|
94
|
+
return res
|
95
|
+
end
|
96
|
+
|
97
|
+
def get_response_msg(queued_url, doc)
|
98
|
+
return {
|
99
|
+
:request => queued_url,
|
100
|
+
:response => doc
|
101
|
+
}
|
102
|
+
end
|
53
103
|
end # class FetcherAgent
|
54
104
|
end # module Agent
|
55
105
|
end # module Apollo
|
@@ -172,8 +172,8 @@ module Apollo
|
|
172
172
|
def self.create_metadoc(url, doc)
|
173
173
|
return {
|
174
174
|
'url' => url,
|
175
|
-
'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
176
|
-
'hash' => Digest::SHA256.new.update(doc).hexdigest,
|
175
|
+
'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
176
|
+
'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
|
177
177
|
'created_at' => Time.now.utc,
|
178
178
|
'expires_at' => nil,
|
179
179
|
'version' => 0
|
@@ -46,14 +46,7 @@ module Apollo
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def extract_links(doc)
|
49
|
-
|
50
|
-
url = BaseCrawler.try_get_url(self.url, node['href']).to_s
|
51
|
-
next if url.nil?
|
52
|
-
|
53
|
-
{
|
54
|
-
:link => url
|
55
|
-
}
|
56
|
-
}
|
49
|
+
return []
|
57
50
|
end
|
58
51
|
end # class SlashdotCrawler
|
59
52
|
end # module Crawler
|
@@ -24,6 +24,9 @@ require "open-uri"
|
|
24
24
|
require 'faraday'
|
25
25
|
require 'ipaddr'
|
26
26
|
|
27
|
+
# require 'resolv'
|
28
|
+
# require 'resolv-replace'
|
29
|
+
|
27
30
|
module Apollo
|
28
31
|
module Fetcher
|
29
32
|
class BaseFetcher
|
@@ -36,7 +39,12 @@ module Apollo
|
|
36
39
|
end
|
37
40
|
|
38
41
|
def self.fetch(url, options = {})
|
39
|
-
|
42
|
+
begin
|
43
|
+
uri = URI.parse(url.to_s)
|
44
|
+
rescue Exception => e
|
45
|
+
puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
|
46
|
+
return nil
|
47
|
+
end
|
40
48
|
|
41
49
|
# See https://github.com/lostisland/faraday
|
42
50
|
conn = Faraday.new(:url => url) do |faraday|
|
@@ -46,12 +54,17 @@ module Apollo
|
|
46
54
|
end
|
47
55
|
|
48
56
|
# Make request
|
49
|
-
|
50
|
-
|
57
|
+
begin
|
58
|
+
res = conn.get(uri) do |request|
|
59
|
+
request.headers = BaseFetcher.get_fake_headers(uri)
|
60
|
+
end
|
61
|
+
rescue Exception => e
|
62
|
+
puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
|
63
|
+
return nil
|
51
64
|
end
|
52
65
|
|
53
66
|
# Return result
|
54
|
-
return res
|
67
|
+
return res
|
55
68
|
end
|
56
69
|
end # class BaseFetcher
|
57
70
|
end # module Fetcher
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -54,5 +54,8 @@ require File.join(File.dirname(__FILE__), 'planner/planners')
|
|
54
54
|
# Programs
|
55
55
|
require File.join(File.dirname(__FILE__), 'program/programs')
|
56
56
|
|
57
|
+
# Programs
|
58
|
+
require File.join(File.dirname(__FILE__), 'scheduler/schedulers')
|
59
|
+
|
57
60
|
# Stores
|
58
61
|
require File.join(File.dirname(__FILE__), 'store/stores')
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class DataSource < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "data_sources"
|
30
|
+
|
31
|
+
field :url
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1, url: 1 })
|
35
|
+
end # class DataSource
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class Domain < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "domains"
|
30
|
+
|
31
|
+
field :name
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1, name: 1 })
|
35
|
+
end # class Domain
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -20,5 +20,7 @@
|
|
20
20
|
|
21
21
|
require File.join(File.dirname(__FILE__), 'base_model')
|
22
22
|
require File.join(File.dirname(__FILE__), 'crawler')
|
23
|
+
require File.join(File.dirname(__FILE__), 'data_source')
|
24
|
+
require File.join(File.dirname(__FILE__), 'domain')
|
23
25
|
require File.join(File.dirname(__FILE__), 'queued_url')
|
24
|
-
require File.join(File.dirname(__FILE__), 'raw_document')
|
26
|
+
require File.join(File.dirname(__FILE__), 'raw_document')
|
@@ -30,9 +30,10 @@ module Apollo
|
|
30
30
|
|
31
31
|
field :url
|
32
32
|
field :state
|
33
|
+
field :crawler_name
|
33
34
|
|
34
35
|
# Indexes
|
35
|
-
index({ created_at: 1, updated_at: 1 })
|
36
|
+
index({ created_at: 1, updated_at: 1, crawler_name: 1 })
|
36
37
|
end # class QueuedUrl
|
37
38
|
end # module Model
|
38
39
|
end # module Apollo
|
@@ -28,10 +28,14 @@ module Apollo
|
|
28
28
|
|
29
29
|
store_in collection: "raw_docs"
|
30
30
|
|
31
|
+
field :url
|
32
|
+
field :headers
|
31
33
|
field :body
|
34
|
+
field :status
|
35
|
+
field :sha_hash
|
32
36
|
|
33
37
|
# Indexes
|
34
|
-
index({ created_at: 1, updated_at: 1 })
|
38
|
+
index({ created_at: 1, updated_at: 1, sha_hash: 1 })
|
35
39
|
end # class RawDocument
|
36
40
|
end # module Model
|
37
41
|
end # module Apollo
|
@@ -22,43 +22,101 @@ require File.join(File.dirname(__FILE__),'base_planner')
|
|
22
22
|
|
23
23
|
require File.join(File.dirname(__FILE__),'../model/models.rb')
|
24
24
|
|
25
|
+
require File.join(File.dirname(__FILE__),'../agent/exchanges.rb')
|
26
|
+
|
27
|
+
require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
|
28
|
+
|
29
|
+
require 'nokogiri'
|
30
|
+
|
25
31
|
module Apollo
|
26
32
|
module Planner
|
27
33
|
class SmartPlanner < BasePlanner
|
28
34
|
attr_accessor :amqp
|
29
35
|
attr_accessor :mongo
|
36
|
+
attr_accessor :declarations
|
30
37
|
|
31
|
-
def initialize(amqp=nil, mongo=nil)
|
38
|
+
def initialize(amqp=nil, mongo=nil, opts={})
|
32
39
|
self.amqp = amqp
|
33
40
|
self.mongo = mongo
|
34
|
-
end
|
35
41
|
|
36
|
-
|
37
|
-
|
42
|
+
# Declarations
|
43
|
+
channel = amqp.create_channel
|
44
|
+
self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
|
38
45
|
|
39
|
-
|
40
|
-
|
41
|
-
|
46
|
+
# Bindings
|
47
|
+
declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
|
48
|
+
msg = JSON.parse(payload)
|
42
49
|
|
43
|
-
|
50
|
+
request = msg['request']
|
51
|
+
response = msg['response']
|
44
52
|
|
45
|
-
|
46
|
-
|
47
|
-
|
53
|
+
doc = Apollo::Model::QueuedUrl.find(request["_id"])
|
54
|
+
doc.update_attributes(msg['request'])
|
55
|
+
doc.state = "fetched"
|
56
|
+
doc.save
|
57
|
+
|
58
|
+
doc = Apollo::Model::RawDocument.where(:url => request['url']).first
|
59
|
+
if doc
|
60
|
+
if doc.sha_hash != response['sha_hash']
|
61
|
+
puts "Removing old cached version of '#{request['url']}'" if opts[:verbose]
|
62
|
+
|
63
|
+
doc.destroy
|
64
|
+
doc = nil
|
65
|
+
else
|
66
|
+
puts "Using cached version of '#{request['url']}'" if opts[:verbose]
|
67
|
+
end
|
68
|
+
else
|
69
|
+
doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
|
70
|
+
end
|
71
|
+
|
72
|
+
if(doc.nil?)
|
73
|
+
doc = Apollo::Model::RawDocument.new(response).save
|
74
|
+
end
|
75
|
+
|
76
|
+
# Publish
|
77
|
+
declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
|
78
|
+
end
|
79
|
+
|
80
|
+
declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
|
81
|
+
msg = JSON.parse(payload)
|
48
82
|
|
83
|
+
puts "DOMAINED !!!"
|
84
|
+
end
|
85
|
+
|
86
|
+
declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
|
87
|
+
msg = JSON.parse(payload)
|
88
|
+
|
89
|
+
request = msg['request']
|
90
|
+
response = msg['response']
|
91
|
+
data = msg['data']
|
92
|
+
links = msg['links']
|
93
|
+
links = [] if links.nil?
|
94
|
+
|
95
|
+
links.each do |url|
|
96
|
+
link = url['link']
|
97
|
+
|
98
|
+
Apollo::Scheduler::BaseScheduler::schedule(link, request['crawler_name'])
|
99
|
+
end
|
100
|
+
|
101
|
+
# puts JSON.pretty_generate(data)
|
102
|
+
# puts JSON.pretty_generate(links)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def fetch_url(url, opts={})
|
49
107
|
if(opts[:verbose])
|
50
|
-
puts "
|
108
|
+
puts "AMQP fetching '#{url.inspect}'"
|
51
109
|
end
|
52
110
|
|
53
|
-
|
111
|
+
# Publish
|
112
|
+
declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
|
113
|
+
end
|
54
114
|
|
55
|
-
|
56
|
-
|
57
|
-
url.
|
115
|
+
def fetch_queued_urls(opts={})
|
116
|
+
while url = Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
|
117
|
+
# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
|
58
118
|
|
59
119
|
fetch_url(url, opts)
|
60
|
-
|
61
|
-
# puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
|
62
120
|
end
|
63
121
|
end
|
64
122
|
|
@@ -37,6 +37,7 @@ require 'eventmachine'
|
|
37
37
|
require 'em-http'
|
38
38
|
|
39
39
|
require 'fileutils'
|
40
|
+
require 'csv'
|
40
41
|
|
41
42
|
require 'mongoid'
|
42
43
|
|
@@ -44,6 +45,8 @@ require File.join(File.dirname(__FILE__), '..', 'version')
|
|
44
45
|
|
45
46
|
require File.join(File.dirname(__FILE__),'base_program')
|
46
47
|
|
48
|
+
require File.join(File.dirname(__FILE__),'../scheduler/schedulers')
|
49
|
+
|
47
50
|
module Apollo
|
48
51
|
# Apollo Crawler Base Directory
|
49
52
|
APOLLO_PLATFORM_BASE_DIR = File.join(File.dirname(__FILE__), "..")
|
@@ -97,29 +100,56 @@ module Apollo
|
|
97
100
|
def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
|
98
101
|
crawlers.each do |crawler|
|
99
102
|
i = crawler.new
|
100
|
-
|
101
|
-
qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
|
102
|
-
qu.save
|
103
|
+
Apollo::Scheduler::BaseScheduler::schedule(i.url, crawler)
|
103
104
|
end
|
104
105
|
end
|
105
106
|
|
107
|
+
def init_crawlers(amqp, opts={})
|
108
|
+
crawlers = []
|
109
|
+
crawlers << Apollo::Agent::CrawlerAgent.new(amqp, self.options)
|
110
|
+
end
|
111
|
+
|
112
|
+
def init_domainers(amqp, opts={})
|
113
|
+
domainers = []
|
114
|
+
domainers << Apollo::Agent::DomainerAgent.new(amqp, self.options)
|
115
|
+
end
|
116
|
+
|
106
117
|
def init_fetchers(amqp, opts={})
|
107
118
|
fetchers = []
|
108
119
|
fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
|
109
120
|
|
121
|
+
# TODO: This should not be here!
|
110
122
|
enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
|
111
|
-
|
112
|
-
# ch = self.amqp.create_channel
|
113
|
-
# x = ch.default_exchange
|
114
|
-
# x.publish("Hello!", :routing_key => "fetcher")
|
115
123
|
end
|
116
124
|
|
117
125
|
def init_agents(amqp, opts={})
|
118
126
|
puts "Initializing agents"
|
119
127
|
|
128
|
+
init_crawlers(amqp, opts)
|
129
|
+
init_domainers(amqp, opts)
|
120
130
|
init_fetchers(amqp, opts)
|
121
131
|
end
|
122
132
|
|
133
|
+
def init_domains(opts={})
|
134
|
+
path = File.join(File.dirname(__FILE__), "../../../tmp/top-1m.csv")
|
135
|
+
puts "#{path}"
|
136
|
+
if(File.exists?(path) == false)
|
137
|
+
return 0
|
138
|
+
end
|
139
|
+
|
140
|
+
Thread::new {
|
141
|
+
CSV.foreach(path) do |row|
|
142
|
+
name = row[1]
|
143
|
+
domain = Apollo::Model::Domain.where({:name => name}).first()
|
144
|
+
if(domain.nil?)
|
145
|
+
domain = Apollo::Model::Domain.new({:name => name})
|
146
|
+
domain.save
|
147
|
+
print "."
|
148
|
+
end
|
149
|
+
end
|
150
|
+
}
|
151
|
+
end
|
152
|
+
|
123
153
|
def init_program(args)
|
124
154
|
res = super(args)
|
125
155
|
return res unless res.nil?
|
@@ -148,6 +178,8 @@ module Apollo
|
|
148
178
|
res = super(args)
|
149
179
|
return res unless res.nil?
|
150
180
|
|
181
|
+
init_domains()
|
182
|
+
|
151
183
|
# Here we start
|
152
184
|
# if(ARGV.length < 1)
|
153
185
|
# puts optparser
|
@@ -156,7 +188,7 @@ module Apollo
|
|
156
188
|
|
157
189
|
res_code = 0
|
158
190
|
if(self.options[:daemon])
|
159
|
-
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
|
191
|
+
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo, self.options)
|
160
192
|
res_code = planner.run(self.options)
|
161
193
|
end
|
162
194
|
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), "../model/models")
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Scheduler
|
25
|
+
class BaseScheduler
|
26
|
+
def self.schedule(url, crawler=nil, opts={})
|
27
|
+
queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
|
28
|
+
|
29
|
+
if(queued_url.nil?)
|
30
|
+
qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
|
31
|
+
qu.save
|
32
|
+
return qu
|
33
|
+
end
|
34
|
+
|
35
|
+
return nil
|
36
|
+
end
|
37
|
+
end # class BaseScheduler
|
38
|
+
end # module Scheduler
|
39
|
+
end # module Apollo
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_scheduler')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.24
|
4
|
+
version: 0.1.25
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
11
|
+
date: 2013-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -414,6 +414,9 @@ files:
|
|
414
414
|
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
415
415
|
- ./lib/apollo_crawler/agent/agents.rb
|
416
416
|
- ./lib/apollo_crawler/agent/base_agent.rb
|
417
|
+
- ./lib/apollo_crawler/agent/crawler_agent.rb
|
418
|
+
- ./lib/apollo_crawler/agent/domainer_agent.rb
|
419
|
+
- ./lib/apollo_crawler/agent/exchanges.rb
|
417
420
|
- ./lib/apollo_crawler/agent/fetcher_agent.rb
|
418
421
|
- ./lib/apollo_crawler/cache/base_cache.rb
|
419
422
|
- ./lib/apollo_crawler/cache/caches.rb
|
@@ -452,6 +455,8 @@ files:
|
|
452
455
|
- ./lib/apollo_crawler/logger/loggers.rb
|
453
456
|
- ./lib/apollo_crawler/model/base_model.rb
|
454
457
|
- ./lib/apollo_crawler/model/crawler.rb
|
458
|
+
- ./lib/apollo_crawler/model/data_source.rb
|
459
|
+
- ./lib/apollo_crawler/model/domain.rb
|
455
460
|
- ./lib/apollo_crawler/model/models.rb
|
456
461
|
- ./lib/apollo_crawler/model/queued_url.rb
|
457
462
|
- ./lib/apollo_crawler/model/raw_document.rb
|
@@ -463,6 +468,8 @@ files:
|
|
463
468
|
- ./lib/apollo_crawler/program/crawler_program.rb
|
464
469
|
- ./lib/apollo_crawler/program/platform_program.rb
|
465
470
|
- ./lib/apollo_crawler/program/programs.rb
|
471
|
+
- ./lib/apollo_crawler/scheduler/base_scheduler.rb
|
472
|
+
- ./lib/apollo_crawler/scheduler/schedulers.rb
|
466
473
|
- ./lib/apollo_crawler/store/base_store.rb
|
467
474
|
- ./lib/apollo_crawler/store/stores.rb
|
468
475
|
- ./lib/apollo_crawler/version.rb
|