apollo-crawler 0.1.26 → 0.1.27
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/apollo-aws +187 -0
- data/config/mongoid.yml +7 -10
- data/lib/apollo_crawler/agent/crawler_agent.rb +8 -6
- data/lib/apollo_crawler/agent/domainer_agent.rb +1 -1
- data/lib/apollo_crawler/agent/exchanges.rb +23 -8
- data/lib/apollo_crawler/agent/fetcher_agent.rb +53 -31
- data/lib/apollo_crawler/crawler/crawlers.rb +1 -0
- data/lib/apollo_crawler/crawler/spider_crawler.rb +52 -0
- data/lib/apollo_crawler/helper/amqp_helper.rb +1 -0
- data/lib/apollo_crawler/helper/mongo_helper.rb +32 -0
- data/lib/apollo_crawler/model/domain.rb +2 -1
- data/lib/apollo_crawler/planner/smart_planner.rb +22 -11
- data/lib/apollo_crawler/program/platform_program.rb +9 -7
- data/lib/apollo_crawler/scheduler/base_scheduler.rb +6 -8
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d6e559b9a330a556c3fb9eed416e0e83bdeb5d04
|
4
|
+
data.tar.gz: 289cf132fbd702c2c81d9517f95460aed5a629ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c150d48952a2061a2db796046016d0dc8cf0d3c858a41655995f3dea3b712cbc9470db75461a5598541313ea1de5a1325152579235a17f3c10911745eec7c532
|
7
|
+
data.tar.gz: 40bca94e6c247dcebd06fff4669dce6631a0c8096a3f26b4a3d68da7d03f1c1b9d0388fa067d46907291e64ec49d2ccd4f9c33fef3076c271816172f20d387bb
|
data/bin/apollo-aws
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
|
24
|
+
require "rubygems"
|
25
|
+
require "bundler/setup"
|
26
|
+
|
27
|
+
require 'fog'
|
28
|
+
|
29
|
+
require 'fileutils'
|
30
|
+
require 'yaml'
|
31
|
+
|
32
|
+
module Apollo
|
33
|
+
class AwsConsole
|
34
|
+
AWS_CONFIG_FILE = File.expand_path("~/.apollo/config/apollo-aws.yml")
|
35
|
+
AWS_DEFAULT_CONFIG = {
|
36
|
+
:provider => "aws",
|
37
|
+
:aws_access_key_id => "",
|
38
|
+
:aws_secret_access_key => "",
|
39
|
+
:aws_default_instance => "i-a4039bee",
|
40
|
+
:region => "eu-west-1"
|
41
|
+
}
|
42
|
+
|
43
|
+
attr_accessor :connection
|
44
|
+
attr_accessor :config
|
45
|
+
|
46
|
+
def initialize
|
47
|
+
self.connection = nil
|
48
|
+
self.config = nil
|
49
|
+
end
|
50
|
+
|
51
|
+
def config_init(path)
|
52
|
+
self.config = AWS_DEFAULT_CONFIG
|
53
|
+
|
54
|
+
AWS_DEFAULT_CONFIG.each do |k, v|
|
55
|
+
print "Override '#{k}' (default: '#{v}') ? > "
|
56
|
+
val = STDIN.gets.chomp!
|
57
|
+
if val.empty?
|
58
|
+
self.config[k] = v
|
59
|
+
else
|
60
|
+
self.config[k] = val
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
config_save(AWS_CONFIG_FILE, self.config)
|
65
|
+
end
|
66
|
+
|
67
|
+
def config_save(path, config)
|
68
|
+
dir = File.dirname(path)
|
69
|
+
if(Dir.exists?(dir) == false)
|
70
|
+
FileUtils.mkpath(dir)
|
71
|
+
end
|
72
|
+
|
73
|
+
File.open(path, 'w+') do |f|
|
74
|
+
f.write(config.to_yaml)
|
75
|
+
end
|
76
|
+
|
77
|
+
return config
|
78
|
+
end
|
79
|
+
|
80
|
+
def config_load(path)
|
81
|
+
self.config = YAML.load_file(path)
|
82
|
+
end
|
83
|
+
|
84
|
+
def connect()
|
85
|
+
return self.connection if self.connection
|
86
|
+
|
87
|
+
aws_provider_keys = [:provider, :aws_access_key_id, :aws_secret_access_key, :region]
|
88
|
+
config = self.config.reject { |key, value| !aws_provider_keys.include?(key) }
|
89
|
+
|
90
|
+
begin
|
91
|
+
self.connection = Fog::Compute.new(config)
|
92
|
+
rescue Exception => e
|
93
|
+
puts "Unable to connect to AWS, reason: #{e.to_s}"
|
94
|
+
return -1
|
95
|
+
end
|
96
|
+
|
97
|
+
return self.connection
|
98
|
+
end
|
99
|
+
|
100
|
+
def get_instance(instance_id)
|
101
|
+
puts "Getting instance '#{instance_id}'"
|
102
|
+
self.connection.servers.get(instance_id)
|
103
|
+
end
|
104
|
+
|
105
|
+
def run_cmd(cmd)
|
106
|
+
if cmd == "help"
|
107
|
+
puts "Supported commands - init, info, interactive, list, start, stop"
|
108
|
+
return 0
|
109
|
+
end
|
110
|
+
|
111
|
+
if self.config.nil?
|
112
|
+
if(File.exists?(AWS_CONFIG_FILE) == false)
|
113
|
+
self.config = config_init(AWS_CONFIG_FILE)
|
114
|
+
else
|
115
|
+
self.config = config_load(AWS_CONFIG_FILE)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
case cmd
|
120
|
+
|
121
|
+
when "info"
|
122
|
+
instance = self.config[:aws_default_instance]
|
123
|
+
puts "Inspecting instance '#{instance}'"
|
124
|
+
connect()
|
125
|
+
server = get_instance(instance)
|
126
|
+
puts server.inspect
|
127
|
+
return 0
|
128
|
+
|
129
|
+
when "init"
|
130
|
+
config_init(AWS_CONFIG_FILE)
|
131
|
+
return 0
|
132
|
+
|
133
|
+
when "list"
|
134
|
+
connect()
|
135
|
+
instance_list = connection.servers.all
|
136
|
+
instance_list.table([:id, :flavor_id, :public_ip_address, :private_ip_address, :image_id ])
|
137
|
+
return 0
|
138
|
+
|
139
|
+
when "start"
|
140
|
+
instance = self.config[:aws_default_instance]
|
141
|
+
puts "Starting instance '#{instance}'"
|
142
|
+
connect()
|
143
|
+
server = get_instance(instance)
|
144
|
+
res = server.start
|
145
|
+
puts " => Success: #{res}"
|
146
|
+
puts " => DNS: #{server.dns_name}"
|
147
|
+
return 0
|
148
|
+
|
149
|
+
when "stop"
|
150
|
+
instance = self.config[:aws_default_instance]
|
151
|
+
puts "Stopping instance '#{instance}'"
|
152
|
+
connect()
|
153
|
+
server = get_instance(instance)
|
154
|
+
res = server.stop
|
155
|
+
puts " => Success: #{res}"
|
156
|
+
puts " => DNS: #{server.dns_name}"
|
157
|
+
return 0
|
158
|
+
end
|
159
|
+
|
160
|
+
puts "Unknown command '#{cmd}'"
|
161
|
+
return -1
|
162
|
+
end
|
163
|
+
|
164
|
+
def run()
|
165
|
+
cmd = ARGV.length > 0 ? ARGV[0] : "help"
|
166
|
+
|
167
|
+
if cmd == "interactive"
|
168
|
+
print "> "
|
169
|
+
while cmd = STDIN.gets.chomp!
|
170
|
+
break if cmd.downcase == "quit"
|
171
|
+
|
172
|
+
res = run_cmd(cmd)
|
173
|
+
|
174
|
+
print "> "
|
175
|
+
end
|
176
|
+
|
177
|
+
return 0
|
178
|
+
end
|
179
|
+
|
180
|
+
return run_cmd(cmd)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
if __FILE__ == $0
|
186
|
+
Apollo::AwsConsole::new.run()
|
187
|
+
end
|
data/config/mongoid.yml
CHANGED
@@ -1,26 +1,23 @@
|
|
1
|
-
default:
|
1
|
+
default:
|
2
2
|
sessions:
|
3
3
|
default:
|
4
4
|
hosts:
|
5
5
|
- apollo-crawler.no-ip.org:27017
|
6
|
-
|
7
6
|
development:
|
8
7
|
sessions:
|
9
8
|
default:
|
10
9
|
hosts:
|
11
10
|
- localhost:27017
|
12
11
|
database: apollo-crawler-development
|
13
|
-
|
14
12
|
test:
|
15
13
|
sessions:
|
16
14
|
default:
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
hosts:
|
16
|
+
- apollo-crawler.no-ip.org:27017
|
17
|
+
database: apollo-crawler-test
|
21
18
|
production:
|
22
19
|
sessions:
|
23
20
|
default:
|
24
|
-
|
25
|
-
|
26
|
-
|
21
|
+
hosts:
|
22
|
+
- apollo-crawler.no-ip.org:27017
|
23
|
+
database: apollo-crawler-production
|
@@ -37,18 +37,20 @@ module Apollo
|
|
37
37
|
|
38
38
|
# Declarations
|
39
39
|
channel = amqp.create_channel
|
40
|
-
declarations = Apollo::Agent.
|
41
|
-
|
40
|
+
self.declarations = Apollo::Agent.declare_entities(channel, opts)# Binding
|
41
|
+
|
42
42
|
# Binding
|
43
|
-
declarations[:queues]["crawler.queue"]
|
44
|
-
|
43
|
+
queue = self.declarations[:queues]["crawler.queue"]
|
44
|
+
exchange = self.declarations[:exchanges]["crawler"]
|
45
45
|
|
46
|
+
queue.bind(exchange).subscribe do |delivery_info, metadata, payload|
|
46
47
|
msg = JSON.parse(payload)
|
47
48
|
|
48
49
|
request = msg['request']
|
49
50
|
response = msg['response']
|
51
|
+
url = request["url"]
|
50
52
|
|
51
|
-
|
53
|
+
puts "CrawlerAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
|
52
54
|
|
53
55
|
doc = Nokogiri::HTML(response['body'])
|
54
56
|
crawler = request['crawler_name'].constantize.new
|
@@ -59,7 +61,7 @@ module Apollo
|
|
59
61
|
# puts res.inspect
|
60
62
|
|
61
63
|
if(metadata[:reply_to] != nil)
|
62
|
-
x = declarations[:exchanges][metadata[:reply_to]]
|
64
|
+
x = self.declarations[:exchanges][metadata[:reply_to]]
|
63
65
|
|
64
66
|
msg = {
|
65
67
|
:request => request,
|
@@ -37,7 +37,7 @@ module Apollo
|
|
37
37
|
|
38
38
|
# Declarations
|
39
39
|
channel = amqp.create_channel
|
40
|
-
declarations = Apollo::Agent.
|
40
|
+
declarations = Apollo::Agent.declare_entities(channel, opts)
|
41
41
|
|
42
42
|
# Binding
|
43
43
|
declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|
|
@@ -20,6 +20,23 @@
|
|
20
20
|
|
21
21
|
module Apollo
|
22
22
|
module Agent
|
23
|
+
def self.declare_queues(channel, opts={})
|
24
|
+
if(opts[:verbose])
|
25
|
+
puts "Declaring AMQP Queues"
|
26
|
+
end
|
27
|
+
|
28
|
+
# Queues
|
29
|
+
queues = {}
|
30
|
+
queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
|
31
|
+
queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
|
32
|
+
queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
|
33
|
+
queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
|
34
|
+
queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
|
35
|
+
queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
|
36
|
+
|
37
|
+
return queues
|
38
|
+
end
|
39
|
+
|
23
40
|
def self.declare_exchanges(channel, opts={})
|
24
41
|
if(opts[:verbose])
|
25
42
|
puts "Declaring AMQP Exchanges"
|
@@ -34,14 +51,12 @@ module Apollo
|
|
34
51
|
exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
|
35
52
|
exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
|
36
53
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
queues
|
43
|
-
queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
|
44
|
-
queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
|
54
|
+
return exchanges
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.declare_entities(channel, opts={})
|
58
|
+
exchanges = self.declare_exchanges(channel, opts)
|
59
|
+
queues = self.declare_queues(channel, opts)
|
45
60
|
|
46
61
|
# Compose res
|
47
62
|
res = {
|
@@ -23,17 +23,20 @@ require File.join(File.dirname(__FILE__), 'base_agent')
|
|
23
23
|
|
24
24
|
require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
|
25
25
|
|
26
|
+
require 'amqp'
|
27
|
+
require 'amqp/extensions/rabbitmq'
|
28
|
+
|
26
29
|
require 'digest/sha1'
|
27
|
-
require 'thread
|
30
|
+
require 'thread'
|
28
31
|
|
29
32
|
module Apollo
|
30
33
|
module Agent
|
31
34
|
class FetcherAgent < BaseAgent
|
32
|
-
THREAD_POOL_SIZE =
|
35
|
+
THREAD_POOL_SIZE = 1
|
33
36
|
|
34
37
|
attr_accessor :fetcher
|
35
38
|
attr_accessor :declarations
|
36
|
-
attr_accessor :
|
39
|
+
attr_accessor :mutex
|
37
40
|
|
38
41
|
def initialize(amqp, opts={})
|
39
42
|
self.fetcher = Apollo::Fetcher::SmartFetcher.new
|
@@ -42,46 +45,54 @@ module Apollo
|
|
42
45
|
puts "Initializing fetcher agent..."
|
43
46
|
end
|
44
47
|
|
45
|
-
thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
|
46
|
-
|
47
48
|
# Declarations
|
48
49
|
channel = amqp.create_channel
|
49
|
-
|
50
|
+
channel.prefetch(THREAD_POOL_SIZE)
|
51
|
+
|
52
|
+
# Binding (Default)
|
53
|
+
self.declarations = Apollo::Agent.declare_entities(channel, opts)
|
54
|
+
queue = declarations[:queues]["fetcher.queue"]
|
55
|
+
|
56
|
+
# AMQP contexts for threads
|
57
|
+
contexts = []
|
58
|
+
(0...THREAD_POOL_SIZE).each do |i|
|
59
|
+
puts "FetcherAgent::initialize() - Creating context #{i}" if opts[:verbose]
|
60
|
+
end
|
50
61
|
|
51
|
-
#
|
52
|
-
|
53
|
-
thread_pool.process {
|
54
|
-
queued_url = JSON.parse(payload)
|
55
|
-
url = queued_url["url"]
|
62
|
+
# AMQP contexts mutex/lock
|
63
|
+
self.mutex = Mutex.new()
|
56
64
|
|
57
|
-
|
65
|
+
exchange = self.declarations[:exchanges]["fetcher"]
|
58
66
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
64
|
-
end
|
67
|
+
queue.bind(exchange).subscribe(:ack => true) do |delivery_info, metadata, payload|
|
68
|
+
# There can be troubles with concurency, please see https://groups.google.com/forum/?fromgroups=#!topic/ruby-amqp/aO9GPu-jxuE
|
69
|
+
queued_url = JSON.parse(payload)
|
70
|
+
url = queued_url["url"]
|
65
71
|
|
66
|
-
|
67
|
-
|
68
|
-
#
|
72
|
+
puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
|
73
|
+
self.mutex.synchronize {
|
74
|
+
puts "FetcherAgent: Acking - '#{delivery_info.delivery_tag}'" if opts[:verbose]
|
75
|
+
channel.basic_ack(delivery_info.delivery_tag, true)
|
76
|
+
}
|
69
77
|
|
78
|
+
begin
|
79
|
+
doc = Apollo::Fetcher::SmartFetcher::fetch(url)
|
80
|
+
doc = process_fetched_doc(queued_url, doc, metadata, opts)
|
81
|
+
|
70
82
|
if(metadata && metadata[:reply_to])
|
71
|
-
|
83
|
+
puts "Replying to '#{metadata[:reply_to]}'"
|
84
|
+
send_response_msg(metadata[:reply_to], queued_url, doc)
|
85
|
+
end
|
72
86
|
|
73
|
-
|
74
|
-
|
87
|
+
rescue Exception => e
|
88
|
+
puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
|
89
|
+
end
|
75
90
|
|
76
|
-
|
77
|
-
x.publish(msg.to_json)
|
78
|
-
end
|
79
|
-
end
|
80
|
-
}
|
91
|
+
doc
|
81
92
|
end
|
82
93
|
end
|
83
94
|
|
84
|
-
def
|
95
|
+
def process_fetched_doc(queued_url, doc, metadata, opts={})
|
85
96
|
url = queued_url["url"]
|
86
97
|
|
87
98
|
res = Apollo::Model::RawDocument.new
|
@@ -94,12 +105,23 @@ module Apollo
|
|
94
105
|
return res
|
95
106
|
end
|
96
107
|
|
97
|
-
def
|
108
|
+
def format_response_msg(queued_url, doc)
|
98
109
|
return {
|
99
110
|
:request => queued_url,
|
100
111
|
:response => doc
|
101
112
|
}
|
102
113
|
end
|
114
|
+
|
115
|
+
def send_response_msg(dest, queued_url, doc)
|
116
|
+
if(dest != nil)
|
117
|
+
msg = format_response_msg(queued_url, doc)
|
118
|
+
|
119
|
+
self.mutex.synchronize {
|
120
|
+
exchange = self.declarations[:exchanges][dest]
|
121
|
+
exchange.publish(msg.to_json)
|
122
|
+
}
|
123
|
+
end
|
124
|
+
end
|
103
125
|
end # class FetcherAgent
|
104
126
|
end # module Agent
|
105
127
|
end # module Apollo
|
@@ -22,6 +22,7 @@ require File.join(File.dirname(__FILE__), 'base_crawler')
|
|
22
22
|
require File.join(File.dirname(__FILE__), 'google_crawler')
|
23
23
|
require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
|
24
24
|
require File.join(File.dirname(__FILE__), 'slashdot_crawler')
|
25
|
+
require File.join(File.dirname(__FILE__), 'spider_crawler')
|
25
26
|
require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
|
26
27
|
require File.join(File.dirname(__FILE__), 'xkcd_crawler')
|
27
28
|
require File.join(File.dirname(__FILE__), 'youjizz_crawler')
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_crawler')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Crawler
|
25
|
+
class SpiderCrawler < BaseCrawler
|
26
|
+
def name()
|
27
|
+
return "Spider"
|
28
|
+
end
|
29
|
+
|
30
|
+
def url()
|
31
|
+
return "http://www.wikipedia.org/"
|
32
|
+
end
|
33
|
+
|
34
|
+
def extract_data(doc)
|
35
|
+
[]
|
36
|
+
end
|
37
|
+
|
38
|
+
def extract_links(doc)
|
39
|
+
res = doc.xpath("//a").map { |node|
|
40
|
+
url = BaseCrawler.try_get_url(self.url, node['href']).to_s
|
41
|
+
next if url.nil?
|
42
|
+
|
43
|
+
{
|
44
|
+
:link => url
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
return res.uniq
|
49
|
+
end
|
50
|
+
end # class SpiderCrawler
|
51
|
+
end # module Crawler
|
52
|
+
end # module Apollo
|
@@ -21,6 +21,8 @@
|
|
21
21
|
require 'mongo'
|
22
22
|
require 'mongoid'
|
23
23
|
|
24
|
+
require 'csv'
|
25
|
+
|
24
26
|
module Apollo
|
25
27
|
module Helper
|
26
28
|
module Mongo
|
@@ -37,6 +39,36 @@ module Apollo
|
|
37
39
|
|
38
40
|
return res
|
39
41
|
end
|
42
|
+
|
43
|
+
def self.csv_bulk_insert(path, model, bulk_size, validate=false, &block)
|
44
|
+
batch = []
|
45
|
+
|
46
|
+
CSV.foreach(path) do |row|
|
47
|
+
res = nil
|
48
|
+
if block_given?
|
49
|
+
res = yield row
|
50
|
+
end
|
51
|
+
|
52
|
+
if res.nil? == false
|
53
|
+
if(!validate || model.where(res).length == 0)
|
54
|
+
batch << res
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
if((batch.length % bulk_size) == 0)
|
59
|
+
# puts "Inserting batch '#{batch.inspect}'"
|
60
|
+
|
61
|
+
model.collection.insert(batch)
|
62
|
+
batch.clear
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
if batch.empty? == false
|
67
|
+
model.collection.insert(batch)
|
68
|
+
batch.clear
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
40
72
|
end # Mongo
|
41
73
|
end # module Helper
|
42
74
|
end # module Apollo
|
@@ -29,9 +29,10 @@ module Apollo
|
|
29
29
|
store_in collection: "domains"
|
30
30
|
|
31
31
|
field :name
|
32
|
+
field :rank
|
32
33
|
|
33
34
|
# Indexes
|
34
|
-
index({ created_at: 1, updated_at: 1, name: 1 })
|
35
|
+
index({ created_at: 1, updated_at: 1, name: 1, rank: 1})
|
35
36
|
end # class Domain
|
36
37
|
end # module Model
|
37
38
|
end # module Apollo
|
@@ -41,7 +41,8 @@ module Apollo
|
|
41
41
|
|
42
42
|
# Declarations
|
43
43
|
channel = amqp.create_channel
|
44
|
-
|
44
|
+
# channel.prefetch(1)
|
45
|
+
self.declarations = Apollo::Agent.declare_entities(channel, opts)
|
45
46
|
|
46
47
|
# Bindings
|
47
48
|
declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
|
@@ -67,14 +68,17 @@ module Apollo
|
|
67
68
|
end
|
68
69
|
else
|
69
70
|
doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
|
71
|
+
if(doc.nil? == false)
|
72
|
+
puts "Same as #{doc.inspect}"
|
73
|
+
end
|
70
74
|
end
|
71
75
|
|
72
76
|
if(doc.nil?)
|
73
77
|
doc = Apollo::Model::RawDocument.new(response).save
|
74
|
-
end
|
75
78
|
|
76
|
-
|
77
|
-
|
79
|
+
# Publish
|
80
|
+
declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
|
81
|
+
end
|
78
82
|
end
|
79
83
|
|
80
84
|
declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
|
@@ -103,6 +107,10 @@ module Apollo
|
|
103
107
|
end
|
104
108
|
end
|
105
109
|
|
110
|
+
def get_url_count(state, opts={})
|
111
|
+
Apollo::Model::QueuedUrl.where({:state => state}).count
|
112
|
+
end
|
113
|
+
|
106
114
|
def fetch_url(url, opts={})
|
107
115
|
if(opts[:verbose])
|
108
116
|
puts "AMQP fetching '#{url.inspect}'"
|
@@ -113,19 +121,22 @@ module Apollo
|
|
113
121
|
end
|
114
122
|
|
115
123
|
def get_next_url(opts={})
|
116
|
-
Apollo::Model::QueuedUrl.where({:state => :queued}).
|
124
|
+
tmp = Apollo::Model::QueuedUrl.where({:state => :queued}).order_by(:created_at.asc)
|
125
|
+
tmp.find_and_modify({ "$set" => { state: :fetching }}, new: true)
|
117
126
|
end
|
118
127
|
|
119
128
|
def fetch_queued_urls(opts={})
|
120
|
-
|
129
|
+
fetching_count = Apollo::Model::QueuedUrl.where({:state => :fetching}).count
|
121
130
|
|
122
|
-
|
123
|
-
puts
|
124
|
-
|
131
|
+
if(fetching_count > 4)
|
132
|
+
puts "Fetching too many URLs. Slowing down for a while ..."
|
133
|
+
return
|
134
|
+
end
|
125
135
|
|
136
|
+
while get_url_count(:fetching) < 4
|
137
|
+
url = get_next_url(opts)
|
138
|
+
puts "SmartPlanner::fetch_queued_urls() - Queueing: #{url.inspect}"
|
126
139
|
fetch_url(url, opts)
|
127
|
-
|
128
|
-
url = get_next_url()
|
129
140
|
end
|
130
141
|
end
|
131
142
|
|
@@ -138,14 +138,16 @@ module Apollo
|
|
138
138
|
end
|
139
139
|
|
140
140
|
Thread::new {
|
141
|
-
|
141
|
+
Apollo::Helper::Mongo::csv_bulk_insert(path, Apollo::Model::Domain, 1000, false) do |row|
|
142
|
+
rank = row[0].to_i
|
142
143
|
name = row[1]
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
144
|
+
|
145
|
+
res = {
|
146
|
+
:rank => rank,
|
147
|
+
:name => name
|
148
|
+
}
|
149
|
+
|
150
|
+
res
|
149
151
|
end
|
150
152
|
}
|
151
153
|
end
|
@@ -18,7 +18,7 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
require File.join(File.dirname(__FILE__), "../model/models")
|
21
|
+
require File.join(File.dirname(__FILE__), "../model/models.rb")
|
22
22
|
|
23
23
|
module Apollo
|
24
24
|
module Scheduler
|
@@ -26,13 +26,11 @@ module Apollo
|
|
26
26
|
def self.schedule(url, crawler=nil, opts={})
|
27
27
|
queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
|
28
28
|
|
29
|
-
if
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
return nil
|
29
|
+
return queued_url if queued_url
|
30
|
+
|
31
|
+
res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
|
32
|
+
res.save
|
33
|
+
return res
|
36
34
|
end
|
37
35
|
end # class BaseScheduler
|
38
36
|
end # module Scheduler
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -391,6 +391,7 @@ dependencies:
|
|
391
391
|
description: Gem for crawling data from external sources
|
392
392
|
email: korczis@gmail.com
|
393
393
|
executables:
|
394
|
+
- apollo-aws
|
394
395
|
- apollo-console
|
395
396
|
- apollo-crawler
|
396
397
|
- apollo-platform
|
@@ -432,6 +433,7 @@ files:
|
|
432
433
|
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
433
434
|
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
434
435
|
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
436
|
+
- ./lib/apollo_crawler/crawler/spider_crawler.rb
|
435
437
|
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
436
438
|
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
437
439
|
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
@@ -473,6 +475,7 @@ files:
|
|
473
475
|
- ./lib/apollo_crawler/store/base_store.rb
|
474
476
|
- ./lib/apollo_crawler/store/stores.rb
|
475
477
|
- ./lib/apollo_crawler/version.rb
|
478
|
+
- bin/apollo-aws
|
476
479
|
- bin/apollo-console
|
477
480
|
- bin/apollo-crawler
|
478
481
|
- bin/apollo-platform
|