apollo-crawler 0.1.22 → 0.1.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +6 -14
- data/bin/apollo-console +30 -0
- data/config/amqp.yml +4 -3
- data/config/deploy.rb +61 -0
- data/config/mongoid.yml +9 -6
- data/lib/apollo_crawler.rb +3 -0
- data/lib/apollo_crawler/agent/agents.rb +2 -1
- data/lib/apollo_crawler/agent/base_agent.rb +3 -0
- data/lib/apollo_crawler/agent/fetcher_agent.rb +55 -0
- data/lib/apollo_crawler/config.rb +2 -2
- data/lib/apollo_crawler/helper/amqp_helper.rb +21 -0
- data/lib/apollo_crawler/helper/core_helper.rb +8 -0
- data/lib/apollo_crawler/helper/mongo_helper.rb +16 -0
- data/lib/apollo_crawler/lib.rb +3 -0
- data/lib/apollo_crawler/model/base_model.rb +29 -0
- data/lib/apollo_crawler/model/crawler.rb +39 -0
- data/lib/apollo_crawler/model/models.rb +24 -0
- data/lib/apollo_crawler/model/queued_url.rb +38 -0
- data/lib/apollo_crawler/model/raw_document.rb +37 -0
- data/lib/apollo_crawler/planner/base_planner.rb +4 -1
- data/lib/apollo_crawler/planner/smart_planner.rb +49 -0
- data/lib/apollo_crawler/program/base_program.rb +59 -2
- data/lib/apollo_crawler/program/crawler_program.rb +18 -10
- data/lib/apollo_crawler/program/platform_program.rb +47 -18
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +96 -87
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
ZDc0YmI0MjEyNTgzNTM0NmY2NGE2YTY0Y2MxNTdiNTBkN2M3ZDM2MDdmNjZm
|
10
|
-
MjNhZDVlNmRkNDdkMDVhNzhjNzg0ZDQ2ZjRkYThhNzJlODMxNjE3NmE4MjVm
|
11
|
-
ZDg0ZjFlZWFjM2I0NWQ0ZmIzYmM5ZmY0MTRiOGE2YTMwZDVmYzE=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
N2I5ZGM1NjI2M2QxN2FmMDZkMThkOGU5NDE5MTEyOTNlZWFkNGQ1N2FlZjFi
|
14
|
-
MTI5ZGNhNjdmZTAyZjAyYTVkZWFlNGJmZDk5YzA3ZjlhN2Q5MTc1NTIyNGVi
|
15
|
-
M2VjODgxNzQyYTAxNzQ5NWQ5MTQzZjUxNWY5MWZlNDQzNjg2YmQ=
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71fb379b6ae32ceb79e40cce451c8a3646278d32
|
4
|
+
data.tar.gz: 08fdec629298945a86993b91be3a743b880ad4a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d0789d2ef99358144c90d148c378d9bf53e3084b326ede425ed7ff171ab450a4ed9de7a86494dbf49992ed235807f92e6a795cc52676bd61280a006408fc4e90
|
7
|
+
data.tar.gz: 16ac99fc7e192fe137348c6d364e730c9c0071f274f200b395746c4247bb3958c7238b315b571a730014b5f8286602bbc67eb9f26db30c217440e15401d3ac95
|
data/bin/apollo-console
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
|
24
|
+
require "rubygems"
|
25
|
+
require "bundler/setup"
|
26
|
+
|
27
|
+
require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
|
28
|
+
|
29
|
+
Apollo::ConsoleProgram.new.run(ARGV)
|
30
|
+
|
data/config/amqp.yml
CHANGED
@@ -2,17 +2,18 @@ default: &default
|
|
2
2
|
host: localhost
|
3
3
|
username: guest
|
4
4
|
password: guest
|
5
|
+
port: 5672
|
5
6
|
|
6
7
|
development:
|
7
8
|
<<: *default
|
8
|
-
vhost: apollo-crawler-development
|
9
|
+
vhost: /apollo-crawler-development
|
9
10
|
|
10
11
|
test:
|
11
12
|
<<: *default
|
12
13
|
host: apollo-crawler.no-ip.org
|
13
|
-
vhost: apollo-crawler-test
|
14
|
+
vhost: /apollo-crawler-test
|
14
15
|
|
15
16
|
production:
|
16
17
|
<<: *default
|
17
18
|
host: apollo-crawler.no-ip.org
|
18
|
-
vhost: apollo-crawler-production
|
19
|
+
vhost: /apollo-crawler-production
|
data/config/deploy.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'capistrano'
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require "bundler/setup"
|
5
|
+
require "bundler/capistrano"
|
6
|
+
|
7
|
+
# RVM integration
|
8
|
+
require "rvm/capistrano"
|
9
|
+
|
10
|
+
# Target ruby version
|
11
|
+
set :rvm_ruby_string, '2.0.0'
|
12
|
+
|
13
|
+
set :domain, "apollo-crawler.no-ip.org"
|
14
|
+
set :application, "apollo_platform"
|
15
|
+
# set :deploy_to, File.join(File.expand_path("~"), "/apps/#{application}")
|
16
|
+
set :deploy_to, "/home/ubuntu/apps/#{application}"
|
17
|
+
|
18
|
+
ssh_options[:keys] = [File.join(ENV["HOME"], ".ssh", "key-webs.pem")]
|
19
|
+
|
20
|
+
set :user, "ubuntu"
|
21
|
+
set :use_sudo, false
|
22
|
+
|
23
|
+
set :scm, :git
|
24
|
+
set :repository, "https://github.com/korczis/apollo-crawler.git"
|
25
|
+
set :branch, 'master'
|
26
|
+
set :git_shallow_clone, 1
|
27
|
+
|
28
|
+
role :web, domain
|
29
|
+
role :app, domain
|
30
|
+
role :db, domain, :primary => true
|
31
|
+
|
32
|
+
set :deploy_via, :remote_cache
|
33
|
+
|
34
|
+
namespace :deploy do
|
35
|
+
def remote_cmd(cmd)
|
36
|
+
run "cd #{deploy_to}/current && #{cmd}"
|
37
|
+
end
|
38
|
+
|
39
|
+
task :start, :roles => [:web, :app] do
|
40
|
+
puts "Starting.."
|
41
|
+
remote_cmd "./bin/apollo-platform -V"
|
42
|
+
end
|
43
|
+
|
44
|
+
task :stop, :roles => [:web, :app] do
|
45
|
+
puts "Stopping.."
|
46
|
+
end
|
47
|
+
|
48
|
+
task :status, :roles => [:web, :app] do
|
49
|
+
puts "Statusing.."
|
50
|
+
end
|
51
|
+
|
52
|
+
task :restart, :roles => [:web, :app] do
|
53
|
+
puts "Restarting.."
|
54
|
+
end
|
55
|
+
|
56
|
+
# This will make sure that Capistrano doesn't try to run rake:migrate (this is not a Rails project!)
|
57
|
+
task :cold do
|
58
|
+
deploy.update
|
59
|
+
deploy.start
|
60
|
+
end
|
61
|
+
end
|
data/config/mongoid.yml
CHANGED
@@ -1,23 +1,26 @@
|
|
1
|
-
default: &default
|
1
|
+
default: &default_options
|
2
2
|
sessions:
|
3
3
|
default:
|
4
4
|
hosts:
|
5
5
|
- apollo-crawler.no-ip.org:27017
|
6
6
|
|
7
7
|
development:
|
8
|
-
<<: *default
|
9
8
|
sessions:
|
10
9
|
default:
|
10
|
+
hosts:
|
11
|
+
- localhost:27017
|
11
12
|
database: apollo-crawler-development
|
12
13
|
|
13
14
|
test:
|
14
|
-
<<: *default
|
15
15
|
sessions:
|
16
16
|
default:
|
17
|
-
|
17
|
+
hosts:
|
18
|
+
- apollo-crawler.no-ip.org:27017
|
19
|
+
database: apollo-crawler-test
|
18
20
|
|
19
21
|
production:
|
20
|
-
<<: *default
|
21
22
|
sessions:
|
22
23
|
default:
|
23
|
-
|
24
|
+
hosts:
|
25
|
+
- apollo-crawler.no-ip.org:27017
|
26
|
+
database: apollo-crawler-production
|
data/lib/apollo_crawler.rb
CHANGED
@@ -48,6 +48,9 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/helper/helpers')
|
|
48
48
|
# Loggers
|
49
49
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/loggers')
|
50
50
|
|
51
|
+
# Models
|
52
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/model/models')
|
53
|
+
|
51
54
|
# Planner
|
52
55
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
|
53
56
|
|
@@ -18,4 +18,5 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
require File.join(File.dirname(__FILE__), 'base_agent')
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
22
|
+
require File.join(File.dirname(__FILE__), 'fetcher_agent')
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
22
|
+
require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
|
23
|
+
|
24
|
+
module Apollo
|
25
|
+
module Agent
|
26
|
+
class FetcherAgent < BaseAgent
|
27
|
+
attr_accessor :fetcher
|
28
|
+
|
29
|
+
def initialize(amqp, opts={})
|
30
|
+
self.fetcher = Apollo::Fetcher::SmartFetcher.new
|
31
|
+
|
32
|
+
if(opts[:verbose])
|
33
|
+
puts "Initializing fetcher agent..."
|
34
|
+
end
|
35
|
+
|
36
|
+
ch = amqp.create_channel
|
37
|
+
q = ch.queue("fetcher", :auto_delete => false, :durable => true)
|
38
|
+
x = ch.default_exchange
|
39
|
+
|
40
|
+
q.subscribe do |delivery_info, metadata, payload|
|
41
|
+
res = nil
|
42
|
+
|
43
|
+
puts "Received #{payload}" if opts[:verbose]
|
44
|
+
|
45
|
+
Thread.new do |t|
|
46
|
+
queued_url = JSON.parse(payload)
|
47
|
+
# puts queued_url["url"]
|
48
|
+
# res = Apollo::Fetcher::SmartFetcher::fetch(queued_url["url"])
|
49
|
+
# puts "#{queued_url['url']} - " + res.inspect
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end # class FetcherAgent
|
54
|
+
end # module Agent
|
55
|
+
end # module Apollo
|
@@ -87,8 +87,8 @@ module Apollo
|
|
87
87
|
}
|
88
88
|
|
89
89
|
# Used caching mechanism by default
|
90
|
-
CACHE_CLASS = Apollo::Cache::
|
91
|
-
CACHE_CLASS_OPTIONS =
|
90
|
+
CACHE_CLASS = Apollo::Cache::MemcachedCache
|
91
|
+
CACHE_CLASS_OPTIONS = nil
|
92
92
|
|
93
93
|
############################################################
|
94
94
|
# Crawlers - Built-in out-of box working crawlers
|
@@ -18,9 +18,30 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require 'amqp'
|
22
|
+
require 'bunny'
|
23
|
+
require 'thread'
|
24
|
+
|
21
25
|
module Apollo
|
22
26
|
module Helper
|
23
27
|
module Amqp
|
28
|
+
def self.connect(conn, opts={})
|
29
|
+
res = nil
|
30
|
+
|
31
|
+
if(opts[:verbose])
|
32
|
+
puts "AMQP Connecting - #{conn.inspect}"
|
33
|
+
end
|
34
|
+
|
35
|
+
res = Bunny.new(:host => conn['host'], :user => conn['username'], :password => conn['password'], :vhost => conn['vhost'], :port => conn['port'])
|
36
|
+
res.start
|
37
|
+
|
38
|
+
sleep(0.001) until res
|
39
|
+
if(opts[:verbose])
|
40
|
+
puts "AMQP connected - #{res.inspect}"
|
41
|
+
end
|
42
|
+
|
43
|
+
return res
|
44
|
+
end
|
24
45
|
end # Amqp
|
25
46
|
end # module Helper
|
26
47
|
end # module Apollo
|
@@ -18,9 +18,25 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require 'mongo'
|
22
|
+
require 'mongoid'
|
23
|
+
|
21
24
|
module Apollo
|
22
25
|
module Helper
|
23
26
|
module Mongo
|
27
|
+
def self.connect(conn, opts={})
|
28
|
+
if(opts[:verbose])
|
29
|
+
puts "MongoDB connecting - '#{conn.inspect}"
|
30
|
+
end
|
31
|
+
|
32
|
+
res = ::Mongo::Connection.new(conn['host'])
|
33
|
+
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "MongoDB connected: #{res.inspect}"
|
36
|
+
end
|
37
|
+
|
38
|
+
return res
|
39
|
+
end
|
24
40
|
end # Mongo
|
25
41
|
end # module Helper
|
26
42
|
end # module Apollo
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -45,6 +45,9 @@ require File.join(File.dirname(__FILE__), 'helper/helpers')
|
|
45
45
|
# Loggers
|
46
46
|
require File.join(File.dirname(__FILE__), 'logger/loggers')
|
47
47
|
|
48
|
+
# Models
|
49
|
+
require File.join(File.dirname(__FILE__), 'model/models')
|
50
|
+
|
48
51
|
# Programs
|
49
52
|
require File.join(File.dirname(__FILE__), 'planner/planners')
|
50
53
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require 'mongo'
|
22
|
+
require 'mongoid'
|
23
|
+
|
24
|
+
module Apollo
|
25
|
+
module Model
|
26
|
+
class BaseModel
|
27
|
+
end # class BaseModel
|
28
|
+
end # module Model
|
29
|
+
end # module Apollo
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class Crawler < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "crawlers"
|
30
|
+
|
31
|
+
field :name
|
32
|
+
field :class_name
|
33
|
+
field :source
|
34
|
+
|
35
|
+
# Indexes
|
36
|
+
index({ created_at: 1, updated_at: 1, name: 1, class_name: 1 })
|
37
|
+
end # class Crawler
|
38
|
+
end # module Model
|
39
|
+
end # module Apollo
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
require File.join(File.dirname(__FILE__), 'crawler')
|
23
|
+
require File.join(File.dirname(__FILE__), 'queued_url')
|
24
|
+
require File.join(File.dirname(__FILE__), 'raw_document')
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class QueuedUrl < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "queued_urls"
|
30
|
+
|
31
|
+
field :url
|
32
|
+
field :state
|
33
|
+
|
34
|
+
# Indexes
|
35
|
+
index({ created_at: 1, updated_at: 1 })
|
36
|
+
end # class QueuedUrl
|
37
|
+
end # module Model
|
38
|
+
end # module Apollo
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class RawDocument < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "raw_docs"
|
30
|
+
|
31
|
+
field :body
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1 })
|
35
|
+
end # class RawDocument
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -20,9 +20,58 @@
|
|
20
20
|
|
21
21
|
require File.join(File.dirname(__FILE__),'base_planner')
|
22
22
|
|
23
|
+
require File.join(File.dirname(__FILE__),'../model/models.rb')
|
24
|
+
|
23
25
|
module Apollo
|
24
26
|
module Planner
|
25
27
|
class SmartPlanner < BasePlanner
|
28
|
+
attr_accessor :amqp
|
29
|
+
attr_accessor :mongo
|
30
|
+
|
31
|
+
def initialize(amqp=nil, mongo=nil)
|
32
|
+
self.amqp = amqp
|
33
|
+
self.mongo = mongo
|
34
|
+
end
|
35
|
+
|
36
|
+
def fetch_url(url, opts={})
|
37
|
+
puts "AMQP fetching '#{url.inspect}'"
|
38
|
+
|
39
|
+
ch = amqp.create_channel
|
40
|
+
x = ch.default_exchange
|
41
|
+
x.publish(url.to_json, :routing_key => "fetcher")
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
def fetch_queued_urls(opts={})
|
46
|
+
urls = Apollo::Model::QueuedUrl.where({:state => :queued})
|
47
|
+
return if urls.count < 1
|
48
|
+
|
49
|
+
if(opts[:verbose])
|
50
|
+
puts "Fetching Queued URLS"
|
51
|
+
end
|
52
|
+
|
53
|
+
puts "Count of URLs in Queue: #{urls.count}" if opts[:verbose]
|
54
|
+
|
55
|
+
urls.each do |url|
|
56
|
+
url.state = :fetching
|
57
|
+
url.save
|
58
|
+
|
59
|
+
fetch_url(url, opts)
|
60
|
+
|
61
|
+
# puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def run(opts={})
|
66
|
+
request_exit = false
|
67
|
+
|
68
|
+
while request_exit == false
|
69
|
+
fetch_queued_urls(opts)
|
70
|
+
sleep 1
|
71
|
+
end
|
72
|
+
|
73
|
+
return 0
|
74
|
+
end
|
26
75
|
end # class SmartPlanner
|
27
76
|
end # module Planner
|
28
77
|
end # module Apollo
|
@@ -20,6 +20,8 @@
|
|
20
20
|
|
21
21
|
require 'yaml'
|
22
22
|
|
23
|
+
require File.join(File.dirname(__FILE__), "../model/models.rb")
|
24
|
+
|
23
25
|
module Apollo
|
24
26
|
class BaseProgram
|
25
27
|
CONFIG_DIR = File.join(Apollo::BASE_DIR, "config")
|
@@ -33,16 +35,16 @@ module Apollo
|
|
33
35
|
attr_accessor :options
|
34
36
|
attr_accessor :optparser
|
35
37
|
|
38
|
+
attr_accessor :amqp
|
36
39
|
attr_accessor :mongo
|
37
|
-
attr_accessor :mongo_db
|
38
40
|
|
39
41
|
def initialize
|
40
42
|
self.config = {}
|
41
43
|
self.options = DEFAULT_OPTIONS
|
42
44
|
self.optparser = nil
|
43
45
|
|
46
|
+
self.amqp = nil
|
44
47
|
self.mongo = nil
|
45
|
-
self.mongo_db = nil
|
46
48
|
end
|
47
49
|
|
48
50
|
def self.get_config_path(config)
|
@@ -113,6 +115,55 @@ module Apollo
|
|
113
115
|
return nil
|
114
116
|
end
|
115
117
|
|
118
|
+
def init_amqp()
|
119
|
+
conn_opts = self.config["amqp"]
|
120
|
+
if(conn_opts)
|
121
|
+
self.amqp = Apollo::Helper::Amqp::connect(conn_opts, self.options)
|
122
|
+
end
|
123
|
+
|
124
|
+
return self.amqp
|
125
|
+
end
|
126
|
+
|
127
|
+
def init_mongo()
|
128
|
+
conn_opts = self.config["mongo"]
|
129
|
+
if(conn_opts)
|
130
|
+
self.mongo = Apollo::Helper::Mongo::connect(conn_opts, self.options)
|
131
|
+
|
132
|
+
# Init Mongoid
|
133
|
+
path = File.join(Apollo::BASE_DIR, "config/mongoid.yml")
|
134
|
+
Mongoid.load!(path, @options[:env])
|
135
|
+
end
|
136
|
+
|
137
|
+
return self.mongo
|
138
|
+
end
|
139
|
+
|
140
|
+
def init_seeds_crawlers(opts={})
|
141
|
+
objs = Apollo::Crawler::BaseCrawler.subclasses
|
142
|
+
objs.each do |o|
|
143
|
+
crawler = Apollo::Model::Crawler.new
|
144
|
+
i = o.new
|
145
|
+
crawler.name = i.name
|
146
|
+
crawler.class_name = o.to_s
|
147
|
+
|
148
|
+
res = Apollo::Model::Crawler.where(class_name: crawler.class_name)
|
149
|
+
# puts "RES: '#{res.inspect}'"
|
150
|
+
if(res.nil? || res.count < 1)
|
151
|
+
crawler.save
|
152
|
+
if(opts[:verbose])
|
153
|
+
puts "Adding new crawler - '#{crawler.inspect}'"
|
154
|
+
end
|
155
|
+
else
|
156
|
+
if(opts[:verbose])
|
157
|
+
puts "Using crawler - '#{res[0].inspect}'"
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def init_seeds(opts={})
|
164
|
+
init_seeds_crawlers(opts)
|
165
|
+
end
|
166
|
+
|
116
167
|
# Init program
|
117
168
|
def init_program(args)
|
118
169
|
res = nil
|
@@ -134,6 +185,12 @@ module Apollo
|
|
134
185
|
# Init Mongo Connection
|
135
186
|
init_mongo()
|
136
187
|
|
188
|
+
# Init AMQP
|
189
|
+
init_amqp()
|
190
|
+
|
191
|
+
# Init Seed data
|
192
|
+
init_seeds(@options)
|
193
|
+
|
137
194
|
return nil
|
138
195
|
end
|
139
196
|
|
@@ -45,16 +45,6 @@ require File.join(File.dirname(__FILE__), '..', 'version')
|
|
45
45
|
|
46
46
|
require File.join(File.dirname(__FILE__),'base_program')
|
47
47
|
|
48
|
-
|
49
|
-
# Hack
|
50
|
-
class String
|
51
|
-
def to_class
|
52
|
-
self.split('::').inject(Object) do |mod, class_name|
|
53
|
-
mod.const_get(class_name)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
48
|
module Apollo
|
59
49
|
# Apollo Crawler Base Directory
|
60
50
|
APOLLO_CRAWLER_BASE_DIR = File.join(File.dirname(__FILE__), "..")
|
@@ -174,6 +164,8 @@ module Apollo
|
|
174
164
|
|
175
165
|
opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
|
176
166
|
@options[:crawler_dirs] << path
|
167
|
+
|
168
|
+
init_additional_crawlers([path])
|
177
169
|
end
|
178
170
|
|
179
171
|
opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
|
@@ -406,6 +398,22 @@ module Apollo
|
|
406
398
|
end
|
407
399
|
end
|
408
400
|
|
401
|
+
def init_additional_crawlers(dirs)
|
402
|
+
# puts "Initializing aditional crawlers ..."
|
403
|
+
dirs.each do |dir|
|
404
|
+
if(@options[:verbose])
|
405
|
+
puts "Registering additional crawler dir '#{dir}'"
|
406
|
+
end
|
407
|
+
|
408
|
+
Dir.glob("#{dir}/*.rb").each do |f|
|
409
|
+
if(@options[:verbose])
|
410
|
+
puts "Registering crawler '#{f}'"
|
411
|
+
end
|
412
|
+
require f
|
413
|
+
end
|
414
|
+
end
|
415
|
+
end
|
416
|
+
|
409
417
|
# Init program
|
410
418
|
def init_program(args)
|
411
419
|
init_options()
|
@@ -80,6 +80,10 @@ module Apollo
|
|
80
80
|
self.options[:env] = name
|
81
81
|
end
|
82
82
|
|
83
|
+
opts.on('-d', '--daemon', 'Run Apollo Platform daemon') do
|
84
|
+
self.options[:daemon] = true
|
85
|
+
end
|
86
|
+
|
83
87
|
opts.on('-v', '--verbose', 'Enable verbose output') do
|
84
88
|
self.options[:verbose] = true
|
85
89
|
end
|
@@ -90,6 +94,40 @@ module Apollo
|
|
90
94
|
end
|
91
95
|
end
|
92
96
|
|
97
|
+
def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
|
98
|
+
crawlers.each do |crawler|
|
99
|
+
i = crawler.new
|
100
|
+
puts "Queuying Crawler base URL: '#{i.url}'" if opts[:verbose]
|
101
|
+
qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
|
102
|
+
qu.save
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def init_fetchers(amqp, opts={})
|
107
|
+
fetchers = []
|
108
|
+
fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
|
109
|
+
|
110
|
+
enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
|
111
|
+
|
112
|
+
# ch = self.amqp.create_channel
|
113
|
+
# x = ch.default_exchange
|
114
|
+
# x.publish("Hello!", :routing_key => "fetcher")
|
115
|
+
end
|
116
|
+
|
117
|
+
def init_agents(amqp, opts={})
|
118
|
+
puts "Initializing agents"
|
119
|
+
|
120
|
+
init_fetchers(amqp, opts)
|
121
|
+
end
|
122
|
+
|
123
|
+
def init_program(args)
|
124
|
+
res = super(args)
|
125
|
+
return res unless res.nil?
|
126
|
+
|
127
|
+
init_agents(self.amqp, self.options)
|
128
|
+
return nil
|
129
|
+
end
|
130
|
+
|
93
131
|
def process_options(args)
|
94
132
|
if(self.options[:version])
|
95
133
|
puts Apollo::VERSION
|
@@ -105,30 +143,21 @@ module Apollo
|
|
105
143
|
return nil
|
106
144
|
end
|
107
145
|
|
108
|
-
def init_mongo()
|
109
|
-
self.mongo = Mongo::Connection.new(self.config['mongo']['host'])
|
110
|
-
self.mongo_db = self.mongo.db(self.config['mongo']['db'])
|
111
|
-
|
112
|
-
if(self.options[:verbose])
|
113
|
-
puts "(Mongo) Connection Inited: #{self.mongo.inspect}"
|
114
|
-
puts "(Mongo) Database Inited: #{self.mongo_db.inspect}"
|
115
|
-
end
|
116
|
-
|
117
|
-
return self.mongo
|
118
|
-
end
|
119
|
-
|
120
146
|
# Run Program
|
121
147
|
def run(args = ARGV)
|
122
148
|
res = super(args)
|
123
149
|
return res unless res.nil?
|
124
|
-
|
125
|
-
# Print classes
|
126
|
-
# puts Apollo::Crawler::BaseCrawler.subclasses.inspect
|
127
150
|
|
128
151
|
# Here we start
|
129
|
-
if(ARGV.length < 1)
|
130
|
-
|
131
|
-
|
152
|
+
# if(ARGV.length < 1)
|
153
|
+
# puts optparser
|
154
|
+
# return 0
|
155
|
+
# end
|
156
|
+
|
157
|
+
res_code = 0
|
158
|
+
if(self.options[:daemon])
|
159
|
+
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
|
160
|
+
res_code = planner.run(self.options)
|
132
161
|
end
|
133
162
|
|
134
163
|
return request_exit(res_code)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.22
|
4
|
+
version: 0.1.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
11
|
+
date: 2013-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -28,28 +28,28 @@ dependencies:
|
|
28
28
|
name: activesupport
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - '>='
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 3.2.12
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.12
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: dalli
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: 2.6.2
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.6.2
|
55
55
|
- !ruby/object:Gem::Dependency
|
@@ -70,42 +70,42 @@ dependencies:
|
|
70
70
|
name: eventmachine
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - '>='
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: em-http-request
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: em-synchrony
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '>='
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '0'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - '>='
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
- !ruby/object:Gem::Dependency
|
@@ -154,56 +154,56 @@ dependencies:
|
|
154
154
|
name: memcache-client
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- -
|
157
|
+
- - '>='
|
158
158
|
- !ruby/object:Gem::Version
|
159
159
|
version: '0'
|
160
160
|
type: :runtime
|
161
161
|
prerelease: false
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- -
|
164
|
+
- - '>='
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
167
|
- !ruby/object:Gem::Dependency
|
168
168
|
name: mongo
|
169
169
|
requirement: !ruby/object:Gem::Requirement
|
170
170
|
requirements:
|
171
|
-
- -
|
171
|
+
- - '>='
|
172
172
|
- !ruby/object:Gem::Version
|
173
173
|
version: 1.8.2
|
174
174
|
type: :runtime
|
175
175
|
prerelease: false
|
176
176
|
version_requirements: !ruby/object:Gem::Requirement
|
177
177
|
requirements:
|
178
|
-
- -
|
178
|
+
- - '>='
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.8.2
|
181
181
|
- !ruby/object:Gem::Dependency
|
182
182
|
name: mongoid
|
183
183
|
requirement: !ruby/object:Gem::Requirement
|
184
184
|
requirements:
|
185
|
-
- -
|
185
|
+
- - '>='
|
186
186
|
- !ruby/object:Gem::Version
|
187
187
|
version: 3.1.2
|
188
188
|
type: :runtime
|
189
189
|
prerelease: false
|
190
190
|
version_requirements: !ruby/object:Gem::Requirement
|
191
191
|
requirements:
|
192
|
-
- -
|
192
|
+
- - '>='
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: 3.1.2
|
195
195
|
- !ruby/object:Gem::Dependency
|
196
196
|
name: mime-types
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
198
198
|
requirements:
|
199
|
-
- -
|
199
|
+
- - '>='
|
200
200
|
- !ruby/object:Gem::Version
|
201
201
|
version: '0'
|
202
202
|
type: :runtime
|
203
203
|
prerelease: false
|
204
204
|
version_requirements: !ruby/object:Gem::Requirement
|
205
205
|
requirements:
|
206
|
-
- -
|
206
|
+
- - '>='
|
207
207
|
- !ruby/object:Gem::Version
|
208
208
|
version: '0'
|
209
209
|
- !ruby/object:Gem::Dependency
|
@@ -224,70 +224,70 @@ dependencies:
|
|
224
224
|
name: openurl
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
|
-
- -
|
227
|
+
- - '>='
|
228
228
|
- !ruby/object:Gem::Version
|
229
229
|
version: 0.4.2
|
230
230
|
type: :runtime
|
231
231
|
prerelease: false
|
232
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
233
|
requirements:
|
234
|
-
- -
|
234
|
+
- - '>='
|
235
235
|
- !ruby/object:Gem::Version
|
236
236
|
version: 0.4.2
|
237
237
|
- !ruby/object:Gem::Dependency
|
238
238
|
name: parallel
|
239
239
|
requirement: !ruby/object:Gem::Requirement
|
240
240
|
requirements:
|
241
|
-
- -
|
241
|
+
- - '>='
|
242
242
|
- !ruby/object:Gem::Version
|
243
243
|
version: 0.6.2
|
244
244
|
type: :runtime
|
245
245
|
prerelease: false
|
246
246
|
version_requirements: !ruby/object:Gem::Requirement
|
247
247
|
requirements:
|
248
|
-
- -
|
248
|
+
- - '>='
|
249
249
|
- !ruby/object:Gem::Version
|
250
250
|
version: 0.6.2
|
251
251
|
- !ruby/object:Gem::Dependency
|
252
252
|
name: rack
|
253
253
|
requirement: !ruby/object:Gem::Requirement
|
254
254
|
requirements:
|
255
|
-
- -
|
255
|
+
- - '>='
|
256
256
|
- !ruby/object:Gem::Version
|
257
257
|
version: 1.5.2
|
258
258
|
type: :runtime
|
259
259
|
prerelease: false
|
260
260
|
version_requirements: !ruby/object:Gem::Requirement
|
261
261
|
requirements:
|
262
|
-
- -
|
262
|
+
- - '>='
|
263
263
|
- !ruby/object:Gem::Version
|
264
264
|
version: 1.5.2
|
265
265
|
- !ruby/object:Gem::Dependency
|
266
266
|
name: right_aws
|
267
267
|
requirement: !ruby/object:Gem::Requirement
|
268
268
|
requirements:
|
269
|
-
- -
|
269
|
+
- - '>='
|
270
270
|
- !ruby/object:Gem::Version
|
271
271
|
version: 3.0.5
|
272
272
|
type: :runtime
|
273
273
|
prerelease: false
|
274
274
|
version_requirements: !ruby/object:Gem::Requirement
|
275
275
|
requirements:
|
276
|
-
- -
|
276
|
+
- - '>='
|
277
277
|
- !ruby/object:Gem::Version
|
278
278
|
version: 3.0.5
|
279
279
|
- !ruby/object:Gem::Dependency
|
280
280
|
name: right_http_connection
|
281
281
|
requirement: !ruby/object:Gem::Requirement
|
282
282
|
requirements:
|
283
|
-
- -
|
283
|
+
- - '>='
|
284
284
|
- !ruby/object:Gem::Version
|
285
285
|
version: 1.3.0
|
286
286
|
type: :runtime
|
287
287
|
prerelease: false
|
288
288
|
version_requirements: !ruby/object:Gem::Requirement
|
289
289
|
requirements:
|
290
|
-
- -
|
290
|
+
- - '>='
|
291
291
|
- !ruby/object:Gem::Version
|
292
292
|
version: 1.3.0
|
293
293
|
- !ruby/object:Gem::Dependency
|
@@ -336,146 +336,155 @@ dependencies:
|
|
336
336
|
name: ffi
|
337
337
|
requirement: !ruby/object:Gem::Requirement
|
338
338
|
requirements:
|
339
|
-
- -
|
339
|
+
- - '>='
|
340
340
|
- !ruby/object:Gem::Version
|
341
341
|
version: 1.4.0
|
342
342
|
type: :runtime
|
343
343
|
prerelease: false
|
344
344
|
version_requirements: !ruby/object:Gem::Requirement
|
345
345
|
requirements:
|
346
|
-
- -
|
346
|
+
- - '>='
|
347
347
|
- !ruby/object:Gem::Version
|
348
348
|
version: 1.4.0
|
349
349
|
- !ruby/object:Gem::Dependency
|
350
350
|
name: guard
|
351
351
|
requirement: !ruby/object:Gem::Requirement
|
352
352
|
requirements:
|
353
|
-
- -
|
353
|
+
- - '>='
|
354
354
|
- !ruby/object:Gem::Version
|
355
355
|
version: 1.6.2
|
356
356
|
type: :development
|
357
357
|
prerelease: false
|
358
358
|
version_requirements: !ruby/object:Gem::Requirement
|
359
359
|
requirements:
|
360
|
-
- -
|
360
|
+
- - '>='
|
361
361
|
- !ruby/object:Gem::Version
|
362
362
|
version: 1.6.2
|
363
363
|
- !ruby/object:Gem::Dependency
|
364
364
|
name: guard-rake
|
365
365
|
requirement: !ruby/object:Gem::Requirement
|
366
366
|
requirements:
|
367
|
-
- -
|
367
|
+
- - '>='
|
368
368
|
- !ruby/object:Gem::Version
|
369
369
|
version: 0.0.7
|
370
370
|
type: :development
|
371
371
|
prerelease: false
|
372
372
|
version_requirements: !ruby/object:Gem::Requirement
|
373
373
|
requirements:
|
374
|
-
- -
|
374
|
+
- - '>='
|
375
375
|
- !ruby/object:Gem::Version
|
376
376
|
version: 0.0.7
|
377
377
|
- !ruby/object:Gem::Dependency
|
378
378
|
name: guard-rspec
|
379
379
|
requirement: !ruby/object:Gem::Requirement
|
380
380
|
requirements:
|
381
|
-
- -
|
381
|
+
- - '>='
|
382
382
|
- !ruby/object:Gem::Version
|
383
383
|
version: 2.5.0
|
384
384
|
type: :development
|
385
385
|
prerelease: false
|
386
386
|
version_requirements: !ruby/object:Gem::Requirement
|
387
387
|
requirements:
|
388
|
-
- -
|
388
|
+
- - '>='
|
389
389
|
- !ruby/object:Gem::Version
|
390
390
|
version: 2.5.0
|
391
391
|
description: Gem for crawling data from external sources
|
392
392
|
email: korczis@gmail.com
|
393
393
|
executables:
|
394
|
+
- apollo-console
|
394
395
|
- apollo-crawler
|
395
396
|
- apollo-platform
|
396
397
|
extensions: []
|
397
398
|
extra_rdoc_files: []
|
398
399
|
files:
|
399
|
-
- ./config/
|
400
|
-
- ./config/mongo.yml
|
401
|
-
- ./config/memcached.yml
|
402
|
-
- ./config/mongoid.yml
|
403
|
-
- ./config/apollo.yml.default
|
400
|
+
- ./config/amqp.yml
|
404
401
|
- ./config/amqp.yml.default
|
402
|
+
- ./config/apollo.yml
|
403
|
+
- ./config/apollo.yml.default
|
404
|
+
- ./config/deploy.rb
|
405
|
+
- ./config/memcached.yml
|
405
406
|
- ./config/memcached.yml.default
|
406
|
-
- ./config/
|
407
|
+
- ./config/mongo.yml
|
408
|
+
- ./config/mongo.yml.default
|
409
|
+
- ./config/mongoid.yml
|
407
410
|
- ./config/mongoid.yml.default
|
408
|
-
- ./
|
409
|
-
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
410
|
-
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
411
|
-
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
412
|
-
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
413
|
-
- ./lib/apollo_crawler/planner/base_planner.rb
|
414
|
-
- ./lib/apollo_crawler/planner/planners.rb
|
415
|
-
- ./lib/apollo_crawler/planner/smart_planner.rb
|
416
|
-
- ./lib/apollo_crawler/lib.rb
|
417
|
-
- ./lib/apollo_crawler/version.rb
|
418
|
-
- ./lib/apollo_crawler/program/console_program.rb
|
419
|
-
- ./lib/apollo_crawler/program/platform_program.rb
|
420
|
-
- ./lib/apollo_crawler/program/crawler_program.rb
|
421
|
-
- ./lib/apollo_crawler/program/base_program.rb
|
422
|
-
- ./lib/apollo_crawler/program/programs.rb
|
423
|
-
- ./lib/apollo_crawler/logger/console_logger.rb
|
424
|
-
- ./lib/apollo_crawler/logger/base_logger.rb
|
425
|
-
- ./lib/apollo_crawler/logger/loggers.rb
|
426
|
-
- ./lib/apollo_crawler/helper/core_helper.rb
|
427
|
-
- ./lib/apollo_crawler/helper/amqp_helper.rb
|
428
|
-
- ./lib/apollo_crawler/helper/helpers.rb
|
429
|
-
- ./lib/apollo_crawler/helper/mongo_helper.rb
|
411
|
+
- ./lib/apollo_crawler.rb
|
430
412
|
- ./lib/apollo_crawler/adapter/adapters.rb
|
431
|
-
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
432
413
|
- ./lib/apollo_crawler/adapter/amqp_adapter.rb
|
433
|
-
- ./lib/apollo_crawler/
|
434
|
-
- ./lib/apollo_crawler/
|
414
|
+
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
415
|
+
- ./lib/apollo_crawler/agent/agents.rb
|
416
|
+
- ./lib/apollo_crawler/agent/base_agent.rb
|
417
|
+
- ./lib/apollo_crawler/agent/fetcher_agent.rb
|
418
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
419
|
+
- ./lib/apollo_crawler/cache/caches.rb
|
435
420
|
- ./lib/apollo_crawler/cache/factory.rb
|
436
|
-
- ./lib/apollo_crawler/cache/
|
421
|
+
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
437
422
|
- ./lib/apollo_crawler/cache/memory_cache.rb
|
438
|
-
- ./lib/apollo_crawler/cache/base_cache.rb
|
439
423
|
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
440
|
-
- ./lib/apollo_crawler/cache/
|
441
|
-
- ./lib/apollo_crawler/cache/
|
442
|
-
- ./lib/apollo_crawler/
|
443
|
-
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
444
|
-
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
445
|
-
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
446
|
-
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
424
|
+
- ./lib/apollo_crawler/cache/null_cache.rb
|
425
|
+
- ./lib/apollo_crawler/cache/sqlite_cache.rb
|
426
|
+
- ./lib/apollo_crawler/config.rb
|
447
427
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
448
428
|
- ./lib/apollo_crawler/crawler/crawlers.rb
|
429
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
430
|
+
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
431
|
+
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
449
432
|
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
433
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
434
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
450
435
|
- ./lib/apollo_crawler/env.rb
|
451
|
-
- ./lib/apollo_crawler/
|
452
|
-
- ./lib/apollo_crawler/
|
453
|
-
- ./lib/apollo_crawler/
|
436
|
+
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
437
|
+
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
438
|
+
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
439
|
+
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
454
440
|
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
441
|
+
- ./lib/apollo_crawler/formatter/formatters.rb
|
455
442
|
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
456
443
|
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
457
|
-
- ./lib/apollo_crawler/formatter/
|
458
|
-
- ./lib/apollo_crawler/
|
444
|
+
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
445
|
+
- ./lib/apollo_crawler/helper/amqp_helper.rb
|
446
|
+
- ./lib/apollo_crawler/helper/core_helper.rb
|
447
|
+
- ./lib/apollo_crawler/helper/helpers.rb
|
448
|
+
- ./lib/apollo_crawler/helper/mongo_helper.rb
|
449
|
+
- ./lib/apollo_crawler/lib.rb
|
450
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
451
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
452
|
+
- ./lib/apollo_crawler/logger/loggers.rb
|
453
|
+
- ./lib/apollo_crawler/model/base_model.rb
|
454
|
+
- ./lib/apollo_crawler/model/crawler.rb
|
455
|
+
- ./lib/apollo_crawler/model/models.rb
|
456
|
+
- ./lib/apollo_crawler/model/queued_url.rb
|
457
|
+
- ./lib/apollo_crawler/model/raw_document.rb
|
458
|
+
- ./lib/apollo_crawler/planner/base_planner.rb
|
459
|
+
- ./lib/apollo_crawler/planner/planners.rb
|
460
|
+
- ./lib/apollo_crawler/planner/smart_planner.rb
|
461
|
+
- ./lib/apollo_crawler/program/base_program.rb
|
462
|
+
- ./lib/apollo_crawler/program/console_program.rb
|
463
|
+
- ./lib/apollo_crawler/program/crawler_program.rb
|
464
|
+
- ./lib/apollo_crawler/program/platform_program.rb
|
465
|
+
- ./lib/apollo_crawler/program/programs.rb
|
459
466
|
- ./lib/apollo_crawler/store/base_store.rb
|
460
|
-
- ./lib/apollo_crawler.rb
|
467
|
+
- ./lib/apollo_crawler/store/stores.rb
|
468
|
+
- ./lib/apollo_crawler/version.rb
|
469
|
+
- bin/apollo-console
|
461
470
|
- bin/apollo-crawler
|
462
471
|
- bin/apollo-platform
|
463
472
|
homepage: http://apollocrawler.com/
|
464
473
|
licenses:
|
465
474
|
- MIT
|
466
475
|
metadata: {}
|
467
|
-
post_install_message:
|
476
|
+
post_install_message: Thank you for installing Apollo Crawler!
|
468
477
|
rdoc_options: []
|
469
478
|
require_paths:
|
470
479
|
- lib
|
471
480
|
required_ruby_version: !ruby/object:Gem::Requirement
|
472
481
|
requirements:
|
473
|
-
- -
|
482
|
+
- - '>='
|
474
483
|
- !ruby/object:Gem::Version
|
475
484
|
version: 1.9.3
|
476
485
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
477
486
|
requirements:
|
478
|
-
- -
|
487
|
+
- - '>='
|
479
488
|
- !ruby/object:Gem::Version
|
480
489
|
version: 1.8.11
|
481
490
|
requirements: []
|