apollo-crawler 0.1.22 → 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +6 -14
- data/bin/apollo-console +30 -0
- data/config/amqp.yml +4 -3
- data/config/deploy.rb +61 -0
- data/config/mongoid.yml +9 -6
- data/lib/apollo_crawler.rb +3 -0
- data/lib/apollo_crawler/agent/agents.rb +2 -1
- data/lib/apollo_crawler/agent/base_agent.rb +3 -0
- data/lib/apollo_crawler/agent/fetcher_agent.rb +55 -0
- data/lib/apollo_crawler/config.rb +2 -2
- data/lib/apollo_crawler/helper/amqp_helper.rb +21 -0
- data/lib/apollo_crawler/helper/core_helper.rb +8 -0
- data/lib/apollo_crawler/helper/mongo_helper.rb +16 -0
- data/lib/apollo_crawler/lib.rb +3 -0
- data/lib/apollo_crawler/model/base_model.rb +29 -0
- data/lib/apollo_crawler/model/crawler.rb +39 -0
- data/lib/apollo_crawler/model/models.rb +24 -0
- data/lib/apollo_crawler/model/queued_url.rb +38 -0
- data/lib/apollo_crawler/model/raw_document.rb +37 -0
- data/lib/apollo_crawler/planner/base_planner.rb +4 -1
- data/lib/apollo_crawler/planner/smart_planner.rb +49 -0
- data/lib/apollo_crawler/program/base_program.rb +59 -2
- data/lib/apollo_crawler/program/crawler_program.rb +18 -10
- data/lib/apollo_crawler/program/platform_program.rb +47 -18
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +96 -87
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
ZDc0YmI0MjEyNTgzNTM0NmY2NGE2YTY0Y2MxNTdiNTBkN2M3ZDM2MDdmNjZm
|
10
|
-
MjNhZDVlNmRkNDdkMDVhNzhjNzg0ZDQ2ZjRkYThhNzJlODMxNjE3NmE4MjVm
|
11
|
-
ZDg0ZjFlZWFjM2I0NWQ0ZmIzYmM5ZmY0MTRiOGE2YTMwZDVmYzE=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
N2I5ZGM1NjI2M2QxN2FmMDZkMThkOGU5NDE5MTEyOTNlZWFkNGQ1N2FlZjFi
|
14
|
-
MTI5ZGNhNjdmZTAyZjAyYTVkZWFlNGJmZDk5YzA3ZjlhN2Q5MTc1NTIyNGVi
|
15
|
-
M2VjODgxNzQyYTAxNzQ5NWQ5MTQzZjUxNWY5MWZlNDQzNjg2YmQ=
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71fb379b6ae32ceb79e40cce451c8a3646278d32
|
4
|
+
data.tar.gz: 08fdec629298945a86993b91be3a743b880ad4a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d0789d2ef99358144c90d148c378d9bf53e3084b326ede425ed7ff171ab450a4ed9de7a86494dbf49992ed235807f92e6a795cc52676bd61280a006408fc4e90
|
7
|
+
data.tar.gz: 16ac99fc7e192fe137348c6d364e730c9c0071f274f200b395746c4247bb3958c7238b315b571a730014b5f8286602bbc67eb9f26db30c217440e15401d3ac95
|
data/bin/apollo-console
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
# of this software and associated documentation files (the "Software"), to deal
|
8
|
+
# in the Software without restriction, including without limitation the rights
|
9
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
# copies of the Software, and to permit persons to whom the Software is
|
11
|
+
# furnished to do so, subject to the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be included in
|
14
|
+
# all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
# THE SOFTWARE.
|
23
|
+
|
24
|
+
require "rubygems"
|
25
|
+
require "bundler/setup"
|
26
|
+
|
27
|
+
require File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler")
|
28
|
+
|
29
|
+
Apollo::ConsoleProgram.new.run(ARGV)
|
30
|
+
|
data/config/amqp.yml
CHANGED
@@ -2,17 +2,18 @@ default: &default
|
|
2
2
|
host: localhost
|
3
3
|
username: guest
|
4
4
|
password: guest
|
5
|
+
port: 5672
|
5
6
|
|
6
7
|
development:
|
7
8
|
<<: *default
|
8
|
-
vhost: apollo-crawler-development
|
9
|
+
vhost: /apollo-crawler-development
|
9
10
|
|
10
11
|
test:
|
11
12
|
<<: *default
|
12
13
|
host: apollo-crawler.no-ip.org
|
13
|
-
vhost: apollo-crawler-test
|
14
|
+
vhost: /apollo-crawler-test
|
14
15
|
|
15
16
|
production:
|
16
17
|
<<: *default
|
17
18
|
host: apollo-crawler.no-ip.org
|
18
|
-
vhost: apollo-crawler-production
|
19
|
+
vhost: /apollo-crawler-production
|
data/config/deploy.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'capistrano'
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require "bundler/setup"
|
5
|
+
require "bundler/capistrano"
|
6
|
+
|
7
|
+
# RVM integration
|
8
|
+
require "rvm/capistrano"
|
9
|
+
|
10
|
+
# Target ruby version
|
11
|
+
set :rvm_ruby_string, '2.0.0'
|
12
|
+
|
13
|
+
set :domain, "apollo-crawler.no-ip.org"
|
14
|
+
set :application, "apollo_platform"
|
15
|
+
# set :deploy_to, File.join(File.expand_path("~"), "/apps/#{application}")
|
16
|
+
set :deploy_to, "/home/ubuntu/apps/#{application}"
|
17
|
+
|
18
|
+
ssh_options[:keys] = [File.join(ENV["HOME"], ".ssh", "key-webs.pem")]
|
19
|
+
|
20
|
+
set :user, "ubuntu"
|
21
|
+
set :use_sudo, false
|
22
|
+
|
23
|
+
set :scm, :git
|
24
|
+
set :repository, "https://github.com/korczis/apollo-crawler.git"
|
25
|
+
set :branch, 'master'
|
26
|
+
set :git_shallow_clone, 1
|
27
|
+
|
28
|
+
role :web, domain
|
29
|
+
role :app, domain
|
30
|
+
role :db, domain, :primary => true
|
31
|
+
|
32
|
+
set :deploy_via, :remote_cache
|
33
|
+
|
34
|
+
namespace :deploy do
|
35
|
+
def remote_cmd(cmd)
|
36
|
+
run "cd #{deploy_to}/current && #{cmd}"
|
37
|
+
end
|
38
|
+
|
39
|
+
task :start, :roles => [:web, :app] do
|
40
|
+
puts "Starting.."
|
41
|
+
remote_cmd "./bin/apollo-platform -V"
|
42
|
+
end
|
43
|
+
|
44
|
+
task :stop, :roles => [:web, :app] do
|
45
|
+
puts "Stopping.."
|
46
|
+
end
|
47
|
+
|
48
|
+
task :status, :roles => [:web, :app] do
|
49
|
+
puts "Statusing.."
|
50
|
+
end
|
51
|
+
|
52
|
+
task :restart, :roles => [:web, :app] do
|
53
|
+
puts "Restarting.."
|
54
|
+
end
|
55
|
+
|
56
|
+
# This will make sure that Capistrano doesn't try to run rake:migrate (this is not a Rails project!)
|
57
|
+
task :cold do
|
58
|
+
deploy.update
|
59
|
+
deploy.start
|
60
|
+
end
|
61
|
+
end
|
data/config/mongoid.yml
CHANGED
@@ -1,23 +1,26 @@
|
|
1
|
-
default: &
|
1
|
+
default: &default_options
|
2
2
|
sessions:
|
3
3
|
default:
|
4
4
|
hosts:
|
5
5
|
- apollo-crawler.no-ip.org:27017
|
6
6
|
|
7
7
|
development:
|
8
|
-
<<: *default
|
9
8
|
sessions:
|
10
9
|
default:
|
10
|
+
hosts:
|
11
|
+
- localhost:27017
|
11
12
|
database: apollo-crawler-development
|
12
13
|
|
13
14
|
test:
|
14
|
-
<<: *default
|
15
15
|
sessions:
|
16
16
|
default:
|
17
|
-
|
17
|
+
hosts:
|
18
|
+
- apollo-crawler.no-ip.org:27017
|
19
|
+
database: apollo-crawler-test
|
18
20
|
|
19
21
|
production:
|
20
|
-
<<: *default
|
21
22
|
sessions:
|
22
23
|
default:
|
23
|
-
|
24
|
+
hosts:
|
25
|
+
- apollo-crawler.no-ip.org:27017
|
26
|
+
database: apollo-crawler-production
|
data/lib/apollo_crawler.rb
CHANGED
@@ -48,6 +48,9 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/helper/helpers')
|
|
48
48
|
# Loggers
|
49
49
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/logger/loggers')
|
50
50
|
|
51
|
+
# Models
|
52
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/model/models')
|
53
|
+
|
51
54
|
# Planner
|
52
55
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/planner/planners')
|
53
56
|
|
@@ -18,4 +18,5 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
-
require File.join(File.dirname(__FILE__), 'base_agent')
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
22
|
+
require File.join(File.dirname(__FILE__), 'fetcher_agent')
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_agent')
|
22
|
+
require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
|
23
|
+
|
24
|
+
module Apollo
|
25
|
+
module Agent
|
26
|
+
class FetcherAgent < BaseAgent
|
27
|
+
attr_accessor :fetcher
|
28
|
+
|
29
|
+
def initialize(amqp, opts={})
|
30
|
+
self.fetcher = Apollo::Fetcher::SmartFetcher.new
|
31
|
+
|
32
|
+
if(opts[:verbose])
|
33
|
+
puts "Initializing fetcher agent..."
|
34
|
+
end
|
35
|
+
|
36
|
+
ch = amqp.create_channel
|
37
|
+
q = ch.queue("fetcher", :auto_delete => false, :durable => true)
|
38
|
+
x = ch.default_exchange
|
39
|
+
|
40
|
+
q.subscribe do |delivery_info, metadata, payload|
|
41
|
+
res = nil
|
42
|
+
|
43
|
+
puts "Received #{payload}" if opts[:verbose]
|
44
|
+
|
45
|
+
Thread.new do |t|
|
46
|
+
queued_url = JSON.parse(payload)
|
47
|
+
# puts queued_url["url"]
|
48
|
+
# res = Apollo::Fetcher::SmartFetcher::fetch(queued_url["url"])
|
49
|
+
# puts "#{queued_url['url']} - " + res.inspect
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end # class FetcherAgent
|
54
|
+
end # module Agent
|
55
|
+
end # module Apollo
|
@@ -87,8 +87,8 @@ module Apollo
|
|
87
87
|
}
|
88
88
|
|
89
89
|
# Used caching mechanism by default
|
90
|
-
CACHE_CLASS = Apollo::Cache::
|
91
|
-
CACHE_CLASS_OPTIONS =
|
90
|
+
CACHE_CLASS = Apollo::Cache::MemcachedCache
|
91
|
+
CACHE_CLASS_OPTIONS = nil
|
92
92
|
|
93
93
|
############################################################
|
94
94
|
# Crawlers - Built-in out-of box working crawlers
|
@@ -18,9 +18,30 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require 'amqp'
|
22
|
+
require 'bunny'
|
23
|
+
require 'thread'
|
24
|
+
|
21
25
|
module Apollo
|
22
26
|
module Helper
|
23
27
|
module Amqp
|
28
|
+
def self.connect(conn, opts={})
|
29
|
+
res = nil
|
30
|
+
|
31
|
+
if(opts[:verbose])
|
32
|
+
puts "AMQP Connecting - #{conn.inspect}"
|
33
|
+
end
|
34
|
+
|
35
|
+
res = Bunny.new(:host => conn['host'], :user => conn['username'], :password => conn['password'], :vhost => conn['vhost'], :port => conn['port'])
|
36
|
+
res.start
|
37
|
+
|
38
|
+
sleep(0.001) until res
|
39
|
+
if(opts[:verbose])
|
40
|
+
puts "AMQP connected - #{res.inspect}"
|
41
|
+
end
|
42
|
+
|
43
|
+
return res
|
44
|
+
end
|
24
45
|
end # Amqp
|
25
46
|
end # module Helper
|
26
47
|
end # module Apollo
|
@@ -18,9 +18,25 @@
|
|
18
18
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
19
|
# THE SOFTWARE.
|
20
20
|
|
21
|
+
require 'mongo'
|
22
|
+
require 'mongoid'
|
23
|
+
|
21
24
|
module Apollo
|
22
25
|
module Helper
|
23
26
|
module Mongo
|
27
|
+
def self.connect(conn, opts={})
|
28
|
+
if(opts[:verbose])
|
29
|
+
puts "MongoDB connecting - '#{conn.inspect}"
|
30
|
+
end
|
31
|
+
|
32
|
+
res = ::Mongo::Connection.new(conn['host'])
|
33
|
+
|
34
|
+
if(opts[:verbose])
|
35
|
+
puts "MongoDB connected: #{res.inspect}"
|
36
|
+
end
|
37
|
+
|
38
|
+
return res
|
39
|
+
end
|
24
40
|
end # Mongo
|
25
41
|
end # module Helper
|
26
42
|
end # module Apollo
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -45,6 +45,9 @@ require File.join(File.dirname(__FILE__), 'helper/helpers')
|
|
45
45
|
# Loggers
|
46
46
|
require File.join(File.dirname(__FILE__), 'logger/loggers')
|
47
47
|
|
48
|
+
# Models
|
49
|
+
require File.join(File.dirname(__FILE__), 'model/models')
|
50
|
+
|
48
51
|
# Programs
|
49
52
|
require File.join(File.dirname(__FILE__), 'planner/planners')
|
50
53
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require 'mongo'
|
22
|
+
require 'mongoid'
|
23
|
+
|
24
|
+
module Apollo
|
25
|
+
module Model
|
26
|
+
class BaseModel
|
27
|
+
end # class BaseModel
|
28
|
+
end # module Model
|
29
|
+
end # module Apollo
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class Crawler < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "crawlers"
|
30
|
+
|
31
|
+
field :name
|
32
|
+
field :class_name
|
33
|
+
field :source
|
34
|
+
|
35
|
+
# Indexes
|
36
|
+
index({ created_at: 1, updated_at: 1, name: 1, class_name: 1 })
|
37
|
+
end # class Crawler
|
38
|
+
end # module Model
|
39
|
+
end # module Apollo
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
require File.join(File.dirname(__FILE__), 'crawler')
|
23
|
+
require File.join(File.dirname(__FILE__), 'queued_url')
|
24
|
+
require File.join(File.dirname(__FILE__), 'raw_document')
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class QueuedUrl < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "queued_urls"
|
30
|
+
|
31
|
+
field :url
|
32
|
+
field :state
|
33
|
+
|
34
|
+
# Indexes
|
35
|
+
index({ created_at: 1, updated_at: 1 })
|
36
|
+
end # class QueuedUrl
|
37
|
+
end # module Model
|
38
|
+
end # module Apollo
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
require File.join(File.dirname(__FILE__), 'base_model')
|
22
|
+
|
23
|
+
module Apollo
|
24
|
+
module Model
|
25
|
+
class RawDocument < BaseModel
|
26
|
+
include Mongoid::Document
|
27
|
+
include Mongoid::Timestamps
|
28
|
+
|
29
|
+
store_in collection: "raw_docs"
|
30
|
+
|
31
|
+
field :body
|
32
|
+
|
33
|
+
# Indexes
|
34
|
+
index({ created_at: 1, updated_at: 1 })
|
35
|
+
end # class RawDocument
|
36
|
+
end # module Model
|
37
|
+
end # module Apollo
|
@@ -20,9 +20,58 @@
|
|
20
20
|
|
21
21
|
require File.join(File.dirname(__FILE__),'base_planner')
|
22
22
|
|
23
|
+
require File.join(File.dirname(__FILE__),'../model/models.rb')
|
24
|
+
|
23
25
|
module Apollo
|
24
26
|
module Planner
|
25
27
|
class SmartPlanner < BasePlanner
|
28
|
+
attr_accessor :amqp
|
29
|
+
attr_accessor :mongo
|
30
|
+
|
31
|
+
def initialize(amqp=nil, mongo=nil)
|
32
|
+
self.amqp = amqp
|
33
|
+
self.mongo = mongo
|
34
|
+
end
|
35
|
+
|
36
|
+
def fetch_url(url, opts={})
|
37
|
+
puts "AMQP fetching '#{url.inspect}'"
|
38
|
+
|
39
|
+
ch = amqp.create_channel
|
40
|
+
x = ch.default_exchange
|
41
|
+
x.publish(url.to_json, :routing_key => "fetcher")
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
def fetch_queued_urls(opts={})
|
46
|
+
urls = Apollo::Model::QueuedUrl.where({:state => :queued})
|
47
|
+
return if urls.count < 1
|
48
|
+
|
49
|
+
if(opts[:verbose])
|
50
|
+
puts "Fetching Queued URLS"
|
51
|
+
end
|
52
|
+
|
53
|
+
puts "Count of URLs in Queue: #{urls.count}" if opts[:verbose]
|
54
|
+
|
55
|
+
urls.each do |url|
|
56
|
+
url.state = :fetching
|
57
|
+
url.save
|
58
|
+
|
59
|
+
fetch_url(url, opts)
|
60
|
+
|
61
|
+
# puts "Removing URL from Queue '#{url.inspect}'" if opts[:verbose]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def run(opts={})
|
66
|
+
request_exit = false
|
67
|
+
|
68
|
+
while request_exit == false
|
69
|
+
fetch_queued_urls(opts)
|
70
|
+
sleep 1
|
71
|
+
end
|
72
|
+
|
73
|
+
return 0
|
74
|
+
end
|
26
75
|
end # class SmartPlanner
|
27
76
|
end # module Planner
|
28
77
|
end # module Apollo
|
@@ -20,6 +20,8 @@
|
|
20
20
|
|
21
21
|
require 'yaml'
|
22
22
|
|
23
|
+
require File.join(File.dirname(__FILE__), "../model/models.rb")
|
24
|
+
|
23
25
|
module Apollo
|
24
26
|
class BaseProgram
|
25
27
|
CONFIG_DIR = File.join(Apollo::BASE_DIR, "config")
|
@@ -33,16 +35,16 @@ module Apollo
|
|
33
35
|
attr_accessor :options
|
34
36
|
attr_accessor :optparser
|
35
37
|
|
38
|
+
attr_accessor :amqp
|
36
39
|
attr_accessor :mongo
|
37
|
-
attr_accessor :mongo_db
|
38
40
|
|
39
41
|
def initialize
|
40
42
|
self.config = {}
|
41
43
|
self.options = DEFAULT_OPTIONS
|
42
44
|
self.optparser = nil
|
43
45
|
|
46
|
+
self.amqp = nil
|
44
47
|
self.mongo = nil
|
45
|
-
self.mongo_db = nil
|
46
48
|
end
|
47
49
|
|
48
50
|
def self.get_config_path(config)
|
@@ -113,6 +115,55 @@ module Apollo
|
|
113
115
|
return nil
|
114
116
|
end
|
115
117
|
|
118
|
+
def init_amqp()
|
119
|
+
conn_opts = self.config["amqp"]
|
120
|
+
if(conn_opts)
|
121
|
+
self.amqp = Apollo::Helper::Amqp::connect(conn_opts, self.options)
|
122
|
+
end
|
123
|
+
|
124
|
+
return self.amqp
|
125
|
+
end
|
126
|
+
|
127
|
+
def init_mongo()
|
128
|
+
conn_opts = self.config["mongo"]
|
129
|
+
if(conn_opts)
|
130
|
+
self.mongo = Apollo::Helper::Mongo::connect(conn_opts, self.options)
|
131
|
+
|
132
|
+
# Init Mongoid
|
133
|
+
path = File.join(Apollo::BASE_DIR, "config/mongoid.yml")
|
134
|
+
Mongoid.load!(path, @options[:env])
|
135
|
+
end
|
136
|
+
|
137
|
+
return self.mongo
|
138
|
+
end
|
139
|
+
|
140
|
+
def init_seeds_crawlers(opts={})
|
141
|
+
objs = Apollo::Crawler::BaseCrawler.subclasses
|
142
|
+
objs.each do |o|
|
143
|
+
crawler = Apollo::Model::Crawler.new
|
144
|
+
i = o.new
|
145
|
+
crawler.name = i.name
|
146
|
+
crawler.class_name = o.to_s
|
147
|
+
|
148
|
+
res = Apollo::Model::Crawler.where(class_name: crawler.class_name)
|
149
|
+
# puts "RES: '#{res.inspect}'"
|
150
|
+
if(res.nil? || res.count < 1)
|
151
|
+
crawler.save
|
152
|
+
if(opts[:verbose])
|
153
|
+
puts "Adding new crawler - '#{crawler.inspect}'"
|
154
|
+
end
|
155
|
+
else
|
156
|
+
if(opts[:verbose])
|
157
|
+
puts "Using crawler - '#{res[0].inspect}'"
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
def init_seeds(opts={})
|
164
|
+
init_seeds_crawlers(opts)
|
165
|
+
end
|
166
|
+
|
116
167
|
# Init program
|
117
168
|
def init_program(args)
|
118
169
|
res = nil
|
@@ -134,6 +185,12 @@ module Apollo
|
|
134
185
|
# Init Mongo Connection
|
135
186
|
init_mongo()
|
136
187
|
|
188
|
+
# Init AMQP
|
189
|
+
init_amqp()
|
190
|
+
|
191
|
+
# Init Seed data
|
192
|
+
init_seeds(@options)
|
193
|
+
|
137
194
|
return nil
|
138
195
|
end
|
139
196
|
|
@@ -45,16 +45,6 @@ require File.join(File.dirname(__FILE__), '..', 'version')
|
|
45
45
|
|
46
46
|
require File.join(File.dirname(__FILE__),'base_program')
|
47
47
|
|
48
|
-
|
49
|
-
# Hack
|
50
|
-
class String
|
51
|
-
def to_class
|
52
|
-
self.split('::').inject(Object) do |mod, class_name|
|
53
|
-
mod.const_get(class_name)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
48
|
module Apollo
|
59
49
|
# Apollo Crawler Base Directory
|
60
50
|
APOLLO_CRAWLER_BASE_DIR = File.join(File.dirname(__FILE__), "..")
|
@@ -174,6 +164,8 @@ module Apollo
|
|
174
164
|
|
175
165
|
opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
|
176
166
|
@options[:crawler_dirs] << path
|
167
|
+
|
168
|
+
init_additional_crawlers([path])
|
177
169
|
end
|
178
170
|
|
179
171
|
opts.on('-n', '--doc-limit [NUM]', 'Limit count of documents to be processed') do |count|
|
@@ -406,6 +398,22 @@ module Apollo
|
|
406
398
|
end
|
407
399
|
end
|
408
400
|
|
401
|
+
def init_additional_crawlers(dirs)
|
402
|
+
# puts "Initializing aditional crawlers ..."
|
403
|
+
dirs.each do |dir|
|
404
|
+
if(@options[:verbose])
|
405
|
+
puts "Registering additional crawler dir '#{dir}'"
|
406
|
+
end
|
407
|
+
|
408
|
+
Dir.glob("#{dir}/*.rb").each do |f|
|
409
|
+
if(@options[:verbose])
|
410
|
+
puts "Registering crawler '#{f}'"
|
411
|
+
end
|
412
|
+
require f
|
413
|
+
end
|
414
|
+
end
|
415
|
+
end
|
416
|
+
|
409
417
|
# Init program
|
410
418
|
def init_program(args)
|
411
419
|
init_options()
|
@@ -80,6 +80,10 @@ module Apollo
|
|
80
80
|
self.options[:env] = name
|
81
81
|
end
|
82
82
|
|
83
|
+
opts.on('-d', '--daemon', 'Run Apollo Platform daemon') do
|
84
|
+
self.options[:daemon] = true
|
85
|
+
end
|
86
|
+
|
83
87
|
opts.on('-v', '--verbose', 'Enable verbose output') do
|
84
88
|
self.options[:verbose] = true
|
85
89
|
end
|
@@ -90,6 +94,40 @@ module Apollo
|
|
90
94
|
end
|
91
95
|
end
|
92
96
|
|
97
|
+
def enqueue_crawlers_urls(amqp, crawlers=Apollo::Crawler::BaseCrawler.subclasses, opts={})
|
98
|
+
crawlers.each do |crawler|
|
99
|
+
i = crawler.new
|
100
|
+
puts "Queuying Crawler base URL: '#{i.url}'" if opts[:verbose]
|
101
|
+
qu = Apollo::Model::QueuedUrl.new(:url => i.url, :state => :queued)
|
102
|
+
qu.save
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def init_fetchers(amqp, opts={})
|
107
|
+
fetchers = []
|
108
|
+
fetchers << Apollo::Agent::FetcherAgent.new(amqp, self.options)
|
109
|
+
|
110
|
+
enqueue_crawlers_urls(amqp, Apollo::Crawler::BaseCrawler.subclasses, opts)
|
111
|
+
|
112
|
+
# ch = self.amqp.create_channel
|
113
|
+
# x = ch.default_exchange
|
114
|
+
# x.publish("Hello!", :routing_key => "fetcher")
|
115
|
+
end
|
116
|
+
|
117
|
+
def init_agents(amqp, opts={})
|
118
|
+
puts "Initializing agents"
|
119
|
+
|
120
|
+
init_fetchers(amqp, opts)
|
121
|
+
end
|
122
|
+
|
123
|
+
def init_program(args)
|
124
|
+
res = super(args)
|
125
|
+
return res unless res.nil?
|
126
|
+
|
127
|
+
init_agents(self.amqp, self.options)
|
128
|
+
return nil
|
129
|
+
end
|
130
|
+
|
93
131
|
def process_options(args)
|
94
132
|
if(self.options[:version])
|
95
133
|
puts Apollo::VERSION
|
@@ -105,30 +143,21 @@ module Apollo
|
|
105
143
|
return nil
|
106
144
|
end
|
107
145
|
|
108
|
-
def init_mongo()
|
109
|
-
self.mongo = Mongo::Connection.new(self.config['mongo']['host'])
|
110
|
-
self.mongo_db = self.mongo.db(self.config['mongo']['db'])
|
111
|
-
|
112
|
-
if(self.options[:verbose])
|
113
|
-
puts "(Mongo) Connection Inited: #{self.mongo.inspect}"
|
114
|
-
puts "(Mongo) Database Inited: #{self.mongo_db.inspect}"
|
115
|
-
end
|
116
|
-
|
117
|
-
return self.mongo
|
118
|
-
end
|
119
|
-
|
120
146
|
# Run Program
|
121
147
|
def run(args = ARGV)
|
122
148
|
res = super(args)
|
123
149
|
return res unless res.nil?
|
124
|
-
|
125
|
-
# Print classes
|
126
|
-
# puts Apollo::Crawler::BaseCrawler.subclasses.inspect
|
127
150
|
|
128
151
|
# Here we start
|
129
|
-
if(ARGV.length < 1)
|
130
|
-
|
131
|
-
|
152
|
+
# if(ARGV.length < 1)
|
153
|
+
# puts optparser
|
154
|
+
# return 0
|
155
|
+
# end
|
156
|
+
|
157
|
+
res_code = 0
|
158
|
+
if(self.options[:daemon])
|
159
|
+
planner = Apollo::Planner::SmartPlanner.new(self.amqp, self.mongo)
|
160
|
+
res_code = planner.run(self.options)
|
132
161
|
end
|
133
162
|
|
134
163
|
return request_exit(res_code)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-03-
|
11
|
+
date: 2013-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -28,28 +28,28 @@ dependencies:
|
|
28
28
|
name: activesupport
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - '>='
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 3.2.12
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.12
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: dalli
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - '>='
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: 2.6.2
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - '>='
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: 2.6.2
|
55
55
|
- !ruby/object:Gem::Dependency
|
@@ -70,42 +70,42 @@ dependencies:
|
|
70
70
|
name: eventmachine
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - '>='
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - '>='
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: em-http-request
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - '>='
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - '>='
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: em-synchrony
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
|
-
- -
|
101
|
+
- - '>='
|
102
102
|
- !ruby/object:Gem::Version
|
103
103
|
version: '0'
|
104
104
|
type: :runtime
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
|
-
- -
|
108
|
+
- - '>='
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
- !ruby/object:Gem::Dependency
|
@@ -154,56 +154,56 @@ dependencies:
|
|
154
154
|
name: memcache-client
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
|
-
- -
|
157
|
+
- - '>='
|
158
158
|
- !ruby/object:Gem::Version
|
159
159
|
version: '0'
|
160
160
|
type: :runtime
|
161
161
|
prerelease: false
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- -
|
164
|
+
- - '>='
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
167
|
- !ruby/object:Gem::Dependency
|
168
168
|
name: mongo
|
169
169
|
requirement: !ruby/object:Gem::Requirement
|
170
170
|
requirements:
|
171
|
-
- -
|
171
|
+
- - '>='
|
172
172
|
- !ruby/object:Gem::Version
|
173
173
|
version: 1.8.2
|
174
174
|
type: :runtime
|
175
175
|
prerelease: false
|
176
176
|
version_requirements: !ruby/object:Gem::Requirement
|
177
177
|
requirements:
|
178
|
-
- -
|
178
|
+
- - '>='
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.8.2
|
181
181
|
- !ruby/object:Gem::Dependency
|
182
182
|
name: mongoid
|
183
183
|
requirement: !ruby/object:Gem::Requirement
|
184
184
|
requirements:
|
185
|
-
- -
|
185
|
+
- - '>='
|
186
186
|
- !ruby/object:Gem::Version
|
187
187
|
version: 3.1.2
|
188
188
|
type: :runtime
|
189
189
|
prerelease: false
|
190
190
|
version_requirements: !ruby/object:Gem::Requirement
|
191
191
|
requirements:
|
192
|
-
- -
|
192
|
+
- - '>='
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: 3.1.2
|
195
195
|
- !ruby/object:Gem::Dependency
|
196
196
|
name: mime-types
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
198
198
|
requirements:
|
199
|
-
- -
|
199
|
+
- - '>='
|
200
200
|
- !ruby/object:Gem::Version
|
201
201
|
version: '0'
|
202
202
|
type: :runtime
|
203
203
|
prerelease: false
|
204
204
|
version_requirements: !ruby/object:Gem::Requirement
|
205
205
|
requirements:
|
206
|
-
- -
|
206
|
+
- - '>='
|
207
207
|
- !ruby/object:Gem::Version
|
208
208
|
version: '0'
|
209
209
|
- !ruby/object:Gem::Dependency
|
@@ -224,70 +224,70 @@ dependencies:
|
|
224
224
|
name: openurl
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
|
-
- -
|
227
|
+
- - '>='
|
228
228
|
- !ruby/object:Gem::Version
|
229
229
|
version: 0.4.2
|
230
230
|
type: :runtime
|
231
231
|
prerelease: false
|
232
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
233
|
requirements:
|
234
|
-
- -
|
234
|
+
- - '>='
|
235
235
|
- !ruby/object:Gem::Version
|
236
236
|
version: 0.4.2
|
237
237
|
- !ruby/object:Gem::Dependency
|
238
238
|
name: parallel
|
239
239
|
requirement: !ruby/object:Gem::Requirement
|
240
240
|
requirements:
|
241
|
-
- -
|
241
|
+
- - '>='
|
242
242
|
- !ruby/object:Gem::Version
|
243
243
|
version: 0.6.2
|
244
244
|
type: :runtime
|
245
245
|
prerelease: false
|
246
246
|
version_requirements: !ruby/object:Gem::Requirement
|
247
247
|
requirements:
|
248
|
-
- -
|
248
|
+
- - '>='
|
249
249
|
- !ruby/object:Gem::Version
|
250
250
|
version: 0.6.2
|
251
251
|
- !ruby/object:Gem::Dependency
|
252
252
|
name: rack
|
253
253
|
requirement: !ruby/object:Gem::Requirement
|
254
254
|
requirements:
|
255
|
-
- -
|
255
|
+
- - '>='
|
256
256
|
- !ruby/object:Gem::Version
|
257
257
|
version: 1.5.2
|
258
258
|
type: :runtime
|
259
259
|
prerelease: false
|
260
260
|
version_requirements: !ruby/object:Gem::Requirement
|
261
261
|
requirements:
|
262
|
-
- -
|
262
|
+
- - '>='
|
263
263
|
- !ruby/object:Gem::Version
|
264
264
|
version: 1.5.2
|
265
265
|
- !ruby/object:Gem::Dependency
|
266
266
|
name: right_aws
|
267
267
|
requirement: !ruby/object:Gem::Requirement
|
268
268
|
requirements:
|
269
|
-
- -
|
269
|
+
- - '>='
|
270
270
|
- !ruby/object:Gem::Version
|
271
271
|
version: 3.0.5
|
272
272
|
type: :runtime
|
273
273
|
prerelease: false
|
274
274
|
version_requirements: !ruby/object:Gem::Requirement
|
275
275
|
requirements:
|
276
|
-
- -
|
276
|
+
- - '>='
|
277
277
|
- !ruby/object:Gem::Version
|
278
278
|
version: 3.0.5
|
279
279
|
- !ruby/object:Gem::Dependency
|
280
280
|
name: right_http_connection
|
281
281
|
requirement: !ruby/object:Gem::Requirement
|
282
282
|
requirements:
|
283
|
-
- -
|
283
|
+
- - '>='
|
284
284
|
- !ruby/object:Gem::Version
|
285
285
|
version: 1.3.0
|
286
286
|
type: :runtime
|
287
287
|
prerelease: false
|
288
288
|
version_requirements: !ruby/object:Gem::Requirement
|
289
289
|
requirements:
|
290
|
-
- -
|
290
|
+
- - '>='
|
291
291
|
- !ruby/object:Gem::Version
|
292
292
|
version: 1.3.0
|
293
293
|
- !ruby/object:Gem::Dependency
|
@@ -336,146 +336,155 @@ dependencies:
|
|
336
336
|
name: ffi
|
337
337
|
requirement: !ruby/object:Gem::Requirement
|
338
338
|
requirements:
|
339
|
-
- -
|
339
|
+
- - '>='
|
340
340
|
- !ruby/object:Gem::Version
|
341
341
|
version: 1.4.0
|
342
342
|
type: :runtime
|
343
343
|
prerelease: false
|
344
344
|
version_requirements: !ruby/object:Gem::Requirement
|
345
345
|
requirements:
|
346
|
-
- -
|
346
|
+
- - '>='
|
347
347
|
- !ruby/object:Gem::Version
|
348
348
|
version: 1.4.0
|
349
349
|
- !ruby/object:Gem::Dependency
|
350
350
|
name: guard
|
351
351
|
requirement: !ruby/object:Gem::Requirement
|
352
352
|
requirements:
|
353
|
-
- -
|
353
|
+
- - '>='
|
354
354
|
- !ruby/object:Gem::Version
|
355
355
|
version: 1.6.2
|
356
356
|
type: :development
|
357
357
|
prerelease: false
|
358
358
|
version_requirements: !ruby/object:Gem::Requirement
|
359
359
|
requirements:
|
360
|
-
- -
|
360
|
+
- - '>='
|
361
361
|
- !ruby/object:Gem::Version
|
362
362
|
version: 1.6.2
|
363
363
|
- !ruby/object:Gem::Dependency
|
364
364
|
name: guard-rake
|
365
365
|
requirement: !ruby/object:Gem::Requirement
|
366
366
|
requirements:
|
367
|
-
- -
|
367
|
+
- - '>='
|
368
368
|
- !ruby/object:Gem::Version
|
369
369
|
version: 0.0.7
|
370
370
|
type: :development
|
371
371
|
prerelease: false
|
372
372
|
version_requirements: !ruby/object:Gem::Requirement
|
373
373
|
requirements:
|
374
|
-
- -
|
374
|
+
- - '>='
|
375
375
|
- !ruby/object:Gem::Version
|
376
376
|
version: 0.0.7
|
377
377
|
- !ruby/object:Gem::Dependency
|
378
378
|
name: guard-rspec
|
379
379
|
requirement: !ruby/object:Gem::Requirement
|
380
380
|
requirements:
|
381
|
-
- -
|
381
|
+
- - '>='
|
382
382
|
- !ruby/object:Gem::Version
|
383
383
|
version: 2.5.0
|
384
384
|
type: :development
|
385
385
|
prerelease: false
|
386
386
|
version_requirements: !ruby/object:Gem::Requirement
|
387
387
|
requirements:
|
388
|
-
- -
|
388
|
+
- - '>='
|
389
389
|
- !ruby/object:Gem::Version
|
390
390
|
version: 2.5.0
|
391
391
|
description: Gem for crawling data from external sources
|
392
392
|
email: korczis@gmail.com
|
393
393
|
executables:
|
394
|
+
- apollo-console
|
394
395
|
- apollo-crawler
|
395
396
|
- apollo-platform
|
396
397
|
extensions: []
|
397
398
|
extra_rdoc_files: []
|
398
399
|
files:
|
399
|
-
- ./config/
|
400
|
-
- ./config/mongo.yml
|
401
|
-
- ./config/memcached.yml
|
402
|
-
- ./config/mongoid.yml
|
403
|
-
- ./config/apollo.yml.default
|
400
|
+
- ./config/amqp.yml
|
404
401
|
- ./config/amqp.yml.default
|
402
|
+
- ./config/apollo.yml
|
403
|
+
- ./config/apollo.yml.default
|
404
|
+
- ./config/deploy.rb
|
405
|
+
- ./config/memcached.yml
|
405
406
|
- ./config/memcached.yml.default
|
406
|
-
- ./config/
|
407
|
+
- ./config/mongo.yml
|
408
|
+
- ./config/mongo.yml.default
|
409
|
+
- ./config/mongoid.yml
|
407
410
|
- ./config/mongoid.yml.default
|
408
|
-
- ./
|
409
|
-
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
410
|
-
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
411
|
-
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
412
|
-
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
413
|
-
- ./lib/apollo_crawler/planner/base_planner.rb
|
414
|
-
- ./lib/apollo_crawler/planner/planners.rb
|
415
|
-
- ./lib/apollo_crawler/planner/smart_planner.rb
|
416
|
-
- ./lib/apollo_crawler/lib.rb
|
417
|
-
- ./lib/apollo_crawler/version.rb
|
418
|
-
- ./lib/apollo_crawler/program/console_program.rb
|
419
|
-
- ./lib/apollo_crawler/program/platform_program.rb
|
420
|
-
- ./lib/apollo_crawler/program/crawler_program.rb
|
421
|
-
- ./lib/apollo_crawler/program/base_program.rb
|
422
|
-
- ./lib/apollo_crawler/program/programs.rb
|
423
|
-
- ./lib/apollo_crawler/logger/console_logger.rb
|
424
|
-
- ./lib/apollo_crawler/logger/base_logger.rb
|
425
|
-
- ./lib/apollo_crawler/logger/loggers.rb
|
426
|
-
- ./lib/apollo_crawler/helper/core_helper.rb
|
427
|
-
- ./lib/apollo_crawler/helper/amqp_helper.rb
|
428
|
-
- ./lib/apollo_crawler/helper/helpers.rb
|
429
|
-
- ./lib/apollo_crawler/helper/mongo_helper.rb
|
411
|
+
- ./lib/apollo_crawler.rb
|
430
412
|
- ./lib/apollo_crawler/adapter/adapters.rb
|
431
|
-
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
432
413
|
- ./lib/apollo_crawler/adapter/amqp_adapter.rb
|
433
|
-
- ./lib/apollo_crawler/
|
434
|
-
- ./lib/apollo_crawler/
|
414
|
+
- ./lib/apollo_crawler/adapter/mongo_adapter.rb
|
415
|
+
- ./lib/apollo_crawler/agent/agents.rb
|
416
|
+
- ./lib/apollo_crawler/agent/base_agent.rb
|
417
|
+
- ./lib/apollo_crawler/agent/fetcher_agent.rb
|
418
|
+
- ./lib/apollo_crawler/cache/base_cache.rb
|
419
|
+
- ./lib/apollo_crawler/cache/caches.rb
|
435
420
|
- ./lib/apollo_crawler/cache/factory.rb
|
436
|
-
- ./lib/apollo_crawler/cache/
|
421
|
+
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
437
422
|
- ./lib/apollo_crawler/cache/memory_cache.rb
|
438
|
-
- ./lib/apollo_crawler/cache/base_cache.rb
|
439
423
|
- ./lib/apollo_crawler/cache/mongo_cache.rb
|
440
|
-
- ./lib/apollo_crawler/cache/
|
441
|
-
- ./lib/apollo_crawler/cache/
|
442
|
-
- ./lib/apollo_crawler/
|
443
|
-
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
444
|
-
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
445
|
-
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
446
|
-
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
424
|
+
- ./lib/apollo_crawler/cache/null_cache.rb
|
425
|
+
- ./lib/apollo_crawler/cache/sqlite_cache.rb
|
426
|
+
- ./lib/apollo_crawler/config.rb
|
447
427
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|
448
428
|
- ./lib/apollo_crawler/crawler/crawlers.rb
|
429
|
+
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
430
|
+
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
431
|
+
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
449
432
|
- ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
|
433
|
+
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
434
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
450
435
|
- ./lib/apollo_crawler/env.rb
|
451
|
-
- ./lib/apollo_crawler/
|
452
|
-
- ./lib/apollo_crawler/
|
453
|
-
- ./lib/apollo_crawler/
|
436
|
+
- ./lib/apollo_crawler/fetcher/base_fetcher.rb
|
437
|
+
- ./lib/apollo_crawler/fetcher/fetchers.rb
|
438
|
+
- ./lib/apollo_crawler/fetcher/simple_fetcher.rb
|
439
|
+
- ./lib/apollo_crawler/fetcher/smart_fetcher.rb
|
454
440
|
- ./lib/apollo_crawler/formatter/base_formatter.rb
|
441
|
+
- ./lib/apollo_crawler/formatter/formatters.rb
|
455
442
|
- ./lib/apollo_crawler/formatter/json_formatter.rb
|
456
443
|
- ./lib/apollo_crawler/formatter/plain_formatter.rb
|
457
|
-
- ./lib/apollo_crawler/formatter/
|
458
|
-
- ./lib/apollo_crawler/
|
444
|
+
- ./lib/apollo_crawler/formatter/table_formatter.rb
|
445
|
+
- ./lib/apollo_crawler/helper/amqp_helper.rb
|
446
|
+
- ./lib/apollo_crawler/helper/core_helper.rb
|
447
|
+
- ./lib/apollo_crawler/helper/helpers.rb
|
448
|
+
- ./lib/apollo_crawler/helper/mongo_helper.rb
|
449
|
+
- ./lib/apollo_crawler/lib.rb
|
450
|
+
- ./lib/apollo_crawler/logger/base_logger.rb
|
451
|
+
- ./lib/apollo_crawler/logger/console_logger.rb
|
452
|
+
- ./lib/apollo_crawler/logger/loggers.rb
|
453
|
+
- ./lib/apollo_crawler/model/base_model.rb
|
454
|
+
- ./lib/apollo_crawler/model/crawler.rb
|
455
|
+
- ./lib/apollo_crawler/model/models.rb
|
456
|
+
- ./lib/apollo_crawler/model/queued_url.rb
|
457
|
+
- ./lib/apollo_crawler/model/raw_document.rb
|
458
|
+
- ./lib/apollo_crawler/planner/base_planner.rb
|
459
|
+
- ./lib/apollo_crawler/planner/planners.rb
|
460
|
+
- ./lib/apollo_crawler/planner/smart_planner.rb
|
461
|
+
- ./lib/apollo_crawler/program/base_program.rb
|
462
|
+
- ./lib/apollo_crawler/program/console_program.rb
|
463
|
+
- ./lib/apollo_crawler/program/crawler_program.rb
|
464
|
+
- ./lib/apollo_crawler/program/platform_program.rb
|
465
|
+
- ./lib/apollo_crawler/program/programs.rb
|
459
466
|
- ./lib/apollo_crawler/store/base_store.rb
|
460
|
-
- ./lib/apollo_crawler.rb
|
467
|
+
- ./lib/apollo_crawler/store/stores.rb
|
468
|
+
- ./lib/apollo_crawler/version.rb
|
469
|
+
- bin/apollo-console
|
461
470
|
- bin/apollo-crawler
|
462
471
|
- bin/apollo-platform
|
463
472
|
homepage: http://apollocrawler.com/
|
464
473
|
licenses:
|
465
474
|
- MIT
|
466
475
|
metadata: {}
|
467
|
-
post_install_message:
|
476
|
+
post_install_message: Thank you for installing Apollo Crawler!
|
468
477
|
rdoc_options: []
|
469
478
|
require_paths:
|
470
479
|
- lib
|
471
480
|
required_ruby_version: !ruby/object:Gem::Requirement
|
472
481
|
requirements:
|
473
|
-
- -
|
482
|
+
- - '>='
|
474
483
|
- !ruby/object:Gem::Version
|
475
484
|
version: 1.9.3
|
476
485
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
477
486
|
requirements:
|
478
|
-
- -
|
487
|
+
- - '>='
|
479
488
|
- !ruby/object:Gem::Version
|
480
489
|
version: 1.8.11
|
481
490
|
requirements: []
|