RubyGems - apollo-crawler - Versions diffs - 0.1.26 → 0.1.27 - Mend

apollo-crawler 0.1.26 → 0.1.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/bin/apollo-aws +187 -0
data/config/mongoid.yml +7 -10
data/lib/apollo_crawler/agent/crawler_agent.rb +8 -6
data/lib/apollo_crawler/agent/domainer_agent.rb +1 -1
data/lib/apollo_crawler/agent/exchanges.rb +23 -8
data/lib/apollo_crawler/agent/fetcher_agent.rb +53 -31
data/lib/apollo_crawler/crawler/crawlers.rb +1 -0
data/lib/apollo_crawler/crawler/spider_crawler.rb +52 -0
data/lib/apollo_crawler/helper/amqp_helper.rb +1 -0
data/lib/apollo_crawler/helper/mongo_helper.rb +32 -0
data/lib/apollo_crawler/model/domain.rb +2 -1
data/lib/apollo_crawler/planner/smart_planner.rb +22 -11
data/lib/apollo_crawler/program/platform_program.rb +9 -7
data/lib/apollo_crawler/scheduler/base_scheduler.rb +6 -8
data/lib/apollo_crawler/version.rb +1 -1
metadata +4 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
-  data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
+  metadata.gz: d6e559b9a330a556c3fb9eed416e0e83bdeb5d04
+  data.tar.gz: 289cf132fbd702c2c81d9517f95460aed5a629ee
 SHA512:
-  metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
-  data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
+  metadata.gz: c150d48952a2061a2db796046016d0dc8cf0d3c858a41655995f3dea3b712cbc9470db75461a5598541313ea1de5a1325152579235a17f3c10911745eec7c532
+  data.tar.gz: 40bca94e6c247dcebd06fff4669dce6631a0c8096a3f26b4a3d68da7d03f1c1b9d0388fa067d46907291e64ec49d2ccd4f9c33fef3076c271816172f20d387bb

data/bin/apollo-aws ADDED

@@ -0,0 +1,187 @@
+#! /usr/bin/env ruby
+# encoding: utf-8
+# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+require "rubygems"
+require "bundler/setup"
+require 'fog'
+require 'fileutils'
+require 'yaml'
+module Apollo
+	class AwsConsole
+		AWS_CONFIG_FILE = File.expand_path("~/.apollo/config/apollo-aws.yml")
+		AWS_DEFAULT_CONFIG = {
+			:provider => "aws",
+			:aws_access_key_id => "",
+			:aws_secret_access_key => "",
+			:aws_default_instance => "i-a4039bee",
+			:region => "eu-west-1"
+		}
+		attr_accessor :connection
+		attr_accessor :config
+		def initialize
+			self.connection = nil
+			self.config = nil
+		end
+		def config_init(path)
+			self.config = AWS_DEFAULT_CONFIG
+			AWS_DEFAULT_CONFIG.each do |k, v|
+				print "Override '#{k}' (default: '#{v}') ? > "
+				val = STDIN.gets.chomp!
+				if val.empty?
+					self.config[k] = v
+				else
+					self.config[k] = val
+				end
+			end
+			config_save(AWS_CONFIG_FILE, self.config)
+		end
+		def config_save(path, config)
+			dir = File.dirname(path)
+			if(Dir.exists?(dir) == false)
+				FileUtils.mkpath(dir)
+			end
+			File.open(path, 'w+') do |f|
+				f.write(config.to_yaml)
+			end
+			return config
+		end
+		def config_load(path)
+			self.config = YAML.load_file(path)
+		end
+		def connect()
+			return self.connection if self.connection
+			aws_provider_keys = [:provider, :aws_access_key_id, :aws_secret_access_key, :region]
+			config = self.config.reject { |key, value| !aws_provider_keys.include?(key) }
+			begin
+				self.connection = Fog::Compute.new(config)
+			rescue Exception => e
+				puts "Unable to connect to AWS, reason: #{e.to_s}"
+				return -1
+			end
+			return self.connection
+		end
+		def get_instance(instance_id)
+			puts "Getting instance '#{instance_id}'"
+			self.connection.servers.get(instance_id)
+		end
+		def run_cmd(cmd)
+			if cmd == "help"
+				puts "Supported commands - init, info, interactive, list, start, stop"
+				return 0
+			end
+			if self.config.nil?
+				if(File.exists?(AWS_CONFIG_FILE) == false)
+					self.config = config_init(AWS_CONFIG_FILE)
+				else
+					self.config = config_load(AWS_CONFIG_FILE)
+				end
+			end
+			case cmd
+			when "info"
+				instance = self.config[:aws_default_instance]
+				puts "Inspecting instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				puts server.inspect
+				return 0
+			when "init"
+				config_init(AWS_CONFIG_FILE)
+				return 0
+			when "list"
+				connect()
+				instance_list = connection.servers.all
+				instance_list.table([:id, :flavor_id, :public_ip_address, :private_ip_address, :image_id ])
+				return 0
+			when "start"
+				instance = self.config[:aws_default_instance]
+				puts "Starting instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				res = server.start
+				puts " => Success: #{res}"
+				puts " => DNS: #{server.dns_name}"
+				return 0
+			when "stop"
+				instance = self.config[:aws_default_instance]
+				puts "Stopping instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				res = server.stop
+				puts " => Success: #{res}"
+				puts " => DNS: #{server.dns_name}"
+				return 0
+			end
+			puts "Unknown command '#{cmd}'"
+			return -1
+		end
+		def run()
+			cmd = ARGV.length > 0 ? ARGV[0] : "help"
+			if cmd == "interactive"
+				print "> "
+				while cmd = STDIN.gets.chomp!
+					break if cmd.downcase == "quit"
+					res = run_cmd(cmd)
+					print "> "
+				end
+				return 0
+			end
+			return run_cmd(cmd)
+		end
+	end
+end
+if __FILE__ == $0
+	Apollo::AwsConsole::new.run()
+end

data/config/mongoid.yml CHANGED

@@ -1,26 +1,23 @@
-default: &default_options
+default:
   sessions:
     default:
       hosts:
         - apollo-crawler.no-ip.org:27017
 development:
   sessions:
     default:
       hosts:
         - localhost:27017
       database: apollo-crawler-development
 test:
   sessions:
     default:
-    hosts:
-      - apollo-crawler.no-ip.org:27017
-    database: apollo-crawler-test
+      hosts:
+        - apollo-crawler.no-ip.org:27017
+      database: apollo-crawler-test
 production:
   sessions:
     default:
-    hosts:
-      - apollo-crawler.no-ip.org:27017
-    database: apollo-crawler-production
+      hosts:
+        - apollo-crawler.no-ip.org:27017
+      database: apollo-crawler-production

data/lib/apollo_crawler/agent/crawler_agent.rb CHANGED

@@ -37,18 +37,20 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)# Binding
 				# Binding
-				declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
-					puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
+				queue = self.declarations[:queues]["crawler.queue"]
+				exchange = self.declarations[:exchanges]["crawler"]
+				queue.bind(exchange).subscribe do |delivery_info, metadata, payload|
 					msg = JSON.parse(payload)
 					request = msg['request']
 					response = msg['response']
+					url = request["url"]
-					# puts "PLANEEEER: #{msg.inspect}"
+					puts "CrawlerAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
 					doc = Nokogiri::HTML(response['body'])
 					crawler = request['crawler_name'].constantize.new
@@ -59,7 +61,7 @@ module Apollo
 					# puts res.inspect
 					if(metadata[:reply_to] != nil)
-						x = declarations[:exchanges][metadata[:reply_to]]
+						x = self.declarations[:exchanges][metadata[:reply_to]]
 						msg = {
 							:request => request,

data/lib/apollo_crawler/agent/domainer_agent.rb CHANGED

@@ -37,7 +37,7 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				declarations = Apollo::Agent.declare_entities(channel, opts)
 				# Binding
 				declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|

data/lib/apollo_crawler/agent/exchanges.rb CHANGED

@@ -20,6 +20,23 @@
 module Apollo
 	module Agent
+		def self.declare_queues(channel, opts={})
+			if(opts[:verbose])
+				puts "Declaring AMQP Queues"
+			end
+			# Queues
+			queues = {}
+			queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
+			queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
+			queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
+			queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
+			queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
+			queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
+			return queues
+		end
 		def self.declare_exchanges(channel, opts={})
 			if(opts[:verbose])
 				puts "Declaring AMQP Exchanges"
@@ -34,14 +51,12 @@ module Apollo
 			exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
 			exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
-			# Queues
-			queues = {}
-			queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
-			queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
-			queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
-			queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
-			queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
-			queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
+			return exchanges
+		end
+		def self.declare_entities(channel, opts={})
+			exchanges = self.declare_exchanges(channel, opts)
+			queues = self.declare_queues(channel, opts)
 			# Compose res
 			res = {

data/lib/apollo_crawler/agent/fetcher_agent.rb CHANGED

@@ -23,17 +23,20 @@ require File.join(File.dirname(__FILE__), 'base_agent')
 require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
+require 'amqp'
+require 'amqp/extensions/rabbitmq'
 require 'digest/sha1'
-require 'thread/pool'
+require 'thread'
 module Apollo
 	module Agent
 		class FetcherAgent < BaseAgent
-			THREAD_POOL_SIZE = 10
+			THREAD_POOL_SIZE = 1
 			attr_accessor :fetcher
 			attr_accessor :declarations
-			attr_accessor :thread_pool
+			attr_accessor :mutex
 			def initialize(amqp, opts={})
 				self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -42,46 +45,54 @@ module Apollo
 					puts "Initializing fetcher agent..."
 				end
-				thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				channel.prefetch(THREAD_POOL_SIZE)
+				# Binding (Default)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)
+				queue = declarations[:queues]["fetcher.queue"]
+				# AMQP contexts for threads
+				contexts = []
+				(0...THREAD_POOL_SIZE).each do |i|
+					puts "FetcherAgent::initialize() - Creating context #{i}" if opts[:verbose]
+				end
-				# Binding
-				declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
-					thread_pool.process {
-						queued_url = JSON.parse(payload)
-						url = queued_url["url"]
+				# AMQP contexts mutex/lock
+				self.mutex = Mutex.new()
-						puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
+				exchange = self.declarations[:exchanges]["fetcher"]
-						doc = nil
-						begin
-							doc = Apollo::Fetcher::SmartFetcher::fetch(url)
-						rescue Exception => e
-							puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
-						end
+				queue.bind(exchange).subscribe(:ack => true) do |delivery_info, metadata, payload|
+					# There can be troubles with concurency, please see https://groups.google.com/forum/?fromgroups=#!topic/ruby-amqp/aO9GPu-jxuE
+					queued_url = JSON.parse(payload)
+					url = queued_url["url"]
-						doc = get_fetched_doc(queued_url, doc, metadata, opts)
-						# send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
+					puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
+					self.mutex.synchronize {
+						puts "FetcherAgent: Acking - '#{delivery_info.delivery_tag}'" if opts[:verbose]
+						channel.basic_ack(delivery_info.delivery_tag, true)
+					}
+					begin
+						doc = Apollo::Fetcher::SmartFetcher::fetch(url)
+						doc = process_fetched_doc(queued_url, doc, metadata, opts)
 						if(metadata && metadata[:reply_to])
-							exchange_name = metadata[:reply_to]
+							puts "Replying to '#{metadata[:reply_to]}'"
+							send_response_msg(metadata[:reply_to], queued_url, doc)
+						end
-							if(exchange_name != nil)
-								msg = get_response_msg(queued_url, doc)
+					rescue Exception => e
+						puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
+					end
-								x = declarations[:exchanges][exchange_name]
-								x.publish(msg.to_json)
-							end
-						end
-					}
+					doc
 				end
 			end
-			def get_fetched_doc(queued_url, doc, metadata, opts={})
+			def process_fetched_doc(queued_url, doc, metadata, opts={})
 				url = queued_url["url"]
 				res = Apollo::Model::RawDocument.new
@@ -94,12 +105,23 @@ module Apollo
 				return res
 			end
-			def get_response_msg(queued_url, doc)
+			def format_response_msg(queued_url, doc)
 				return {
 					:request => queued_url,
 					:response => doc
 				}
 			end
+			def send_response_msg(dest, queued_url, doc)
+				if(dest != nil)
+					msg = format_response_msg(queued_url, doc)
+					self.mutex.synchronize {
+						exchange = self.declarations[:exchanges][dest]
+						exchange.publish(msg.to_json)
+					}
+				end
+			end
 		end # class FetcherAgent
 	end # module Agent
 end # module Apollo

data/lib/apollo_crawler/crawler/crawlers.rb CHANGED

@@ -22,6 +22,7 @@ require File.join(File.dirname(__FILE__), 'base_crawler')
 require File.join(File.dirname(__FILE__), 'google_crawler')
 require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
 require File.join(File.dirname(__FILE__), 'slashdot_crawler')
+require File.join(File.dirname(__FILE__), 'spider_crawler')
 require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
 require File.join(File.dirname(__FILE__), 'xkcd_crawler')
 require File.join(File.dirname(__FILE__), 'youjizz_crawler')

data/lib/apollo_crawler/crawler/spider_crawler.rb ADDED

@@ -0,0 +1,52 @@
+# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+require File.join(File.dirname(__FILE__), 'base_crawler')
+module Apollo
+	module Crawler
+		class SpiderCrawler < BaseCrawler
+			def name()
+				return "Spider"
+			end
+			def url()
+				return "http://www.wikipedia.org/"
+			end
+			def extract_data(doc)
+				[]
+			end
+			def extract_links(doc)
+				res = doc.xpath("//a").map { |node|
+					url = BaseCrawler.try_get_url(self.url, node['href']).to_s
+					next if url.nil?
+					{
+						:link => url
+					}
+				}
+				return res.uniq
+			end
+		end # class SpiderCrawler
+	end # module Crawler
+end # module Apollo

data/lib/apollo_crawler/helper/amqp_helper.rb CHANGED

@@ -19,6 +19,7 @@
 # THE SOFTWARE.
 require 'amqp'
+require 'amqp/extensions/rabbitmq'
 require 'bunny'
 require 'thread'

data/lib/apollo_crawler/helper/mongo_helper.rb CHANGED

@@ -21,6 +21,8 @@
 require 'mongo'
 require 'mongoid'
+require 'csv'
 module Apollo
 	module Helper
 		module Mongo
@@ -37,6 +39,36 @@ module Apollo
 				return res
 			end
+			def self.csv_bulk_insert(path, model, bulk_size, validate=false, &block)
+				batch = []
+				CSV.foreach(path) do |row|
+					res = nil
+					if block_given?
+						res = yield row
+					end
+					if res.nil? == false
+						if(!validate || model.where(res).length == 0)
+							batch << res
+						end
+					end
+					if((batch.length % bulk_size) == 0)
+						# puts "Inserting batch '#{batch.inspect}'"
+						model.collection.insert(batch)
+						batch.clear
+					end
+				end
+				if batch.empty? == false
+					model.collection.insert(batch)
+					batch.clear
+				end
+			end
 		end # Mongo
 	end # module Helper
 end # module Apollo

data/lib/apollo_crawler/model/domain.rb CHANGED

@@ -29,9 +29,10 @@ module Apollo
 			store_in collection: "domains"
 			field :name
+			field :rank
 			# Indexes
-			index({ created_at: 1, updated_at: 1, name: 1 })
+			index({ created_at: 1, updated_at: 1, name: 1, rank: 1})
 		end # class Domain
 	end # module Model
 end # module Apollo

data/lib/apollo_crawler/planner/smart_planner.rb CHANGED

@@ -41,7 +41,8 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				# channel.prefetch(1)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)
 				# Bindings
 				declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
@@ -67,14 +68,17 @@ module Apollo
 						end
 					else
 						doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
+						if(doc.nil? == false)
+							puts "Same as #{doc.inspect}"
+						end
 					end
 					if(doc.nil?)
 						doc = Apollo::Model::RawDocument.new(response).save
-					end
-					# Publish
-					declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
+						# Publish
+						declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
+					end
 				end
 				declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
@@ -103,6 +107,10 @@ module Apollo
 				end
 			end
+			def get_url_count(state, opts={})
+				Apollo::Model::QueuedUrl.where({:state => state}).count
+			end
 			def fetch_url(url, opts={})
 				if(opts[:verbose])
 					puts "AMQP fetching '#{url.inspect}'"
@@ -113,19 +121,22 @@ module Apollo
 			end
 			def get_next_url(opts={})
-				Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
+				tmp = Apollo::Model::QueuedUrl.where({:state => :queued}).order_by(:created_at.asc)
+				tmp.find_and_modify({ "$set" => { state: :fetching }}, new: true)
 			end
 			def fetch_queued_urls(opts={})
-				url = get_next_url(opts)
+				fetching_count = Apollo::Model::QueuedUrl.where({:state => :fetching}).count
-				while url
-					puts url.inspect
-					# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
+				if(fetching_count > 4)
+					puts "Fetching too many URLs. Slowing down for a while ..."
+					return
+				end
+				while get_url_count(:fetching) < 4
+					url = get_next_url(opts)
+					puts "SmartPlanner::fetch_queued_urls() - Queueing: #{url.inspect}"
 					fetch_url(url, opts)
-					url = get_next_url()
 				end
 			end

data/lib/apollo_crawler/program/platform_program.rb CHANGED

@@ -138,14 +138,16 @@ module Apollo
 			end
 			Thread::new {
-				CSV.foreach(path) do |row|
+				Apollo::Helper::Mongo::csv_bulk_insert(path, Apollo::Model::Domain, 1000, false) do |row|
+					rank = row[0].to_i
 					name = row[1]
-					domain = Apollo::Model::Domain.where({:name => name}).first()
-					if(domain.nil?)
-						domain = Apollo::Model::Domain.new({:name => name})
-						domain.save
-						print "."
-					end
+					res = {
+						:rank => rank,
+						:name => name
+					}
+					res
 				end
 			}
 		end

data/lib/apollo_crawler/scheduler/base_scheduler.rb CHANGED

@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-require File.join(File.dirname(__FILE__), "../model/models")
+require File.join(File.dirname(__FILE__), "../model/models.rb")
 module Apollo
 	module Scheduler
@@ -26,13 +26,11 @@ module Apollo
 			def self.schedule(url, crawler=nil, opts={})
 				queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
-				if(queued_url.nil?)
-					qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
-					qu.save
-					return qu
-				end
-				return nil
+				return queued_url if queued_url
+				res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
+				res.save
+				return res
 			end
 		end # class BaseScheduler
 	end # module Scheduler

data/lib/apollo_crawler/version.rb CHANGED

@@ -19,5 +19,5 @@
 # THE SOFTWARE.
 module Apollo
-	VERSION = '0.1.26'
+	VERSION = '0.1.27'
 end # Apollo

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.1.26
+  version: 0.1.27
 platform: ruby
 authors:
 - Tomas Korcak
@@ -391,6 +391,7 @@ dependencies:
 description: Gem for crawling data from external sources
 email: korczis@gmail.com
 executables:
+- apollo-aws
 - apollo-console
 - apollo-crawler
 - apollo-platform
@@ -432,6 +433,7 @@ files:
 - ./lib/apollo_crawler/crawler/google_crawler.rb
 - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
 - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
+- ./lib/apollo_crawler/crawler/spider_crawler.rb
 - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
 - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
 - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
@@ -473,6 +475,7 @@ files:
 - ./lib/apollo_crawler/store/base_store.rb
 - ./lib/apollo_crawler/store/stores.rb
 - ./lib/apollo_crawler/version.rb
+- bin/apollo-aws
 - bin/apollo-console
 - bin/apollo-crawler
 - bin/apollo-platform