RubyGems - apollo-crawler - Versions diffs - 0.1.26 → 0.1.27 - Mend

apollo-crawler 0.1.26 → 0.1.27

Files changed (17)

checksums.yaml +4 -4
data/bin/apollo-aws +187 -0
data/config/mongoid.yml +7 -10
data/lib/apollo_crawler/agent/crawler_agent.rb +8 -6
data/lib/apollo_crawler/agent/domainer_agent.rb +1 -1
data/lib/apollo_crawler/agent/exchanges.rb +23 -8
data/lib/apollo_crawler/agent/fetcher_agent.rb +53 -31
data/lib/apollo_crawler/crawler/crawlers.rb +1 -0
data/lib/apollo_crawler/crawler/spider_crawler.rb +52 -0
data/lib/apollo_crawler/helper/amqp_helper.rb +1 -0
data/lib/apollo_crawler/helper/mongo_helper.rb +32 -0
data/lib/apollo_crawler/model/domain.rb +2 -1
data/lib/apollo_crawler/planner/smart_planner.rb +22 -11
data/lib/apollo_crawler/program/platform_program.rb +9 -7
data/lib/apollo_crawler/scheduler/base_scheduler.rb +6 -8
data/lib/apollo_crawler/version.rb +1 -1
metadata +4 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
-  data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
+  metadata.gz: d6e559b9a330a556c3fb9eed416e0e83bdeb5d04
+  data.tar.gz: 289cf132fbd702c2c81d9517f95460aed5a629ee
 SHA512:
-  metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
-  data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
+  metadata.gz: c150d48952a2061a2db796046016d0dc8cf0d3c858a41655995f3dea3b712cbc9470db75461a5598541313ea1de5a1325152579235a17f3c10911745eec7c532
+  data.tar.gz: 40bca94e6c247dcebd06fff4669dce6631a0c8096a3f26b4a3d68da7d03f1c1b9d0388fa067d46907291e64ec49d2ccd4f9c33fef3076c271816172f20d387bb

data/bin/apollo-aws ADDED

@@ -0,0 +1,187 @@
+#! /usr/bin/env ruby
+# encoding: utf-8
+# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+require "rubygems"
+require "bundler/setup"
+require 'fog'
+require 'fileutils'
+require 'yaml'
+module Apollo
+	class AwsConsole
+		AWS_CONFIG_FILE = File.expand_path("~/.apollo/config/apollo-aws.yml")
+		AWS_DEFAULT_CONFIG = {
+			:provider => "aws",
+			:aws_access_key_id => "",
+			:aws_secret_access_key => "",
+			:aws_default_instance => "i-a4039bee",
+			:region => "eu-west-1"
+		}
+		attr_accessor :connection
+		attr_accessor :config
+		def initialize
+			self.connection = nil
+			self.config = nil
+		end
+		def config_init(path)
+			self.config = AWS_DEFAULT_CONFIG
+			AWS_DEFAULT_CONFIG.each do |k, v|
+				print "Override '#{k}' (default: '#{v}') ? > "
+				val = STDIN.gets.chomp!
+				if val.empty?
+					self.config[k] = v
+				else
+					self.config[k] = val
+				end
+			end
+			config_save(AWS_CONFIG_FILE, self.config)
+		end
+		def config_save(path, config)
+			dir = File.dirname(path)
+			if(Dir.exists?(dir) == false)
+				FileUtils.mkpath(dir)
+			end
+			File.open(path, 'w+') do |f|
+				f.write(config.to_yaml)
+			end
+			return config
+		end
+		def config_load(path)
+			self.config = YAML.load_file(path)
+		end
+		def connect()
+			return self.connection if self.connection
+			aws_provider_keys = [:provider, :aws_access_key_id, :aws_secret_access_key, :region]
+			config = self.config.reject { |key, value| !aws_provider_keys.include?(key) }
+			begin
+				self.connection = Fog::Compute.new(config)
+			rescue Exception => e
+				puts "Unable to connect to AWS, reason: #{e.to_s}"
+				return -1
+			end
+			return self.connection
+		end
+		def get_instance(instance_id)
+			puts "Getting instance '#{instance_id}'"
+			self.connection.servers.get(instance_id)
+		end
+		def run_cmd(cmd)
+			if cmd == "help"
+				puts "Supported commands - init, info, interactive, list, start, stop"
+				return 0
+			end
+			if self.config.nil?
+				if(File.exists?(AWS_CONFIG_FILE) == false)
+					self.config = config_init(AWS_CONFIG_FILE)
+				else
+					self.config = config_load(AWS_CONFIG_FILE)
+				end
+			end
+			case cmd
+			when "info"
+				instance = self.config[:aws_default_instance]
+				puts "Inspecting instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				puts server.inspect
+				return 0
+			when "init"
+				config_init(AWS_CONFIG_FILE)
+				return 0
+			when "list"
+				connect()
+				instance_list = connection.servers.all
+				instance_list.table([:id, :flavor_id, :public_ip_address, :private_ip_address, :image_id ])
+				return 0
+			when "start"
+				instance = self.config[:aws_default_instance]
+				puts "Starting instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				res = server.start
+				puts " => Success: #{res}"
+				puts " => DNS: #{server.dns_name}"
+				return 0
+			when "stop"
+				instance = self.config[:aws_default_instance]
+				puts "Stopping instance '#{instance}'"
+				connect()
+				server = get_instance(instance)
+				res = server.stop
+				puts " => Success: #{res}"
+				puts " => DNS: #{server.dns_name}"
+				return 0
+			end
+			puts "Unknown command '#{cmd}'"
+			return -1
+		end
+		def run()
+			cmd = ARGV.length > 0 ? ARGV[0] : "help"
+			if cmd == "interactive"
+				print "> "
+				while cmd = STDIN.gets.chomp!
+					break if cmd.downcase == "quit"
+					res = run_cmd(cmd)
+					print "> "
+				end
+				return 0
+			end
+			return run_cmd(cmd)
+		end
+	end
+end
+# Run the console only when this file is the program being executed.
+Apollo::AwsConsole.new.run if $0 == __FILE__

data/config/mongoid.yml CHANGED

@@ -1,26 +1,23 @@
-default: &default_options
+default:
   sessions:
     default:
       hosts:
         - apollo-crawler.no-ip.org:27017
 development:
   sessions:
     default:
       hosts:
         - localhost:27017
       database: apollo-crawler-development
 test:
   sessions:
     default:
-    hosts:
-      - apollo-crawler.no-ip.org:27017
-    database: apollo-crawler-test
+      hosts:
+        - apollo-crawler.no-ip.org:27017
+      database: apollo-crawler-test
 production:
   sessions:
     default:
-    hosts:
-      - apollo-crawler.no-ip.org:27017
-    database: apollo-crawler-production
+      hosts:
+        - apollo-crawler.no-ip.org:27017
+      database: apollo-crawler-production

data/lib/apollo_crawler/agent/crawler_agent.rb CHANGED

@@ -37,18 +37,20 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)# Binding
 				# Binding
-				declarations[:queues]["crawler.queue"].bind(declarations[:exchanges]["crawler"]).subscribe do |delivery_info, metadata, payload|
-					puts "CrawlerAgent: Received, metadata #{metadata.inspect}" if opts[:verbose]
+				queue = self.declarations[:queues]["crawler.queue"]
+				exchange = self.declarations[:exchanges]["crawler"]
+				queue.bind(exchange).subscribe do |delivery_info, metadata, payload|
 					msg = JSON.parse(payload)
 					request = msg['request']
 					response = msg['response']
+					url = request["url"]
-					# puts "PLANEEEER: #{msg.inspect}"
+					puts "CrawlerAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
 					doc = Nokogiri::HTML(response['body'])
 					crawler = request['crawler_name'].constantize.new
@@ -59,7 +61,7 @@ module Apollo
 					# puts res.inspect
 					if(metadata[:reply_to] != nil)
-						x = declarations[:exchanges][metadata[:reply_to]]
+						x = self.declarations[:exchanges][metadata[:reply_to]]
 						msg = {
 							:request => request,

data/lib/apollo_crawler/agent/domainer_agent.rb CHANGED

@@ -37,7 +37,7 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				declarations = Apollo::Agent.declare_entities(channel, opts)
 				# Binding
 				declarations[:queues]["domainer.queue"].bind(declarations[:exchanges]["domainer"]).subscribe do |delivery_info, metadata, payload|

data/lib/apollo_crawler/agent/exchanges.rb CHANGED

@@ -20,6 +20,23 @@
 module Apollo
 	module Agent
+		def self.declare_queues(channel, opts={})
+			if(opts[:verbose])
+				puts "Declaring AMQP Queues"
+			end
+			# Queues
+			queues = {}
+			queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
+			queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
+			queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
+			queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
+			queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
+			queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
+			return queues
+		end
 		def self.declare_exchanges(channel, opts={})
 			if(opts[:verbose])
 				puts "Declaring AMQP Exchanges"
@@ -34,14 +51,12 @@ module Apollo
 			exchanges["planner.domained"] = channel.direct("planner.domained", :auto_delete => false, :durable => true)
 			exchanges["planner.fetched"] = channel.direct("planner.fetched", :auto_delete => false, :durable => true)
-			# Queues
-			queues = {}
-			queues["crawler.queue"] = channel.queue("crawler.queue", :auto_delete => false, :durable => true)
-			queues["domainer.queue"] = channel.queue("domainer.queue", :auto_delete => false, :durable => true)
-			queues["fetcher.queue"] = channel.queue("fetcher.queue", :auto_delete => false, :durable => true)
-			queues["planner.crawled.queue"] = channel.queue("planner.crawled.queue", :auto_delete => false, :durable => true)
-			queues["planner.domained.queue"] = channel.queue("planner.domained.queue", :auto_delete => false, :durable => true)
-			queues["planner.fetched.queue"] = channel.queue("planner.fetched.queue", :auto_delete => false, :durable => true)
+			return exchanges
+		end
+		def self.declare_entities(channel, opts={})
+			exchanges = self.declare_exchanges(channel, opts)
+			queues = self.declare_queues(channel, opts)
 			# Compose res
 			res = {

data/lib/apollo_crawler/agent/fetcher_agent.rb CHANGED

@@ -23,17 +23,20 @@ require File.join(File.dirname(__FILE__), 'base_agent')
 require File.join(File.dirname(__FILE__), '../fetcher/fetchers')
+require 'amqp'
+require 'amqp/extensions/rabbitmq'
 require 'digest/sha1'
-require 'thread/pool'
+require 'thread'
 module Apollo
 	module Agent
 		class FetcherAgent < BaseAgent
-			THREAD_POOL_SIZE = 10
+			THREAD_POOL_SIZE = 1
 			attr_accessor :fetcher
 			attr_accessor :declarations
-			attr_accessor :thread_pool
+			attr_accessor :mutex
 			def initialize(amqp, opts={})
 				self.fetcher = Apollo::Fetcher::SmartFetcher.new
@@ -42,46 +45,54 @@ module Apollo
 					puts "Initializing fetcher agent..."
 				end
-				thread_pool = Thread::Pool.new(THREAD_POOL_SIZE)
 				# Declarations
 				channel = amqp.create_channel
-				declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				channel.prefetch(THREAD_POOL_SIZE)
+				# Binding (Default)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)
+				queue = declarations[:queues]["fetcher.queue"]
+				# AMQP contexts for threads
+				contexts = []
+				(0...THREAD_POOL_SIZE).each do |i|
+					puts "FetcherAgent::initialize() - Creating context #{i}" if opts[:verbose]
+				end
-				# Binding
-				declarations[:queues]["fetcher.queue"].bind(declarations[:exchanges]["fetcher"]).subscribe do |delivery_info, metadata, payload|
-					thread_pool.process {
-						queued_url = JSON.parse(payload)
-						url = queued_url["url"]
+				# AMQP contexts mutex/lock
+				self.mutex = Mutex.new()
-						puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
+				exchange = self.declarations[:exchanges]["fetcher"]
-						doc = nil
-						begin
-							doc = Apollo::Fetcher::SmartFetcher::fetch(url)
-						rescue Exception => e
-							puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
-						end
+				queue.bind(exchange).subscribe(:ack => true) do |delivery_info, metadata, payload|
+					# There can be trouble with concurrency, please see https://groups.google.com/forum/?fromgroups=#!topic/ruby-amqp/aO9GPu-jxuE
+					queued_url = JSON.parse(payload)
+					url = queued_url["url"]
-						doc = get_fetched_doc(queued_url, doc, metadata, opts)
-						# send_response_msg(metadata[:reply_to], get_response_msg(queued_url, doc))
+					puts "FetcherAgent: Received - '#{url}', metadata #{metadata.inspect}" if opts[:verbose]
+					self.mutex.synchronize {
+						puts "FetcherAgent: Acking - '#{delivery_info.delivery_tag}'" if opts[:verbose]
+						channel.basic_ack(delivery_info.delivery_tag, true)
+					}
+					begin
+						doc = Apollo::Fetcher::SmartFetcher::fetch(url)
+						doc = process_fetched_doc(queued_url, doc, metadata, opts)
 						if(metadata && metadata[:reply_to])
-							exchange_name = metadata[:reply_to]
+							puts "Replying to '#{metadata[:reply_to]}'"
+							send_response_msg(metadata[:reply_to], queued_url, doc)
+						end
-							if(exchange_name != nil)
-								msg = get_response_msg(queued_url, doc)
+					rescue Exception => e
+						puts "EXCEPTION: FetcherAgent::initialize() - Unable to fetch '#{url}', reason: '#{e.to_s}'"
+					end
-								x = declarations[:exchanges][exchange_name]
-								x.publish(msg.to_json)
-							end
-						end
-					}
+					doc
 				end
 			end
-			def get_fetched_doc(queued_url, doc, metadata, opts={})
+			def process_fetched_doc(queued_url, doc, metadata, opts={})
 				url = queued_url["url"]
 				res = Apollo::Model::RawDocument.new
@@ -94,12 +105,23 @@ module Apollo
 				return res
 			end
-			def get_response_msg(queued_url, doc)
+			def format_response_msg(queued_url, doc)
 				return {
 					:request => queued_url,
 					:response => doc
 				}
 			end
+			def send_response_msg(dest, queued_url, doc)
+				if(dest != nil)
+					msg = format_response_msg(queued_url, doc)
+					self.mutex.synchronize {
+						exchange = self.declarations[:exchanges][dest]
+						exchange.publish(msg.to_json)
+					}
+				end
+			end
 		end # class FetcherAgent
 	end # module Agent
 end # module Apollo

data/lib/apollo_crawler/crawler/crawlers.rb CHANGED

@@ -22,6 +22,7 @@ require File.join(File.dirname(__FILE__), 'base_crawler')
 require File.join(File.dirname(__FILE__), 'google_crawler')
 require File.join(File.dirname(__FILE__), 'hacker_news_crawler')
 require File.join(File.dirname(__FILE__), 'slashdot_crawler')
+require File.join(File.dirname(__FILE__), 'spider_crawler')
 require File.join(File.dirname(__FILE__), 'stackoverflow_crawler')
 require File.join(File.dirname(__FILE__), 'xkcd_crawler')
 require File.join(File.dirname(__FILE__), 'youjizz_crawler')

data/lib/apollo_crawler/crawler/spider_crawler.rb ADDED

@@ -0,0 +1,52 @@
+# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+require File.join(File.dirname(__FILE__), 'base_crawler')
+module Apollo
+	module Crawler
+		class SpiderCrawler < BaseCrawler
+			def name()
+				return "Spider"
+			end
+			def url()
+				return "http://www.wikipedia.org/"
+			end
+			def extract_data(doc)
+				[]
+			end
+			def extract_links(doc)
+				res = doc.xpath("//a").map { |node|
+					url = BaseCrawler.try_get_url(self.url, node['href']).to_s
+					next if url.nil?
+					{
+						:link => url
+					}
+				}
+				return res.uniq
+			end
+		end # class SpiderCrawler
+	end # module Crawler
+end # module Apollo

data/lib/apollo_crawler/helper/amqp_helper.rb CHANGED

@@ -19,6 +19,7 @@
 # THE SOFTWARE.
 require 'amqp'
+require 'amqp/extensions/rabbitmq'
 require 'bunny'
 require 'thread'

data/lib/apollo_crawler/helper/mongo_helper.rb CHANGED

@@ -21,6 +21,8 @@
 require 'mongo'
 require 'mongoid'
+require 'csv'
 module Apollo
 	module Helper
 		module Mongo
@@ -37,6 +39,36 @@ module Apollo
 				return res
 			end
+			def self.csv_bulk_insert(path, model, bulk_size, validate=false, &block)
+				batch = []
+				CSV.foreach(path) do |row|
+					res = nil
+					if block_given?
+						res = yield row
+					end
+					if res.nil? == false
+						if(!validate || model.where(res).length == 0)
+							batch << res
+						end
+					end
+					if((batch.length % bulk_size) == 0)
+						# puts "Inserting batch '#{batch.inspect}'"
+						model.collection.insert(batch)
+						batch.clear
+					end
+				end
+				if batch.empty? == false
+					model.collection.insert(batch)
+					batch.clear
+				end
+			end
 		end # Mongo
 	end # module Helper
 end # module Apollo

data/lib/apollo_crawler/model/domain.rb CHANGED

@@ -29,9 +29,10 @@ module Apollo
 			store_in collection: "domains"
 			field :name
+			field :rank
 			# Indexes
-			index({ created_at: 1, updated_at: 1, name: 1 })
+			index({ created_at: 1, updated_at: 1, name: 1, rank: 1})
 		end # class Domain
 	end # module Model
 end # module Apollo

data/lib/apollo_crawler/planner/smart_planner.rb CHANGED

@@ -41,7 +41,8 @@ module Apollo
 				# Declarations
 				channel = amqp.create_channel
-				self.declarations = Apollo::Agent.declare_exchanges(channel, opts)
+				# channel.prefetch(1)
+				self.declarations = Apollo::Agent.declare_entities(channel, opts)
 				# Bindings
 				declarations[:queues]["planner.fetched.queue"].bind(declarations[:exchanges]["planner.fetched"]).subscribe do |delivery_info, metadata, payload|
@@ -67,14 +68,17 @@ module Apollo
 						end
 					else
 						doc = Apollo::Model::RawDocument.where(:sha_hash => response['sha_hash']).first
+						if(doc.nil? == false)
+							puts "Same as #{doc.inspect}"
+						end
 					end
 					if(doc.nil?)
 						doc = Apollo::Model::RawDocument.new(response).save
-					end
-					# Publish
-					declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
+						# Publish
+						declarations[:exchanges]["crawler"].publish(msg.to_json, :reply_to => "planner.crawled")
+					end
 				end
 				declarations[:queues]["planner.domained.queue"].bind(declarations[:exchanges]["planner.domained"]).subscribe do |delivery_info, metadata, payload|
@@ -103,6 +107,10 @@ module Apollo
 				end
 			end
+			def get_url_count(state, opts={})
+				Apollo::Model::QueuedUrl.where({:state => state}).count
+			end
 			def fetch_url(url, opts={})
 				if(opts[:verbose])
 					puts "AMQP fetching '#{url.inspect}'"
@@ -113,19 +121,22 @@ module Apollo
 			end
 			def get_next_url(opts={})
-				Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
+				tmp = Apollo::Model::QueuedUrl.where({:state => :queued}).order_by(:created_at.asc)
+				tmp.find_and_modify({ "$set" => { state: :fetching }}, new: true)
 			end
 			def fetch_queued_urls(opts={})
-				url = get_next_url(opts)
+				fetching_count = Apollo::Model::QueuedUrl.where({:state => :fetching}).count
-				while url
-					puts url.inspect
-					# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
+				if(fetching_count > 4)
+					puts "Fetching too many URLs. Slowing down for a while ..."
+					return
+				end
+				while get_url_count(:fetching) < 4
+					url = get_next_url(opts)
+					puts "SmartPlanner::fetch_queued_urls() - Queueing: #{url.inspect}"
 					fetch_url(url, opts)
-					url = get_next_url()
 				end
 			end

data/lib/apollo_crawler/program/platform_program.rb CHANGED

@@ -138,14 +138,16 @@ module Apollo
 			end
 			Thread::new {
-				CSV.foreach(path) do |row|
+				Apollo::Helper::Mongo::csv_bulk_insert(path, Apollo::Model::Domain, 1000, false) do |row|
+					rank = row[0].to_i
 					name = row[1]
-					domain = Apollo::Model::Domain.where({:name => name}).first()
-					if(domain.nil?)
-						domain = Apollo::Model::Domain.new({:name => name})
-						domain.save
-						print "."
-					end
+					res = {
+						:rank => rank,
+						:name => name
+					}
+					res
 				end
 			}
 		end

data/lib/apollo_crawler/scheduler/base_scheduler.rb CHANGED

@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-require File.join(File.dirname(__FILE__), "../model/models")
+require File.join(File.dirname(__FILE__), "../model/models.rb")
 module Apollo
 	module Scheduler
@@ -26,13 +26,11 @@ module Apollo
 			def self.schedule(url, crawler=nil, opts={})
 				queued_url = Apollo::Model::QueuedUrl.where(:url => url).first
-				if(queued_url.nil?)
-					qu = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
-					qu.save
-					return qu
-				end
-				return nil
+				return queued_url if queued_url
+				res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
+				res.save
+				return res
 			end
 		end # class BaseScheduler
 	end # module Scheduler

data/lib/apollo_crawler/version.rb CHANGED

@@ -19,5 +19,5 @@
 # THE SOFTWARE.
 module Apollo
-	VERSION = '0.1.26'
+	VERSION = '0.1.27'
 end # Apollo

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.1.26
+  version: 0.1.27
 platform: ruby
 authors:
 - Tomas Korcak
@@ -391,6 +391,7 @@ dependencies:
 description: Gem for crawling data from external sources
 email: korczis@gmail.com
 executables:
+- apollo-aws
 - apollo-console
 - apollo-crawler
 - apollo-platform
@@ -432,6 +433,7 @@ files:
 - ./lib/apollo_crawler/crawler/google_crawler.rb
 - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
 - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
+- ./lib/apollo_crawler/crawler/spider_crawler.rb
 - ./lib/apollo_crawler/crawler/stackoverflow_crawler.rb
 - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
 - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
@@ -473,6 +475,7 @@ files:
 - ./lib/apollo_crawler/store/base_store.rb
 - ./lib/apollo_crawler/store/stores.rb
 - ./lib/apollo_crawler/version.rb
+- bin/apollo-aws
 - bin/apollo-console
 - bin/apollo-crawler
 - bin/apollo-platform