RubyGems - apollo-crawler - Versions diffs - 0.1.5 → 0.1.6 - Mend

apollo-crawler 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

checksums.yaml +8 -8
data/bin/apollo-crawler +12 -410
data/lib/apollo_crawler.rb +31 -20
data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} +37 -34
data/lib/apollo_crawler/cache/factory.rb +35 -0
data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb +37 -34
data/lib/apollo_crawler/cache/memcached_cache.rb +51 -0
data/lib/apollo_crawler/{caches → cache}/memory_cache.rb +46 -43
data/lib/apollo_crawler/{caches → cache}/null_cache.rb +33 -30
data/lib/apollo_crawler/config.rb +53 -0
data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} +157 -155
data/lib/apollo_crawler/{crawler_template.rb → crawler/crawler_template.rb} +24 -24
data/lib/apollo_crawler/{crawlers → crawler}/google_com/google.rb +40 -40
data/lib/apollo_crawler/{crawlers → crawler}/slashdot_org/slashdot.rb +40 -40
data/lib/apollo_crawler/{crawlers → crawler}/stackoverflow_com/stackoverflow.rb +44 -44
data/lib/apollo_crawler/{crawlers → crawler}/xkcd_com/xkcd.rb +35 -35
data/lib/apollo_crawler/{crawlers → crawler}/ycombinator_com/hacker_news.rb +44 -44
data/lib/apollo_crawler/fetcher/fetcher_base.rb +6 -0
data/lib/apollo_crawler/fetcher/simple_fetcher.rb +8 -0
data/lib/apollo_crawler/formatter/formatter_base.rb +6 -0
data/lib/apollo_crawler/{formatters → formatter}/formatter_json.rb +17 -17
data/lib/apollo_crawler/{formatters → formatter}/formatter_plain.rb +17 -17
data/lib/apollo_crawler/{formatters → formatter}/formatter_table.rb +35 -35
data/lib/apollo_crawler/lib.rb +28 -0
data/lib/apollo_crawler/program.rb +406 -0
data/lib/apollo_crawler/store/store_base.rb +6 -0
data/lib/apollo_crawler/version.rb +2 -2
metadata +52 -17
data/lib/apollo_crawler/caches/factory.rb +0 -30
data/lib/apollo_crawler/formatter.rb +0 -6

data/lib/apollo_crawler/{cache.rb → cache/cache_base.rb} RENAMED

@@ -1,34 +1,37 @@
-module Apollo
-	module Caches
-		class Cache
-			# Get value associated with key from cache
-			def get(key, *args)
-				# Not found, Create, cache and return
-				res = yield args
-				return res
-			end
-			# Set value associated with key
-			# Return cached value
-			def set(key, value)
-				return value
-			end
-			# Check if cache contains specified key
-			def contains(key)
-				return false
-			end
-			# Invalidate key/value pair
-			def invalidate(key)
-				return true
-			end
-			# Clear cache
-			def clear
-				return
-			end
-		end # Cache
-	end # Caches
-end # Apollo
+module Apollo
+	module Cache
+		class CacheBase # Cache interface; this base version never stores anything
+			# Get value for key - the base class ignores key and treats every call as a miss
+			def get(key, *args)
+				# Always "not found": yield the extra args (as one array) to the block, if any
+				if block_given?
+					res = yield args
+				end
+				return res # the block's result, or nil when no block was given
+			end
+			# Set value associated with key - nothing is stored here
+			# Returns the value that was passed in
+			def set(key, value)
+				return value
+			end
+			# Check if cache contains specified key - always false for the base class
+			def contains(key)
+				return false
+			end
+			# Invalidate key/value pair - nothing to remove, always returns true
+			def invalidate(key)
+				return true
+			end
+			# Clear cache - nothing to clear, returns nil
+			def clear
+				return
+			end
+		end # CacheBase
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/cache/factory.rb ADDED

@@ -0,0 +1,35 @@
+# Global config file
+require File.join(File.dirname(__FILE__), '..', 'config')
+# Cache instance base class
+require File.join(File.dirname(__FILE__), 'cache_base')
+# Factory uses singleton pattern
+require 'singleton'
+module Apollo
+	module Cache
+		# Singleton factory handing out one shared cache instance
+		class Factory
+			include Singleton
+			def initialize
+				@cache = nil
+			end
+			# Class-level shortcut for Factory.instance.construct
+			# (the Singleton module provides .instance; it has no .singleton method)
+			def self.construct
+				instance.construct
+			end
+			# Build the cache named by RbConfig::CACHE_CLASS on first call,
+			# then keep returning that same object
+			def construct
+				@cache ||= RbConfig::CACHE_CLASS.new
+			end
+		end # Factory
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/{caches → cache}/filesystem_cache.rb RENAMED

@@ -1,34 +1,37 @@
-require File.join(File.dirname(__FILE__), '..', 'cache')
-module Apollo
-	module Caches
-		class Filesystem < Cache
-			def initialize
-				# puts "This if Filesystem cache"
-			end
-			# Get value associated with key from cache
-			def get(key, *args)
-				# Not found, Create, cache and return
-				res = yield args
-				return res
-			end
-			# Set value associated with key
-			# Return cached value
-			def set(key, value)
-				return value
-			end
-			# Check if cache contains specified key
-			def contains(key)
-				return false
-			end
-			# Invalidate key/value pair
-			def invalidate(key)
-				return true
-			end
-		end # Filesystem
-	end # Caches
-end # Apollo
+require File.join(File.dirname(__FILE__), 'cache_base')
+module Apollo
+	module Cache
+		class Filesystem < CacheBase # Still a stub: no file is read or written anywhere below
+			def initialize
+				# Nothing to set up yet (disabled debug output: puts "This is Filesystem cache")
+			end
+			# Get value for key - key is ignored, every call is treated as a miss
+			def get(key, *args)
+				# Always "not found": yield the extra args (as one array) to the block, if any
+				if block_given?
+					res = yield args
+				end
+				return res # the block's result, or nil when no block was given
+			end
+			# Set value associated with key - nothing is stored here
+			# Returns the value that was passed in
+			def set(key, value)
+				return value
+			end
+			# Check if cache contains specified key - always false in this stub
+			def contains(key)
+				return false
+			end
+			# Invalidate key/value pair - nothing to remove, always returns true
+			def invalidate(key)
+				return true
+			end
+		end # Filesystem
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/cache/memcached_cache.rb ADDED

@@ -0,0 +1,51 @@
+require File.join(File.dirname(__FILE__), 'cache_base')
+require 'dalli'
+module Apollo
+	module Cache
+		# Cache backed by a memcached server, accessed through Dalli
+		class Memcached < CacheBase
+			def initialize
+				# No arguments: Dalli picks its default server
+				@cache = Dalli::Client.new()
+			end
+			# Get value associated with key from cache
+			# On a miss the block (if given) is called with args and its
+			# non-nil result is stored under key before being returned
+			def get(key, *args)
+				res = @cache.get(key)
+				# Not found, Create, cache and return
+				if res.nil? && block_given?
+					res = yield args
+					# A nil result would read back as a miss anyway, so do not store it
+					self.set(key, res) unless res.nil?
+				end
+				return res
+			end
+			# Set value associated with key
+			# Return cached value (not the key), like the other cache classes
+			def set(key, value)
+				@cache.set(key, value)
+				return value
+			end
+			# Check if cache contains specified key
+			# A key whose stored value is nil is reported as absent
+			def contains(key)
+				return !@cache.get(key).nil?
+			end
+			# Invalidate key/value pair
+			# Returns true, matching CacheBase#invalidate
+			def invalidate(key)
+				@cache.delete(key)
+				return true
+			end
+			# Clear cache
+			# NOTE(review): flush empties the whole memcached server, not only
+			# the keys written by this process - confirm that is acceptable
+			def clear
+				@cache.flush
+				return
+			end
+		end # Memcached
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/{caches → cache}/memory_cache.rb RENAMED

@@ -1,43 +1,46 @@
-require File.join(File.dirname(__FILE__), '..', 'cache')
-module Apollo
-	module Caches
-		class Memory < Cache
-			@storage = nil
-			def initialize
-				@storage = {}
-			end
-			# Get value associated with key from cache
-			def get(key, *args)
-				@storage[key]
-				# Not found, Create, cache and return
-				res = yield args
-				return res
-			end
-			# Set value associated with key
-			# Return cached value
-			def set(key, value)
-				@storage[key] = value
-			end
-			# Check if cache contains specified key
-			def contains(key)
-				@storage.has_key?(key)
-			end
-			# Invalidate key/value pair
-			def invalidate(key)
-				@storage.delete(key)
-			end
-			# Clear cache
-			def clear
-				@storage.clear
-			end
-		end # Null
-	end # Caches
-end # Apollo
+require File.join(File.dirname(__FILE__), 'cache_base')
+module Apollo
+	module Cache
+		# Naive in-process cache keeping every entry in a Hash
+		class Memory < CacheBase
+			def initialize
+				@cache = {}
+			end
+			# Get value associated with key from cache
+			# On a miss the block (if given) is called with args and its
+			# non-nil result is stored under key before being returned
+			def get(key, *args)
+				res = @cache[key]
+				# Not found, Create, cache and return
+				if res.nil? && block_given?
+					res = yield args
+					# The result used to be dropped here, so every get recomputed it
+					self.set(key, res) unless res.nil?
+				end
+				return res
+			end
+			# Set value associated with key
+			# Return cached value
+			def set(key, value)
+				@cache[key] = value
+			end
+			# Check if cache contains specified key
+			def contains(key)
+				@cache.has_key?(key)
+			end
+			# Invalidate key/value pair
+			# Returns the removed value, or nil when the key was absent
+			def invalidate(key)
+				@cache.delete(key)
+			end
+			# Clear cache
+			# Returns the (now empty) Hash
+			def clear
+				@cache.clear
+			end
+		end # Memory
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/{caches → cache}/null_cache.rb RENAMED

@@ -1,30 +1,33 @@
-require File.join(File.dirname(__FILE__), '..', 'cache')
-module Apollo
-	module Caches
-		class Null < Cache
-			# Get value associated with key from cache
-			def get(key, *args)
-				# Not found, Create, cache and return
-				res = yield args
-				return res
-			end
-			# Set value associated with key
-			# Return cached value
-			def set(key, value)
-				return value
-			end
-			# Check if cache contains specified key
-			def contains(key)
-				return false
-			end
-			# Invalidate key/value pair
-			def invalidate(key)
-				return true
-			end
-		end # Null
-	end # Caches
-end # Apollo
+require File.join(File.dirname(__FILE__), 'cache_base')
+module Apollo
+	module Cache
+		class Null < CacheBase # Deliberate "no caching" backend: nothing is ever stored
+			# Get value for key - key is ignored, every call is treated as a miss
+			def get(key, *args)
+				# Always "not found": yield the extra args (as one array) to the block, if any
+				if block_given?
+					res = yield args
+				end
+				return res # the block's result, or nil when no block was given
+			end
+			# Set value associated with key - nothing is stored
+			# Returns the value that was passed in
+			def set(key, value)
+				return value
+			end
+			# Check if cache contains specified key - always false
+			def contains(key)
+				return false
+			end
+			# Invalidate key/value pair - nothing to remove, always returns true
+			def invalidate(key)
+				return true
+			end
+		end # Null
+	end # Cache
+end # Apollo

data/lib/apollo_crawler/config.rb ADDED

@@ -0,0 +1,53 @@
+# Caches
+require File.join(File.dirname(__FILE__), 'lib')
+module RbConfig
+	# NOTE(review): this reopens Ruby's own RbConfig module instead of a
+	# project namespace. The name is kept because Apollo::Cache::Factory
+	# reads RbConfig::CACHE_CLASS - consider moving to Apollo::Config later.
+	############################################################
+	# Caches - caches implementations
+	############################################################
+	# Implementations live in cache/ (renamed from caches/ in this release)
+	CACHES_DIR = File.join(File.dirname(__FILE__), "cache")
+	############################################################
+	# Cache implementation used for caching retrieved pages
+	############################################################
+	#
+	# Filesystem backend
+	# CACHE_CLASS = Apollo::Cache::Filesystem
+	#
+	# Memcached - expects localhost:11211
+	# CACHE_CLASS = Apollo::Cache::Memcached
+	#
+	# Pure naive ruby in-memory implementation
+	# CACHE_CLASS = Apollo::Cache::Memory
+	#
+	# Null caching - no caching at all
+	# CACHE_CLASS = Apollo::Cache::Null
+	# Used caching mechanism by default
+	CACHE_CLASS = Apollo::Cache::Memcached
+	############################################################
+	# Crawlers - Built-in out-of box working crawlers
+	############################################################
+	# Crawlers live in crawler/ (renamed from crawlers/ in this release)
+	CRAWLERS_DIR = File.join(File.dirname(__FILE__), "crawler")
+	# Template used for generated crawlers
+	CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
+	# Path of template - the template moved into the crawler directory too
+	CRAWLER_TEMPLATE_PATH = File.join(CRAWLERS_DIR, CRAWLER_TEMPLATE_NAME)
+	############################################################
+	# Formatters - used for formatting crawled documents results
+	############################################################
+	# Formatters live in formatter/ (renamed from formatters/ in this release)
+	FORMATTERS_DIR = File.join(File.dirname(__FILE__), "formatter")
+	# Default formatter if no other specified
+	DEFAULT_FORMATTER = Apollo::Formatter::Json
+end # RbConfig

data/lib/apollo_crawler/{crawler.rb → crawler/crawler_base.rb} RENAMED

@@ -1,155 +1,157 @@
-require "open-uri"
-require "nokogiri"
-module Apollo
-	module Crawlers
-		class Crawler
-			@backlog = nil
-			def initialize
-				@backlog = []
-			end
-			# Name of the crawler
-			def name
-				return "Crawler Base"
-			end
-			def url
-				return nil
-			end
-			def self.try_get_url(root, url)
-				begin
-					return URI.join(root, url)
-				rescue
-					return nil
-				end
-			end
-			# - (0) Figure out URL
-			# - (1) Extract Data
-			# - (2) Extract Links
-			# - (3) Go to (0) eventually
-			def etl(url=nil, &block)
-				# Look for passed URL use default instead and fail if it is not valid
-				if(url.nil? || url.empty?)
-					url = self.url
-				end
-				if(url.nil?)
-					return nil
-				end
-				if(url.kind_of?(Array))
-					@backlog.concat(url)
-				else
-					@backlog << url
-				end
-				res = []
-				# TODO: Respect limit of documents/urls processed
-				while(@backlog.empty? == false)
-					url = @backlog.shift
-					# puts "Processing '#{url}'"
-					doc = self.process_url(url)
-					res << doc
-					# TODO: Use log4r and log it only on info level
-					# TODO: Add some async/callback signal for document processed
-					yield res
-					if(!doc.nil? && !doc.empty?)
-						doc[:links].each do |link|
-							url = link[:link].to_s
-							# TODO: Use log4r and log it only on info level
-							#puts url
-							# TODO: Check if it is unique
-							@backlog << url
-						end
-					end
-				end
-				return res
-			end
-			def process_url(url)
-				# Try fetch document
-				doc = self.fetch_document(url)
-				if(doc.nil?)
-					return nil
-				end
-				# Try extract data from document
-				data = self.extract_data(doc)
-				# Try extract links for another documents
-				links = self.extract_links(doc)
-				puts links.inspect
-				# Format ETL result
-				res = {
-					:crawler => self.class.name,
-					:title => doc.title,
-					:data => data,
-					:links => links
-				}
-				return res
-			end
-			# Fetch document
-			def fetch_document(url)
-				# TODO: Refactor following idiom
-				if(url == nil)
-					url = self.url
-				end
-				if(url.nil?)
-					return nil
-				end
-				# TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
-				cache = Apollo::Caches::Factory.instance.construct
-				raw = cache.get(url) do
-					max_attempts = 3
-					attempt_no = 0
-					success = false
-					res = nil
-					while(attempt_no < max_attempts && success == false) do
-						begin
-							res = open(url).read
-							success = true
-						rescue Exception => e
-							puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
-							sleep 1
-							attempt_no = attempt_no + 1
-							success = false
-						end
-					end
-					res
-				end
-				# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-				doc = Nokogiri::HTML(raw)
-				return doc
-			end
-			# Extracts data from document
-			def extract_data(doc)
-				res = []
-				return res
-			end
-			# Extract links to another documents from this document
-			def extract_links(doc)
-				res = []
-				return res
-			end
-		end
-	end
-end
+require "open-uri"
+require "nokogiri"
+module Apollo
+	module Crawler
+		# Base class for crawlers: fetches a page, extracts data and links,
+		# and keeps following the extracted links from a backlog queue.
+		# Subclasses override name, url, extract_data and extract_links.
+		class CrawlerBase
+			def initialize
+				# Queue of urls still waiting to be processed
+				@backlog = []
+			end
+			# Name of the crawler
+			def name
+				return "Crawler Base"
+			end
+			# Default start url; nil here, so etl without a url does nothing
+			def url
+				return nil
+			end
+			# Join url onto root; returns nil instead of raising when they cannot be joined
+			def self.try_get_url(root, url)
+				begin
+					return URI.join(root, url)
+				rescue
+					return nil
+				end
+			end
+			# - (0) Figure out URL
+			# - (1) Extract Data
+			# - (2) Extract Links
+			# - (3) Go to (0) eventually
+			#
+			# url may be a single url or an Array of urls; nil or empty falls
+			# back to self.url. After every processed url the block (if given)
+			# receives the array of results collected so far. Returns that
+			# array (entries are nil for urls that could not be processed),
+			# or nil when there is no url to start from.
+			def etl(url=nil, &block)
+				# Look for passed URL use default instead and fail if it is not valid
+				if(url.nil? || url.empty?)
+					url = self.url
+				end
+				if(url.nil?)
+					return nil
+				end
+				if(url.kind_of?(Array))
+					@backlog.concat(url)
+				else
+					@backlog << url
+				end
+				res = []
+				# TODO: Respect limit of documents/urls processed
+				while(@backlog.empty? == false)
+					url = @backlog.shift
+					# puts "Processing '#{url}'"
+					doc = self.process_url(url)
+					res << doc
+					# TODO: Use log4r and log it only on info level
+					# TODO: Add some async/callback signal for document processed
+					if block_given?
+						yield res
+					end
+					if(!doc.nil? && !doc.empty?)
+						doc[:links].each do |link|
+							url = link[:link].to_s
+							# TODO: Use log4r and log it only on info level
+							#puts url
+							# TODO: Check if it is unique
+							@backlog << url
+						end
+					end
+				end
+				return res
+			end
+			# Fetch one url and run both extractors on it.
+			# Returns a hash with :crawler, :title, :data and :links,
+			# or nil when the document could not be fetched.
+			def process_url(url)
+				# Try fetch document
+				doc = self.fetch_document(url)
+				if(doc.nil?)
+					return nil
+				end
+				# Try extract data from document
+				data = self.extract_data(doc)
+				# Try extract links for another documents
+				links = self.extract_links(doc)
+				# TODO: Use log4r and log it only on debug level
+				# (unconditional output here would mix into the formatted results)
+				# puts links.inspect
+				# Format ETL result
+				res = {
+					:crawler => self.class.name,
+					:title => doc.title,
+					:data => data,
+					:links => links
+				}
+				return res
+			end
+			# Fetch document
+			# Goes through the shared cache; on a miss the page is downloaded
+			# with up to three attempts. Returns the parsed Nokogiri document,
+			# or nil when there is no url or every attempt failed.
+			def fetch_document(url)
+				# TODO: Refactor following idiom
+				if(url == nil)
+					url = self.url
+				end
+				if(url.nil?)
+					return nil
+				end
+				# TODO: Use some (custom-made) low-level HTTP Protocol cache - just for sure
+				cache = Apollo::Cache::Factory.instance.construct
+				raw = cache.get(url) do
+					max_attempts = 3
+					attempt_no = 0
+					success = false
+					res = nil
+					while(attempt_no < max_attempts && success == false) do
+						begin
+							# NOTE(review): Kernel#open only handles urls via open-uri on
+							# older Rubies; newer ones need URI.open - confirm target version
+							res = open(url).read
+							success = true
+						rescue StandardError => e
+							# StandardError, not Exception: Ctrl-C and exit must not be retried
+							puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
+							sleep 1
+							attempt_no += 1
+							success = false
+						end
+					end
+					res
+				end
+				# Nothing was fetched: report it as nil so the callers' nil checks apply
+				if(raw.nil?)
+					return nil
+				end
+				# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+				doc = Nokogiri::HTML(raw)
+				return doc
+			end
+			# Extracts data from document
+			# Base version extracts nothing and returns an empty array
+			def extract_data(doc)
+				res = []
+				return res
+			end
+			# Extract links to another documents from this document
+			# Base version extracts nothing and returns an empty array;
+			# etl reads link[:link] from each returned entry
+			def extract_links(doc)
+				res = []
+				return res
+			end
+		end # CrawlerBase
+	end # Crawler
+end # Apollo