apollo-crawler 0.0.44 → 0.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +48 -48
- data/lib/apollo_crawler.rb +6 -9
- data/lib/apollo_crawler/crawler.rb +77 -0
- data/lib/apollo_crawler/{plugin_template.rb → crawler_template.rb} +6 -7
- data/lib/apollo_crawler/{plugins → crawlers}/alexa_com/alexa.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/firmy_cz/firmy.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/slashdot_org/slashdot.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/stackoverflow_com/stackoverflow.rb +5 -5
- data/lib/apollo_crawler/{plugins → crawlers}/xkcd_com/xkcd.rb +4 -4
- data/lib/apollo_crawler/{plugins → crawlers}/ycombinator_com/hacker_news.rb +4 -4
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +26 -11
- data/lib/apollo_crawler/plugin.rb +0 -73
data/bin/apollo-crawler
CHANGED
@@ -24,22 +24,22 @@ require File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', 'version')
 
 module Crawler
   class Program
-    @@PLUGINS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "plugins")
+    @@CRAWLERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "crawlers")
     @@FORMATTERS_DIR = File.join(File.dirname(__FILE__), "..", "lib", "apollo_crawler", "formatters")
-    @@PLUGIN_TEMPLATE_NAME = "plugin_template.rb"
+    @@CRAWLER_TEMPLATE_NAME = "crawler_template.rb"
 
     # This hash will hold all of the options
     # parsed from the command-line by
     # OptionParser.
     @options = nil
     @optparser = nil
-    @plugins = nil
+    @crawlers = nil
     @formatters = nil
     @formatter = nil
 
     # Initializer - Constructor
     def initialize
-      @plugins = {}
+      @crawlers = {}
       @formatters = {}
     end
 
@@ -48,13 +48,13 @@ module Crawler
       @options = {}
       @options[:verbose] = false
       @options[:version] = false
-      @options[:plugin_dirs] = [
-        @@PLUGINS_DIR
+      @options[:crawler_dirs] = [
+        @@CRAWLERS_DIR
       ]
       @options[:formatter_dirs] = [
         @@FORMATTERS_DIR
       ]
-      @options[:generate_plugin] = nil
+      @options[:generate_crawler] = nil
 
       @optparser = OptionParser.new do | opts |
         # This displays the help screen, all programs are
@@ -64,7 +64,7 @@ module Crawler
           exit
         end
 
-        opts.on('-a', '--all', 'Run all plugins') do
+        opts.on('-a', '--all', 'Run all crawlers') do
          @options[:run_all] = true
        end
 
@@ -72,12 +72,12 @@ module Crawler
          @options[:formatter] = name
        end
 
-        opts.on('-g', '--generate [NAME]', "Generate scaffold for new plugin") do |name|
-          @options[:generate_plugin] = name
+        opts.on('-g', '--generate [NAME]', "Generate scaffold for new crawler") do |name|
+          @options[:generate_crawler] = name
        end
 
-        opts.on('-i', '--include [PATH]', 'Include additional plugin or plugin directory') do |path|
-          @options[:plugin_dirs] << path
+        opts.on('-i', '--include [PATH]', 'Include additional crawler or crawler directory') do |path|
+          @options[:crawler_dirs] << path
        end
 
        opts.on('-v', '--verbose', 'Enable verbose output') do
@@ -88,8 +88,8 @@ module Crawler
          @options[:version] = true
        end
 
-        opts.on('-l', '--list-plugins', 'List of plugins') do
-          @options[:list_plugins] = true
+        opts.on('-l', '--list-crawlers', 'List of crawlers') do
+          @options[:list_crawlers] = true
        end
 
        opts.on(nil, '--list-formatters', 'List of formatters available') do
@@ -163,10 +163,10 @@ module Crawler
       end
     end
 
-    # Register plugins
-    def register_plugins(dir)
+    # Register crawlers
+    def register_crawlers(dir)
       if(@options[:verbose])
-        puts "Registering plugins - '#{dir}'"
+        puts "Registering crawlers - '#{dir}'"
       end
 
       files = File.join(dir, "**", "*.rb")
@@ -174,36 +174,36 @@ module Crawler
         require file
       end
 
-      tmp = Apollo::Crawler::Plugins.constants.select { |c|
-        Class === Apollo::Crawler::Plugins.const_get(c)
+      tmp = Apollo::Crawler::Crawlers.constants.select { |c|
+        Class === Apollo::Crawler::Crawlers.const_get(c)
       }
 
       tmp.each do |x|
-        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
-        @plugins.merge!({ x.downcase.to_s => klass})
+        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Crawlers').const_get(x)
+        @crawlers.merge!({ x.downcase.to_s => klass})
       end
 
       if(@options[:verbose])
-        @plugins.each do |plugin, klass|
+        @crawlers.each do |crawler, klass|
           name = klass.new.class.name
 
-          if name == "Apollo::Crawler::Plugins::Plugin"
+          if name == "Apollo::Crawler::Crawlers::Crawler"
             next
           end
 
-          puts "Registered plugin '#{plugin}' -> '#{name}'"
+          puts "Registered crawler '#{crawler}' -> '#{name}'"
         end
       end
     end
 
-    def generate_plugin(name, url = nil, matcher = nil)
+    def generate_crawler(name, url = nil, matcher = nil)
       name = name.titleize.gsub(" ", "")
 
       if(@options[:verbose])
-        puts "Generating new plugin '#{name}'"
+        puts "Generating new crawler '#{name}'"
       end
 
-      template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@PLUGIN_TEMPLATE_NAME)
+      template_path = File.join(File.dirname(__FILE__), '..', 'lib', 'apollo_crawler', @@CRAWLER_TEMPLATE_NAME)
       if(File.exists?(template_path) == false)
         puts "Template file '#{template_path}' does not exists!"
         return
@@ -219,23 +219,23 @@ module Crawler
       matcher = matcher ? matcher : "//a"
 
       placeholders = {
-        "PLUGIN_CLASS_NAME" => name,
-        "PLUGIN_NAME" => name.titleize,
-        "PLUGIN_URL" => url,
-        "PLUGIN_MATCHER" => matcher
+        "CRAWLER_CLASS_NAME" => name,
+        "CRAWLER_NAME" => name.titleize,
+        "CRAWLER_URL" => url,
+        "CRAWLER_MATCHER" => matcher
       }
 
-      puts "Generating plugin '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
+      puts "Generating crawler '#{name.titleize}', class: '#{name}', path: '#{dest_path}'"
 
       File.open(template_path, 'r') do |tmpl|
-        File.open(dest_path, 'w') do |plugin|
+        File.open(dest_path, 'w') do |crawler|
           while line = tmpl.gets
             #puts line
             placeholders.each do |k, v|
               line.gsub!(k, v)
             end
 
-            plugin.puts line
+            crawler.puts line
           end
         end
       end
@@ -253,18 +253,18 @@ module Crawler
 
       load_config_file()
 
-      if(@options[:generate_plugin])
-        name = @options[:generate_plugin]
+      if(@options[:generate_crawler])
+        name = @options[:generate_crawler]
         url = ARGV.length > 0 ? ARGV[0] : nil
         matcher = ARGV.length > 1 ? ARGV[1] : nil
 
-        self.generate_plugin(name, url, matcher)
+        self.generate_crawler(name, url, matcher)
         exit
       end
 
       # Register sites which can be crawled
-      @options[:plugin_dirs].each do |dir|
-        register_plugins(dir)
+      @options[:crawler_dirs].each do |dir|
+        register_crawlers(dir)
       end
 
       # Register sites which can be crawled
@@ -298,9 +298,9 @@ module Crawler
         return
       end
 
-      if(@options[:list_plugins])
+      if(@options[:list_crawlers])
         headings = ['name', 'class']
-        rows = @plugins
+        rows = @crawlers
 
         table = Terminal::Table.new :headings => headings, :rows => rows
 
@@ -308,27 +308,27 @@ module Crawler
         return
       end
 
-      plugins = ARGV
+      crawlers = ARGV
 
       if(@options[:run_all])
-        plugins = @plugins.keys
+        crawlers = @crawlers.keys
       end
 
-      if(plugins.empty?)
+      if(crawlers.empty?)
         puts @optparser
         exit
       end
 
-      plugins.each do |plugin|
-        p = @plugins[plugin.downcase]
+      crawlers.each do |crawler|
+        p = @crawlers[crawler.downcase]
         if(p == nil)
-          puts "Invalid plugin name - '#{plugin}'"
+          puts "Invalid crawler name - '#{crawler}'"
           puts "See program help"
           next
         end
 
         if(@options[:verbose])
-          puts "Running '#{plugin}'"
+          puts "Running '#{crawler}'"
        end
 
        res = p.new.etl
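The registration step above amounts to loading every file under the crawler directories and enumerating class constants beneath the new Apollo::Crawler::Crawlers module. A minimal standalone sketch of that pattern, assuming the 0.0.45 gem is installed (the printed output is illustrative):

require 'apollo_crawler'

# Collect every class defined under Apollo::Crawler::Crawlers, keyed by its
# lowercased constant name - the same lookup key the CLI uses when you run,
# for example, `apollo-crawler xkcd`.
crawlers = {}
Apollo::Crawler::Crawlers.constants.select { |c|
  Class === Apollo::Crawler::Crawlers.const_get(c)
}.each do |c|
  crawlers[c.downcase.to_s] = Apollo::Crawler::Crawlers.const_get(c)
end

crawlers.each do |name, klass|
  puts "#{name} -> #{klass}"
end
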
data/lib/apollo_crawler.rb
CHANGED
@@ -1,16 +1,13 @@
-# require 'apollo_crawler/plugin'
-
 require 'apollo_crawler/crawler'
 require 'apollo_crawler/formatter'
-
+
+# Crawlers
+require 'apollo_crawler/crawlers/alexa_com/alexa'
+require 'apollo_crawler/crawlers/firmy_cz/firmy'
+require 'apollo_crawler/crawlers/slashdot_org/slashdot'
+require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
 
 # Formatters
 require 'apollo_crawler/formatters/formatter_json'
 require 'apollo_crawler/formatters/formatter_plain'
 require 'apollo_crawler/formatters/formatter_table'
-
-# Plugins
-require 'apollo_crawler/plugins/alexa_com/alexa'
-require 'apollo_crawler/plugins/firmy_cz/firmy'
-require 'apollo_crawler/plugins/slashdot_org/slashdot'
-require 'apollo_crawler/plugins/ycombinator_com/hacker_news'
data/lib/apollo_crawler/crawler.rb
ADDED
@@ -0,0 +1,77 @@
+require "open-uri"
+require "nokogiri"
+
+module Apollo
+  module Crawler
+    module Crawlers
+      class Crawler
+
+        # Name of the crawler
+        def name
+          return "Crawler Base"
+        end
+
+        def url
+          return nil
+        end
+
+        # - (0) Figure out URL
+        # - (1) Extract Data
+        # - (2) Extract Links
+        # - (3) Go to (0) eventually
+        def etl(url=nil)
+          # Look for passed URL use default instead and fail if it is not valid
+          url = url ? url : self.url
+          if(url.nil?)
+            return nil
+          end
+
+          # Try fetch document
+          doc = self.fetch_document(url)
+          if(doc.nil?)
+            return nil
+          end
+
+          # Try extract data from document
+          data = self.extract_data(doc)
+
+          # Try extract links for another documents
+          links = self.extract_links(doc)
+
+          # Return ETL result
+          return {
+            :crawler => self.class.name,
+            :title => doc.title,
+            :data => data,
+            :links => links
+          }
+        end
+
+        # Fetch document
+        def fetch_document(url)
+          ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+
+          if(self.url.nil?)
+            return nil
+          end
+
+          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+          doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
+          return doc
+        end
+
+        # Extracts data from document
+        def extract_data(doc)
+          res = []
+          return res
+        end
+
+        # Extract links to another documents from this document
+        def extract_links(doc)
+          res = []
+          return res
+        end
+      end
+    end
+  end
+end
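The new base class gives every crawler the same fetch/extract/return cycle; concrete crawlers only override name, url, and the extractors. A minimal sketch of a custom crawler against the API added above; the Example class, its URL, and its XPath are hypothetical, not part of the gem:

require 'apollo_crawler'

module Apollo
  module Crawler
    module Crawlers
      # Hypothetical crawler, shown only to illustrate the base-class contract
      class Example < Apollo::Crawler::Crawlers::Crawler
        def name
          return "Example"
        end

        def url
          return "http://example.com/"
        end

        # Pull the text and href of every anchor on the page
        def extract_data(doc)
          doc.xpath("//a").map do |node|
            { :text => node.text, :link => node['href'] }
          end
        end
      end
    end
  end
end

res = Apollo::Crawler::Crawlers::Example.new.etl
puts res[:title] unless res.nil?
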
data/lib/apollo_crawler/{plugin_template.rb → crawler_template.rb}
RENAMED
@@ -2,17 +2,16 @@ require 'iconv'
 
 module Apollo
   module Crawler
-    module Plugins
-      class PLUGIN_CLASS_NAME < Apollo::Crawler::Plugins::Plugin
-
-        @@MATCHER_ITEM = "PLUGIN_MATCHER"
+    module Crawlers
+      class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
+        @@MATCHER_ITEM = "CRAWLER_MATCHER"
 
         def name()
-          return "PLUGIN_NAME"
+          return "CRAWLER_NAME"
         end
 
         def url()
-          return "PLUGIN_URL"
+          return "CRAWLER_URL"
         end
 
         def extract_data(doc)
@@ -24,6 +23,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
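The scaffold generator fills the four placeholders above via the gsub loop shown in bin/apollo-crawler. For instance, `apollo-crawler -g "Example Site" http://example.com "//a"` would expand the template to roughly the following (name and URL are illustrative; the extractor methods carried over from the template are trimmed here):

require 'iconv'

module Apollo
  module Crawler
    module Crawlers
      # CRAWLER_CLASS_NAME -> ExampleSite, CRAWLER_NAME -> "Example Site",
      # CRAWLER_URL -> http://example.com, CRAWLER_MATCHER -> //a
      class ExampleSite < Apollo::Crawler::Crawlers::Crawler
        @@MATCHER_ITEM = "//a"

        def name()
          return "Example Site"
        end

        def url()
          return "http://example.com"
        end
      end
    end # Crawlers
  end # Crawler
end # Apollo
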
data/lib/apollo_crawler/{plugins → crawlers}/alexa_com/alexa.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Alexa < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Alexa < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/firmy_cz/firmy.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Firmy < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Firmy < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/slashdot_org/slashdot.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Slashdot < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Slashdot < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//article/header/h2/span/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/stackoverflow_com/stackoverflow.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
-  module Crawler
-    module Plugins
-      class StackOverflow < Apollo::Crawler::Plugins::Plugin
+  module Crawlers
+    module Crawler
+      class StackOverflow < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/xkcd_com/xkcd.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class Xkcd < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class Xkcd < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//div[@id = 'comic']/img"
 
         def name()
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
data/lib/apollo_crawler/{plugins → crawlers}/ycombinator_com/hacker_news.rb
RENAMED
@@ -1,11 +1,11 @@
 require 'iconv'
 
-require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
+require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
 
 module Apollo
   module Crawler
-    module Plugins
-      class HackerNews < Apollo::Crawler::Plugins::Plugin
+    module Crawlers
+      class HackerNews < Apollo::Crawler::Crawlers::Crawler
         @@MATCHER_ITEM = "//td[@class = 'title']/a"
 
         def name
@@ -25,6 +25,6 @@ module Apollo
           }
         end
       end
-    end # Plugins
+    end # Crawlers
   end # Crawler
 end # Apollo
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.44
+  version: 0.0.45
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-
+date: 2013-02-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: amqp
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: writeexcel
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: iconv
   requirement: !ruby/object:Gem::Requirement
@@ -219,7 +235,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-description: Gem for crawling data from external
+description: Gem for crawling data from external sources
 email: korczis@gmail.com
 executables:
 - apollo-crawler
@@ -230,16 +246,15 @@ files:
 - ./lib/apollo_crawler/formatters/formatter_json.rb
 - ./lib/apollo_crawler/formatters/formatter_table.rb
 - ./lib/apollo_crawler/version.rb
+- ./lib/apollo_crawler/crawler_template.rb
 - ./lib/apollo_crawler/crawler.rb
+- ./lib/apollo_crawler/crawlers/stackoverflow_com/stackoverflow.rb
+- ./lib/apollo_crawler/crawlers/xkcd_com/xkcd.rb
+- ./lib/apollo_crawler/crawlers/slashdot_org/slashdot.rb
+- ./lib/apollo_crawler/crawlers/firmy_cz/firmy.rb
+- ./lib/apollo_crawler/crawlers/alexa_com/alexa.rb
+- ./lib/apollo_crawler/crawlers/ycombinator_com/hacker_news.rb
 - ./lib/apollo_crawler/formatter.rb
-- ./lib/apollo_crawler/plugin_template.rb
-- ./lib/apollo_crawler/plugins/stackoverflow_com/stackoverflow.rb
-- ./lib/apollo_crawler/plugins/xkcd_com/xkcd.rb
-- ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
-- ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
-- ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
-- ./lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb
-- ./lib/apollo_crawler/plugin.rb
 - ./lib/apollo_crawler.rb
 - bin/apollo-crawler
 homepage: https://github.com/korczis/apollo-crawler
data/lib/apollo_crawler/plugin.rb
REMOVED
@@ -1,73 +0,0 @@
-require "open-uri"
-require "nokogiri"
-
-module Apollo
-  module Crawler
-    module Plugins
-      class Plugin
-
-        # Name of the plugin, used in docs, lookups, etc ...
-        def name
-          return "Plugin Base"
-        end
-
-        def url
-          return nil
-        end
-
-        def etl(url=nil)
-          # Look for passed URL use default instead and fail if it is not valid
-          url = url ? url : self.url
-          if(url.nil?)
-            return nil
-          end
-
-          # Try fetch document
-          doc = self.fetch_document(url)
-          if(doc.nil?)
-            return nil
-          end
-
-          # Try extract data from document
-          data = self.extract_data(doc)
-
-          # Try extract links for another documents
-          links = self.extract_links(doc)
-
-          # Return ETL result
-          return {
-            :plugin => self.class.name,
-            :title => doc.title,
-            :data => data,
-            :links => links
-          }
-        end
-
-        # Fetch document
-        def fetch_document(url)
-          ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-
-          if(self.url.nil?)
-            return nil
-          end
-
-          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-          doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
-          return doc
-        end
-
-        # Extracts data from document
-        def extract_data(doc)
-          res = []
-          return res
-        end
-
-        # Extract links to another documents from this document
-        def extract_links(doc)
-          res = []
-          return res
-        end
-      end
-    end
-  end
-end