apollo-crawler 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
File without changes
@@ -0,0 +1,37 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
module Apollo
	module Crawler
		module Plugins
			# Base class for all crawler plugins. Concrete plugins override
			# #name and #run; #extract_doc_data and #fetch_leafs are hooks
			# for extraction and link-following logic.
			class Plugin

				# Human-readable name of the plugin, used in docs, lookups, etc.
				def name
					"Plugin Base"
				end

				# Crawl entry point. A plugin typically:
				# - fetches its default URL (and transforms it to a document)
				# - extracts and loads (stores) the important data
				# - looks for further documents to crawl, e.g.:
				#   - "next page"
				#   - "people you may know on Linked in"
				#   - "will attend on FB"
				# Returns a Hash describing the result.
				def run
					{ :plugin => self.class.name }
				end

				# Extracts data from currently processed URL (called document here).
				# No-op in the base class.
				def extract_doc_data
				end

				# Tries to get links of other URLs (called leaf here) to crawl.
				# No-op in the base class.
				def fetch_leafs
				end
			end
		end
	end
end
@@ -0,0 +1,36 @@
1
# FIX: original read `File.join(F'..', ...)` — the stray `F` was a syntax error.
require File.join('..', '..', '..', 'plugin')

module Apollo
	module Crawler
		module Plugins
			# Crawls the Alexa front page and extracts the alphabetical
			# category links.
			# PARAMATRIZE: Plugin class name
			class Alexa < Plugin
				@@URL = "http://www.alexa.com/"

				# XPath for the alphabetical category anchors on the front page.
				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

				# Human-readable plugin name.
				def name()
					return "Alexa Rank"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched anchor.
				# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
				def run()
					doc = Nokogiri::HTML(open(@@URL))

					res = doc.xpath(@@MATCHER_ITEM).map { |i|
						{
							:text => i.text,
							:link => URI.join(@@URL, i['href'])
						}
					}

					return {
						:plugin => self.class.name,
						:title => doc.title,
						:res => res
					}
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the firmy.cz front page and extracts the alphabetical
			# category links.
			# PARAMATRIZE: Plugin class name
			class Firmy < Plugin
				@@URL = "http://www.firmy.cz/"

				# XPath for the alphabetical category anchors on the front page.
				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

				# Human-readable plugin name.
				def name
					"Firmy.cz"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched anchor.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					items = doc.xpath(@@MATCHER_ITEM).map do |anchor|
						{ :text => anchor.text, :link => URI.join(@@URL, anchor['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => items }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the Slashdot front page and extracts story headlines.
			# PARAMATRIZE: Plugin class name
			class Slashdot < Plugin
				@@URL = "http://slashdot.org/"

				# XPath matching the headline anchors on the front page.
				@@MATCHER_ITEM = "//article/header/h2/span/a"

				# Human-readable plugin name.
				def name
					"Slashdot"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched headline.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					entries = doc.xpath(@@MATCHER_ITEM).map do |node|
						{ :text => node.text, :link => URI.join(@@URL, node['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => entries }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the Hacker News front page and extracts story links.
			# PARAMATRIZE: Plugin class name
			class HackerNews < Plugin
				@@URL = "http://news.ycombinator.com/"

				# XPath matching the story-title anchors on the front page.
				@@MATCHER_ITEM = "//td[@class = 'title']/a"

				# Human-readable plugin name.
				def name
					"Hacker News"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched story.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					stories = doc.xpath(@@MATCHER_ITEM).map do |link_node|
						{ :text => link_node.text, :link => URI.join(@@URL, link_node['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => stories }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
data/main.rb ADDED
@@ -0,0 +1,170 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "bundler/setup"
5
+
6
+ require 'json'
7
+
8
+ require "thor"
9
+
10
+ require "open-uri"
11
+ require "nokogiri"
12
+
13
+ require "pp"
14
+ require "optparse"
15
+
16
module Crawler
	# Command-line driver: parses options, loads the optional config
	# file, discovers plugin classes on disk and runs the requested ones.
	class Program
		# This hash will hold all of the options
		# parsed from the command-line by
		# OptionParser.
		@options = nil
		@optparser = nil
		@plugins = nil

		# Initializer - Constructor
		def initialize
			@plugins = {}
		end

		# Initialize command-line options and build the OptionParser.
		def init_options
			@options = {}
			@options[:verbose] = false

			@optparser = OptionParser.new do | opts |
				# This displays the help screen, all programs are
				# assumed to have this option.
				opts.on('-h', '--help', 'Display this screen') do
					puts opts
					exit
				end

				opts.on('-a', '--all', 'Run all plugins') do
					@options[:run_all] = true
				end

				opts.on('-v', '--verbose', 'Enable verbose output') do
					@options[:verbose] = true
				end

				opts.on('-l', '--list-plugins', 'List of plugins') do
					@options[:list_plugins] = true
				end
			end
		end

		# Parse the options passed to command-line.
		# 'parse!' parses ARGV and removes any options found there,
		# as well as any parameters for the options. What's left is
		# the list of plugin names to run.
		def parse_options
			@optparser.parse!
		end

		# Load global options first
		# Merge it with local options (if they exists)
		def load_config_file()
			config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
			# FIX: File.exists? is deprecated; use File.exist?
			if File.exist?(config)
				# FIX: removed leftover debug output (one line interpolated
				# the verbose flag instead of the config path).
				puts "Loading config '#{config}'" if @options[:verbose]

				require config
			else
				if @options[:verbose]
					# TODO: Add support for initial rake task generation
					# Something like this:
					#   rake config:init # Initializes config files with
					#                      their defaults (if not exists already)
					puts "Default config does not exist, skipping - '#{config}'"
				end
			end
		end

		# Register plugins (specific crawlers) found under
		# lib/apollo_crawler/plugins, keyed by lower-cased class name.
		def register_plugins()
			dir = File.join(File.dirname(__FILE__), "lib", "apollo_crawler", "plugins")
			puts "Registering plugins - '#{dir}'" if @options[:verbose]

			sites = File.join(dir, "**", "*.rb")
			Dir.glob(sites).each do |site|
				require site
			end

			# Collect every class defined under Apollo::Crawler::Plugins.
			tmp = Apollo::Crawler::Plugins.constants.select { |c|
				Class === Apollo::Crawler::Plugins.const_get(c)
			}

			tmp.each do |x|
				klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
				@plugins.merge!({ x.downcase.to_s => klass })
			end

			if @options[:verbose]
				@plugins.each do |plugin, klass|
					name = klass.new.class.name

					# Skip the abstract base class in the listing.
					next if name == "Apollo::Crawler::Plugins::Plugin"

					puts "Registered '#{plugin}' -> '#{name}'"
				end
			end
		end

		# Entry point: parse options, load config, register plugins
		# and run the requested plugins (or list them).
		def run
			init_options()

			# FIX: parse CLI flags before loading the config so that
			# --verbose already applies to config loading and
			# plugin registration.
			parse_options()

			load_config_file()

			# Register sites which can be crawled
			register_plugins()

			if @options[:list_plugins]
				puts "Listing plugins"
				puts "----------------------------------------"
				i = 0
				@plugins.sort.each do |plugin, klass|
					instance = klass.new
					puts "(#{i}) #{plugin} - #{instance.name}"
					i += 1
				end
				puts "----------------------------------------"
				return
			end

			plugins = ARGV

			if @options[:run_all]
				plugins = @plugins.keys
			end

			if plugins.empty?
				puts @optparser
			end

			plugins.each do |plugin|
				klass = @plugins[plugin.downcase]

				# FIX: guard against unknown plugin names instead of
				# crashing with NoMethodError on nil.
				if klass.nil?
					$stderr.puts "Unknown plugin '#{plugin}', skipping"
					next
				end

				puts JSON.pretty_generate(klass.new.run)
			end
		end
	end
end
167
+
168
# Run the crawler only when this file is executed directly,
# not when it is required by another script.
if __FILE__ == $0
	Crawler::Program.new.run
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -16,7 +16,14 @@ email: korczis@gmail.com
16
16
  executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
- files: []
19
+ files:
20
+ - ./main.rb
21
+ - ./lib/apollo_crawler/crawler.rb
22
+ - ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
23
+ - ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
24
+ - ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
25
+ - ./lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb
26
+ - ./lib/apollo_crawler/plugin.rb
20
27
  homepage: https://github.com/korczis/apollo-crawler
21
28
  licenses: []
22
29
  post_install_message: