apollo-crawler 0.0.2 → 0.0.3

metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: apollo-crawler
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -16,14 +16,7 @@ email: korczis@gmail.com
  executables: []
  extensions: []
  extra_rdoc_files: []
- files:
- - ./main.rb
- - ./lib/crawler.rb
- - ./lib/plugins/slashdot_org/slashdot.rb
- - ./lib/plugins/firmy_cz/firmy.rb
- - ./lib/plugins/alexa_com/alexa.rb
- - ./lib/plugins/ycombinator_com/hacker_news.rb
- - ./lib/plugin.rb
+ files: []
  homepage: https://github.com/korczis/apollo-crawler
  licenses: []
  post_install_message:
data/lib/crawler.rb DELETED
File without changes
data/lib/plugin.rb DELETED
@@ -1,37 +0,0 @@
- require "open-uri"
- require "nokogiri"
-
- module Apollo
-   module Crawler
-     module Plugins
-       class Plugin
-
-         # Name of the plugin, used in docs, lookups, etc ...
-         def name
-           return "Plugin Base"
-         end
-
-         # - Fetch default URL (and transform it to document)
-         # - Extract and Load (Store) important data
-         # - Look for another documents
-         # Examples:
-         # - "next page"
-         # - "people you may know on Linked in"
-         # - "will attend on FB")
-         def run
-           return {
-             :plugin => self.class.name
-           }
-         end
-
-         # Extracts data from currently processed URL (called document here)
-         def extract_doc_data
-         end
-
-         # This function tries to get links of another URLs (called leaf here) to crawl
-         def fetch_leafs
-         end
-       end
-     end
-   end
- end
data/lib/plugins/alexa_com/alexa.rb DELETED
@@ -1,36 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Alexa < Plugin
-         @@URL = "http://www.alexa.com/"
-
-         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
-
-         def name()
-           return "Alexa Rank"
-         end
-
-         def run()
-           # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/firmy_cz/firmy.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Firmy < Plugin
-         @@URL = "http://www.firmy.cz/"
-
-         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
-
-         def name()
-           return "Firmy.cz"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/slashdot_org/slashdot.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Slashdot < Plugin
-         @@URL = "http://slashdot.org/"
-
-         @@MATCHER_ITEM = "//article/header/h2/span/a"
-
-         def name
-           return "Slashdot"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/ycombinator_com/hacker_news.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class HackerNews < Plugin
-         @@URL = "http://news.ycombinator.com/"
-
-         @@MATCHER_ITEM = "//td[@class = 'title']/a"
-
-         def name
-           return "Hacker News"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/main.rb DELETED
@@ -1,170 +0,0 @@
- #! /usr/bin/env ruby
-
- require "rubygems"
- require "bundler/setup"
-
- require 'json'
-
- require "thor"
-
- require "open-uri"
- require "nokogiri"
-
- require "pp"
- require "optparse"
-
- module Crawler
-   class Program
-     # This hash will hold all of the options
-     # parsed from the command-line by
-     # OptionParser.
-     @options = nil
-     @optparser = nil
-     @plugins = nil
-
-     # Initializer - Constructor
-     def initialize
-       @plugins = {}
-     end
-
-     # Initialize command-line options
-     def init_options
-       @options = {}
-       @options[:verbose] = false
-
-       @optparser = OptionParser.new do | opts |
-         # This displays the help screen, all programs are
-         # assumed to have this option.
-         opts.on('-h', '--help', 'Display this screen') do
-           puts opts
-           exit
-         end
-
-         opts.on('-a', '--all', 'Run all plugins') do
-           @options[:run_all] = true
-         end
-
-         opts.on('-v', '--verbose', 'Enable verbose output') do
-           @options[:verbose] = true
-         end
-
-         opts.on('-l', '--list-plugins', 'List of plugins') do
-           @options[:list_plugins] = true
-         end
-       end
-     end
-
-     # Parse the options passed to command-line
-     def parse_options
-       # Parse the command-line. Remember there are two forms
-       # of the parse method. The 'parse' method simply parses
-       # ARGV, while the 'parse!' method parses ARGV and removes
-       # any options found there, as well as any parameters for
-       # the options. What's left is the list of files to resize.
-       @optparser.parse!
-     end
-
-     # Load global options first
-     # Merge it with local options (if they exists)
-     def load_config_file()
-       config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
-       puts "Inspecting #{config} ..."
-       if(File.exists?(config))
-         if(@options[:verbose])
-           puts "Loading config '#{config}'"
-         end
-
-         puts "Let's require '#{@options[:verbose]}'"
-         require config
-       else
-         if(@options[:verbose])
-           # TODO: Add support for initial rake task generation
-           # Something like this:
-           # rake config:init # Initializes config files with
-           # their defaults (if not exists already)
-           puts "Default config does not exist, skipping - '#{config}'"
-         end
-       end
-     end
-
-     # Register plugins (specific crawlers)
-     def register_plugins()
-       dir = File.join(File.dirname(__FILE__), "lib","plugins")
-       if(@options[:verbose])
-         puts "Registering plugins - '#{dir}'"
-       end
-
-       sites = File.join(dir, "**", "*.rb")
-       Dir.glob(sites).each do |site|
-         require site
-       end
-
-       tmp = Apollo::Crawler::Plugins.constants.select { |c|
-         Class === Apollo::Crawler::Plugins.const_get(c)
-       }
-
-       tmp.each do |x|
-         klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
-         @plugins.merge!({ x.downcase.to_s => klass})
-       end
-
-       if(@options[:verbose])
-         @plugins.each do |plugin, klass|
-           name = klass.new.class.name
-
-           if name == "Apollo::Crawler::Plugins::Plugin"
-             next
-           end
-
-           puts "Registered '#{plugin}' -> '#{name}'"
-         end
-       end
-     end
-
-     def run
-       init_options()
-
-       load_config_file()
-
-       parse_options()
-
-       # Register sites which can be crawled
-       register_plugins()
-
-       if(@options[:list_plugins])
-         puts "Listing plugins"
-         puts "----------------------------------------"
-         i = 0
-         @plugins.sort.each do |plugin, klass|
-           instance = klass.new
-           # puts klass.class_eval("@@NAME")
-           puts "(#{i}) #{plugin} - #{instance.name}"
-           i += 1
-         end
-         puts "----------------------------------------"
-         return
-       end
-
-       plugins = ARGV
-
-       if(@options[:run_all])
-         plugins = @plugins.keys
-       end
-
-       if(plugins.empty?)
-         puts @optparser
-       end
-
-       plugins.each do |plugin|
-         p = @plugins[plugin.downcase].new
-
-         # puts "Running '#{plugin}'"
-         puts JSON.pretty_generate(p.run)
-       end
-     end
-   end
- end
-
- if __FILE__ == $0
-   Crawler::Program.new.run()
- end