apollo-crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/crawler.rb +0 -0
- data/lib/plugin.rb +37 -0
- data/lib/plugins/alexa_com/alexa.rb +36 -0
- data/lib/plugins/firmy_cz/firmy.rb +35 -0
- data/lib/plugins/slashdot_org/slashdot.rb +35 -0
- data/lib/plugins/ycombinator_com/hacker_news.rb +35 -0
- data/main.rb +170 -0
- metadata +51 -0
data/lib/crawler.rb
ADDED
File without changes
data/lib/plugin.rb
ADDED
@@ -0,0 +1,37 @@
require "open-uri"
require "nokogiri"

module Apollo
  module Crawler
    module Plugins
      class Plugin

        # Name of the plugin, used in docs, lookups, etc ...
        def name
          return "Plugin Base"
        end

        # - Fetch default URL (and transform it to document)
        # - Extract and Load (Store) important data
        # - Look for another documents
        #   Examples:
        #   - "next page"
        #   - "people you may know on Linked in"
        #   - "will attend on FB"
        def run
          return {
            :plugin => self.class.name
          }
        end

        # Extracts data from currently processed URL (called document here)
        def extract_doc_data
        end

        # This function tries to get links of another URLs (called leaf here) to crawl
        def fetch_leafs
        end
      end
    end
  end
end
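
A minimal sketch (not part of this release) of how a subclass might fill in the extract_doc_data / fetch_leafs hooks, which the base class leaves empty and which none of the bundled plugins override. The Example class, its URL, its XPath expressions, and the doc parameter are all assumptions for illustration; the released base class declares both hooks without parameters. It assumes the file sits in lib/ next to plugin.rb.

# Hypothetical plugin -- illustration only, not shipped in this gem.
require File.join(File.dirname(__FILE__), 'plugin')

module Apollo
  module Crawler
    module Plugins
      class Example < Plugin
        URL = "http://example.com/"

        def name
          return "Example"
        end

        def run
          doc = Nokogiri::HTML(open(URL))
          {
            :plugin => self.class.name,
            :data => extract_doc_data(doc),
            :leafs => fetch_leafs(doc)
          }
        end

        # Pull the headline text out of the current document
        def extract_doc_data(doc)
          doc.xpath("//h1").map { |i| i.text }
        end

        # Collect "next page"-style links to crawl next
        def fetch_leafs(doc)
          doc.xpath("//a[@rel = 'next']").map { |i| URI.join(URL, i['href']) }
        end
      end
    end
  end
end
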
data/lib/plugins/alexa_com/alexa.rb
ADDED
@@ -0,0 +1,36 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Alexa < Plugin
        @@URL = "http://www.alexa.com/"

        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

        def name()
          return "Alexa Rank"
        end

        def run()
          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/firmy_cz/firmy.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Firmy < Plugin
        @@URL = "http://www.firmy.cz/"

        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

        def name()
          return "Firmy.cz"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/slashdot_org/slashdot.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Slashdot < Plugin
        @@URL = "http://slashdot.org/"

        @@MATCHER_ITEM = "//article/header/h2/span/a"

        def name
          return "Slashdot"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/ycombinator_com/hacker_news.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class HackerNews < Plugin
        @@URL = "http://news.ycombinator.com/"

        @@MATCHER_ITEM = "//td[@class = 'title']/a"

        def name
          return "Hacker News"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
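
The four plugins above share an identical run() body and differ only in @@URL, @@MATCHER_ITEM, and name. A possible refactoring (a sketch under that observation, not code from this release; the url and matcher accessors are hypothetical) would hoist the shared logic into the Plugin base class:

# Sketch only -- deduplicating run() across plugins.
require "open-uri"
require "nokogiri"

module Apollo
  module Crawler
    module Plugins
      class Plugin
        # Subclasses supply the page to fetch and the XPath for items
        def url; raise NotImplementedError; end
        def matcher; raise NotImplementedError; end

        def run
          doc = Nokogiri::HTML(open(url))
          res = doc.xpath(matcher).map { |i|
            { :text => i.text, :link => URI.join(url, i['href']) }
          }
          { :plugin => self.class.name, :title => doc.title, :res => res }
        end
      end

      # Each concrete plugin then shrinks to three one-liners
      class HackerNews < Plugin
        def url; "http://news.ycombinator.com/"; end
        def matcher; "//td[@class = 'title']/a"; end
        def name; "Hacker News"; end
      end
    end
  end
end
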
data/main.rb
ADDED
@@ -0,0 +1,170 @@
#! /usr/bin/env ruby

require "rubygems"
require "bundler/setup"

require 'json'

require "thor"

require "open-uri"
require "nokogiri"

require "pp"
require "optparse"

module Crawler
  class Program
    # This hash will hold all of the options
    # parsed from the command-line by
    # OptionParser.
    @options = nil
    @optparser = nil
    @plugins = nil

    # Initializer - Constructor
    def initialize
      @plugins = {}
    end

    # Initialize command-line options
    def init_options
      @options = {}
      @options[:verbose] = false

      @optparser = OptionParser.new do | opts |
        # This displays the help screen, all programs are
        # assumed to have this option.
        opts.on('-h', '--help', 'Display this screen') do
          puts opts
          exit
        end

        opts.on('-a', '--all', 'Run all plugins') do
          @options[:run_all] = true
        end

        opts.on('-v', '--verbose', 'Enable verbose output') do
          @options[:verbose] = true
        end

        opts.on('-l', '--list-plugins', 'List of plugins') do
          @options[:list_plugins] = true
        end
      end
    end

    # Parse the options passed to command-line
    def parse_options
      # Parse the command-line. Remember there are two forms
      # of the parse method. The 'parse' method simply parses
      # ARGV, while the 'parse!' method parses ARGV and removes
      # any options found there, as well as any parameters for
      # the options. What's left is the list of files to resize.
      @optparser.parse!
    end

    # Load global options first
    # Merge it with local options (if they exist)
    def load_config_file()
      config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
      puts "Inspecting #{config} ..."
      if(File.exists?(config))
        if(@options[:verbose])
          puts "Loading config '#{config}'"
        end

        puts "Let's require '#{@options[:verbose]}'"
        require config
      else
        if(@options[:verbose])
          # TODO: Add support for initial rake task generation
          # Something like this:
          # rake config:init # Initializes config files with
          #                  # their defaults (if not exists already)
          puts "Default config does not exist, skipping - '#{config}'"
        end
      end
    end

    # Register plugins (specific crawlers)
    def register_plugins()
      dir = File.join(File.dirname(__FILE__), "lib", "plugins")
      if(@options[:verbose])
        puts "Registering plugins - '#{dir}'"
      end

      sites = File.join(dir, "**", "*.rb")
      Dir.glob(sites).each do |site|
        require site
      end

      tmp = Apollo::Crawler::Plugins.constants.select { |c|
        Class === Apollo::Crawler::Plugins.const_get(c)
      }

      tmp.each do |x|
        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
        @plugins.merge!({ x.downcase.to_s => klass})
      end

      if(@options[:verbose])
        @plugins.each do |plugin, klass|
          name = klass.new.class.name

          if name == "Apollo::Crawler::Plugins::Plugin"
            next
          end

          puts "Registered '#{plugin}' -> '#{name}'"
        end
      end
    end

    def run
      init_options()

      load_config_file()

      parse_options()

      # Register sites which can be crawled
      register_plugins()

      if(@options[:list_plugins])
        puts "Listing plugins"
        puts "----------------------------------------"
        i = 0
        @plugins.sort.each do |plugin, klass|
          instance = klass.new
          # puts klass.class_eval("@@NAME")
          puts "(#{i}) #{plugin} - #{instance.name}"
          i += 1
        end
        puts "----------------------------------------"
        return
      end

      plugins = ARGV

      if(@options[:run_all])
        plugins = @plugins.keys
      end

      if(plugins.empty?)
        puts @optparser
      end

      plugins.each do |plugin|
        p = @plugins[plugin.downcase].new

        # puts "Running '#{plugin}'"
        puts JSON.pretty_generate(p.run)
      end
    end
  end
end

if __FILE__ == $0
  Crawler::Program.new.run()
end
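
register_plugins keys each plugin by its downcased constant name, so the CLI names are "alexa", "firmy", "slashdot", and "hackernews" (plus "plugin" for the base class), invoked as e.g. "ruby main.rb hackernews", "ruby main.rb -l", or "ruby main.rb --all". A short sketch of driving a plugin directly, mirroring what main.rb does per ARGV entry (an illustration assuming the script sits next to main.rb in the gem root; not shipped code):

# Sketch -- run one plugin without going through the CLI.
require "json"
require File.join(File.dirname(__FILE__), "lib", "plugins", "ycombinator_com", "hacker_news")

plugin = Apollo::Crawler::Plugins::HackerNews.new
puts plugin.name                      # => "Hacker News"
puts JSON.pretty_generate(plugin.run) # fetches the page and dumps extracted links
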
metadata
ADDED
@@ -0,0 +1,51 @@
--- !ruby/object:Gem::Specification
name: apollo-crawler
version: !ruby/object:Gem::Version
  version: 0.0.1
prerelease:
platform: ruby
authors:
- Tomas Korcak
autorequire:
bindir: bin
cert_chain: []
date: 2013-02-23 00:00:00.000000000 Z
dependencies: []
description: Gem for crawling data from external resources
email: korczis@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ./main.rb
- ./lib/crawler.rb
- ./lib/plugins/slashdot_org/slashdot.rb
- ./lib/plugins/firmy_cz/firmy.rb
- ./lib/plugins/alexa_com/alexa.rb
- ./lib/plugins/ycombinator_com/hacker_news.rb
- ./lib/plugin.rb
homepage: https://github.com/korczis/apollo-crawler
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.23
signing_key:
specification_version: 3
summary: Apollo Platform Crawler
test_files: []
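
Note that the gemspec declares dependencies: [], yet main.rb requires bundler/setup, thor, and nokogiri at boot, so those gems must be provided out of band. A Gemfile along these lines would be needed (an assumption -- no Gemfile appears in the release's file list):

# Hypothetical Gemfile -- not included in this release.
source "https://rubygems.org"

gem "nokogiri"
gem "thor"
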