RubyGems - apollo-crawler - Versions diffs - 0.0.35 → 0.0.36 - Mend

apollo-crawler 0.0.35 → 0.0.36

Files changed (10)

data/bin/apollo-crawler +6 -1
data/lib/apollo_crawler/plugin.rb +47 -4
data/lib/apollo_crawler/plugin_template.rb +5 -14
data/lib/apollo_crawler/plugins/alexa_com/alexa.rb +5 -13
data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb +5 -14
data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb +5 -14
data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb +5 -14
data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb +5 -14
data/lib/apollo_crawler/version.rb +1 -1
metadata +1 -1

@@ -251,7 +251,12 @@ module Crawler
 				end
 				# puts "Running '#{plugin}'"
-				puts JSON.pretty_generate(p.new.run)
+				res = p.new.etl
+				if(res.nil?)
+					next
+				end
+				puts JSON.pretty_generate(res)
 			end
 		end
 	end

data/lib/apollo_crawler/plugin.rb CHANGED

@@ -11,6 +11,32 @@ module Apollo
 					return "Plugin Base"
 				end
+				def url
+					return nil
+				end
+				def etl()
+					#return run()
+					res = []
+					if(self.url.nil?)
+						return nil
+					end
+					doc = self.fetch_document(self.url)
+					if(doc.nil?)
+						return nil
+					end
+					res = self.extract_data(doc)
+					return {
+						:plugin => self.class.name,
+						:title => doc.title,
+						:data => res
+					}
+				end
 				# - Fetch default URL (and transform it to document)
 				# - Extract and Load (Store) important data
 				# - Look for another documents
@@ -24,12 +50,29 @@ module Apollo
 					}
 				end
-				# Extracts data from  currently processed URL (called document here)
-				def extract_doc_data
+				# Fetch document
+				def fetch_document(url)
+					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+					if(self.url.nil?)
+						return nil
+					end
+					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+					doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
+					return doc
+				end
+				# Extracts data from document
+				def extract_data(doc)
+					res = []
+					return res
 				end
-				# This function tries to get links of another URLs (called leaf here) to crawl
-				def fetch_leafs
+				# Extract links to another documents from this document
+				def extract_links(doc)
+					res = []
+					return res
 				end
 			end
 		end

data/lib/apollo_crawler/plugin_template.rb CHANGED

@@ -5,32 +5,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class PLUGIN_CLASS_NAME < Plugin
-				@@URL = "PLUGIN_URL"
 				@@MATCHER_ITEM = "PLUGIN_MATCHER"
 				def name()
 					return "PLUGIN_NAME"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "PLUGIN_URL"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/alexa_com/alexa.rb CHANGED

@@ -7,31 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Alexa < Plugin
-				@@URL = "http://www.alexa.com/"
 				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 				def name()
 					return "Alexa Rank"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "http://www.alexa.com/"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Firmy < Plugin
-				@@URL = "http://www.firmy.cz/"
 				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 				def name()
 					return "Firmy.cz"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return "http://www.firmy.cz/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Slashdot < Plugin
-				@@URL = "http://slashdot.org/"
 				@@MATCHER_ITEM = "//article/header/h2/span/a"
 				def name
 					return "Slashdot"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return"http://slashdot.org/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Xkcd < Plugin
-				@@URL = "http://xkcd.com/"
 				@@MATCHER_ITEM = "//div[@id = 'comic']/img"
 				def name()
 					return "Xkcd"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "http://xkcd.com/"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |node|
 						{
 							:text => node['title'],
-							:link => URI.join(@@URL, node['src'])
+							:link => URI.join(self.url, node['src'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class HackerNews < Plugin
-				@@URL = "http://news.ycombinator.com/"
 				@@MATCHER_ITEM = "//td[@class = 'title']/a"
 				def name
 					return "Hacker News"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return "http://news.ycombinator.com/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Apollo
 	module Crawler
-		VERSION = '0.0.35'
+		VERSION = '0.0.36'
 	end # Crawler
 end # Apollo

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.35
+  version: 0.0.36
   prerelease:
 platform: ruby
 authors: