RubyGems - apollo-crawler - Versions diffs - 0.0.35 → 0.0.36 - Mend

apollo-crawler 0.0.35 → 0.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/bin/apollo-crawler +6 -1
data/lib/apollo_crawler/plugin.rb +47 -4
data/lib/apollo_crawler/plugin_template.rb +5 -14
data/lib/apollo_crawler/plugins/alexa_com/alexa.rb +5 -13
data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb +5 -14
data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb +5 -14
data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb +5 -14
data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb +5 -14
data/lib/apollo_crawler/version.rb +1 -1
metadata +1 -1

data/bin/apollo-crawler CHANGED

@@ -251,7 +251,12 @@ module Crawler
 				end
 				# puts "Running '#{plugin}'"
-				puts JSON.pretty_generate(p.new.run)
+				res = p.new.etl
+				if(res.nil?)
+					next
+				end
+				puts JSON.pretty_generate(res)
 			end
 		end
 	end

data/lib/apollo_crawler/plugin.rb CHANGED

@@ -11,6 +11,32 @@ module Apollo
 					return "Plugin Base"
 				end
+				def url
+					return nil
+				end
+				def etl()
+					#return run()
+					res = []
+					if(self.url.nil?)
+						return nil
+					end
+					doc = self.fetch_document(self.url)
+					if(doc.nil?)
+						return nil
+					end
+					res = self.extract_data(doc)
+					return {
+						:plugin => self.class.name,
+						:title => doc.title,
+						:data => res
+					}
+				end
 				# - Fetch default URL (and transform it to document)
 				# - Extract and Load (Store) important data
 				# - Look for another documents
@@ -24,12 +50,29 @@ module Apollo
 					}
 				end
-				# Extracts data from  currently processed URL (called document here)
-				def extract_doc_data
+				# Fetch document
+				def fetch_document(url)
+					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+					if(self.url.nil?)
+						return nil
+					end
+					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+					doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
+					return doc
+				end
+				# Extracts data from document
+				def extract_data(doc)
+					res = []
+					return res
 				end
-				# This function tries to get links of another URLs (called leaf here) to crawl
-				def fetch_leafs
+				# Extract links to another documents from this document
+				def extract_links(doc)
+					res = []
+					return res
 				end
 			end
 		end

data/lib/apollo_crawler/plugin_template.rb CHANGED

@@ -5,32 +5,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class PLUGIN_CLASS_NAME < Plugin
-				@@URL = "PLUGIN_URL"
 				@@MATCHER_ITEM = "PLUGIN_MATCHER"
 				def name()
 					return "PLUGIN_NAME"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "PLUGIN_URL"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/alexa_com/alexa.rb CHANGED

@@ -7,31 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Alexa < Plugin
-				@@URL = "http://www.alexa.com/"
 				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 				def name()
 					return "Alexa Rank"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "http://www.alexa.com/"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Firmy < Plugin
-				@@URL = "http://www.firmy.cz/"
 				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
 				def name()
 					return "Firmy.cz"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return "http://www.firmy.cz/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Slashdot < Plugin
-				@@URL = "http://slashdot.org/"
 				@@MATCHER_ITEM = "//article/header/h2/span/a"
 				def name
 					return "Slashdot"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return"http://slashdot.org/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class Xkcd < Plugin
-				@@URL = "http://xkcd.com/"
 				@@MATCHER_ITEM = "//div[@id = 'comic']/img"
 				def name()
 					return "Xkcd"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
+				def url()
+					return "http://xkcd.com/"
+				end
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |node|
 						{
 							:text => node['title'],
-							:link => URI.join(@@URL, node['src'])
+							:link => URI.join(self.url, node['src'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb CHANGED

@@ -7,32 +7,23 @@ module Apollo
 		module Plugins
 			# PARAMATRIZE: Plugin class name
 			class HackerNews < Plugin
-				@@URL = "http://news.ycombinator.com/"
 				@@MATCHER_ITEM = "//td[@class = 'title']/a"
 				def name
 					return "Hacker News"
 				end
-				def run()
-					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
-					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
+				def url()
+					return "http://news.ycombinator.com/"
+				end
+				def extract_data(doc)
 					res = doc.xpath(@@MATCHER_ITEM).map { |i|
 						{
 							:text => i.text,
-							:link => URI.join(@@URL, i['href'])
+							:link => URI.join(self.url, i['href'])
 						}
 					}
-					return {
-						:plugin => self.class.name,
-						:title => doc.title,
-						:res => res
-					}
 				end
 			end
 		end # Plugins

data/lib/apollo_crawler/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Apollo
 	module Crawler
-		VERSION = '0.0.35'
+		VERSION = '0.0.36'
 	end # Crawler
 end # Apollo

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: apollo-crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.35
+  version: 0.0.36
   prerelease:
 platform: ruby
 authors: