apollo-crawler 0.0.35 → 0.0.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/apollo-crawler +6 -1
- data/lib/apollo_crawler/plugin.rb +47 -4
- data/lib/apollo_crawler/plugin_template.rb +5 -14
- data/lib/apollo_crawler/plugins/alexa_com/alexa.rb +5 -13
- data/lib/apollo_crawler/plugins/firmy_cz/firmy.rb +5 -14
- data/lib/apollo_crawler/plugins/slashdot_org/slashdot.rb +5 -14
- data/lib/apollo_crawler/plugins/xkcd_com/xkcd.rb +5 -14
- data/lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb +5 -14
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +1 -1
    
        data/bin/apollo-crawler
    CHANGED
    
    
| @@ -11,6 +11,32 @@ module Apollo | |
| 11 11 | 
             
            					return "Plugin Base" 
         | 
| 12 12 | 
             
            				end
         | 
| 13 13 |  | 
| 14 | 
            +
            				def url
         | 
| 15 | 
            +
            					return nil
         | 
| 16 | 
            +
            				end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            				def etl()
         | 
| 19 | 
            +
            					#return run()
         | 
| 20 | 
            +
            					res = []
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            					if(self.url.nil?)
         | 
| 23 | 
            +
            						return nil
         | 
| 24 | 
            +
            					end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            					doc = self.fetch_document(self.url)
         | 
| 27 | 
            +
            					if(doc.nil?)
         | 
| 28 | 
            +
            						return nil
         | 
| 29 | 
            +
            					end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            					res = self.extract_data(doc)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            					return { 
         | 
| 34 | 
            +
            						:plugin => self.class.name,
         | 
| 35 | 
            +
            						:title => doc.title,
         | 
| 36 | 
            +
            						:data => res
         | 
| 37 | 
            +
            					}
         | 
| 38 | 
            +
            				end
         | 
| 39 | 
            +
             | 
| 14 40 | 
             
            				# - Fetch default URL (and transform it to document)
         | 
| 15 41 | 
             
            				# - Extract and Load (Store) important data
         | 
| 16 42 | 
             
            				# - Look for other documents
         | 
| @@ -24,12 +50,29 @@ module Apollo | |
| 24 50 | 
             
            					}
         | 
| 25 51 | 
             
            				end
         | 
| 26 52 |  | 
| 27 | 
            -
            				#  | 
| 28 | 
            -
            				def  | 
| 53 | 
            +
            				# Fetch document
         | 
| 54 | 
            +
            				def fetch_document(url)
         | 
| 55 | 
            +
            					ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            					if(self.url.nil?)
         | 
| 58 | 
            +
            						return nil
         | 
| 59 | 
            +
            					end
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
         | 
| 62 | 
            +
            					doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
         | 
| 63 | 
            +
            					return doc
         | 
| 64 | 
            +
            				end
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            				# Extracts data from document
         | 
| 67 | 
            +
            				def extract_data(doc)
         | 
| 68 | 
            +
            					res = []
         | 
| 69 | 
            +
            					return res
         | 
| 29 70 | 
             
            				end
         | 
| 30 71 |  | 
| 31 | 
            -
            				#  | 
| 32 | 
            -
            				def  | 
| 72 | 
            +
            				# Extract links to other documents from this document
         | 
| 73 | 
            +
            				def extract_links(doc)
         | 
| 74 | 
            +
            					res = []
         | 
| 75 | 
            +
            					return res
         | 
| 33 76 | 
             
            				end
         | 
| 34 77 | 
             
            			end
         | 
| 35 78 | 
             
            		end
         | 
| @@ -5,32 +5,23 @@ module Apollo | |
| 5 5 | 
             
            		module Plugins
         | 
| 6 6 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 7 7 | 
             
            			class PLUGIN_CLASS_NAME < Plugin
         | 
| 8 | 
            -
            				@@URL = "PLUGIN_URL"
         | 
| 9 | 
            -
             | 
| 10 8 | 
             
            				@@MATCHER_ITEM = "PLUGIN_MATCHER"
         | 
| 11 9 |  | 
| 12 10 | 
             
            				def name()
         | 
| 13 11 | 
             
            					return "PLUGIN_NAME"
         | 
| 14 12 | 
             
            				end
         | 
| 15 13 |  | 
| 16 | 
            -
            				def  | 
| 17 | 
            -
            					 | 
| 14 | 
            +
            				def url()
         | 
| 15 | 
            +
            					return "PLUGIN_URL"
         | 
| 16 | 
            +
            				end
         | 
| 18 17 |  | 
| 19 | 
            -
             | 
| 20 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 21 | 
            -
            					
         | 
| 18 | 
            +
            				def extract_data(doc)
         | 
| 22 19 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |i|
         | 
| 23 20 | 
             
            						{ 
         | 
| 24 21 | 
             
            							:text => i.text,
         | 
| 25 | 
            -
            							:link => URI.join( | 
| 22 | 
            +
            							:link => URI.join(self.url, i['href'])
         | 
| 26 23 | 
             
            						}
         | 
| 27 24 | 
             
            					}
         | 
| 28 | 
            -
             | 
| 29 | 
            -
            					return { 
         | 
| 30 | 
            -
            						:plugin => self.class.name,
         | 
| 31 | 
            -
            						:title => doc.title,
         | 
| 32 | 
            -
            						:res => res
         | 
| 33 | 
            -
            					}
         | 
| 34 25 | 
             
            				end
         | 
| 35 26 | 
             
            			end
         | 
| 36 27 | 
             
            		end # Plugins
         | 
| @@ -7,31 +7,23 @@ module Apollo | |
| 7 7 | 
             
            		module Plugins
         | 
| 8 8 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 9 9 | 
             
            			class Alexa < Plugin
         | 
| 10 | 
            -
            				@@URL = "http://www.alexa.com/"
         | 
| 11 | 
            -
             | 
| 12 10 | 
             
            				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
         | 
| 13 11 |  | 
| 14 12 | 
             
            				def name()
         | 
| 15 13 | 
             
            					return "Alexa Rank"
         | 
| 16 14 | 
             
            				end
         | 
| 17 15 |  | 
| 18 | 
            -
            				def  | 
| 19 | 
            -
            					 | 
| 16 | 
            +
            				def url()
         | 
| 17 | 
            +
            					return "http://www.alexa.com/"
         | 
| 18 | 
            +
            				end
         | 
| 20 19 |  | 
| 21 | 
            -
             | 
| 22 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 20 | 
            +
            				def extract_data(doc)
         | 
| 23 21 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |i|
         | 
| 24 22 | 
             
            						{ 
         | 
| 25 23 | 
             
            							:text => i.text,
         | 
| 26 | 
            -
            							:link => URI.join( | 
| 24 | 
            +
            							:link => URI.join(self.url, i['href'])
         | 
| 27 25 | 
             
            						}
         | 
| 28 26 | 
             
            					}
         | 
| 29 | 
            -
             | 
| 30 | 
            -
            					return { 
         | 
| 31 | 
            -
            						:plugin => self.class.name,
         | 
| 32 | 
            -
            						:title => doc.title,
         | 
| 33 | 
            -
            						:res => res
         | 
| 34 | 
            -
            					}
         | 
| 35 27 | 
             
            				end
         | 
| 36 28 | 
             
            			end
         | 
| 37 29 | 
             
            		end # Plugins
         | 
| @@ -7,32 +7,23 @@ module Apollo | |
| 7 7 | 
             
            		module Plugins
         | 
| 8 8 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 9 9 | 
             
            			class Firmy < Plugin
         | 
| 10 | 
            -
            				@@URL = "http://www.firmy.cz/"
         | 
| 11 | 
            -
             | 
| 12 10 | 
             
            				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
         | 
| 13 11 |  | 
| 14 12 | 
             
            				def name()
         | 
| 15 13 | 
             
            					return "Firmy.cz"
         | 
| 16 14 | 
             
            				end
         | 
| 17 15 |  | 
| 18 | 
            -
            				def  | 
| 19 | 
            -
            					 | 
| 20 | 
            -
             | 
| 21 | 
            -
            					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
         | 
| 22 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 16 | 
            +
            				def url()
         | 
| 17 | 
            +
            					return "http://www.firmy.cz/"
         | 
| 18 | 
            +
            				end
         | 
| 23 19 |  | 
| 20 | 
            +
            				def extract_data(doc)
         | 
| 24 21 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |i|
         | 
| 25 22 | 
             
            						{ 
         | 
| 26 23 | 
             
            							:text => i.text,
         | 
| 27 | 
            -
            							:link => URI.join( | 
| 24 | 
            +
            							:link => URI.join(self.url, i['href'])
         | 
| 28 25 | 
             
            						}
         | 
| 29 26 | 
             
            					}
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            					return { 
         | 
| 32 | 
            -
            						:plugin => self.class.name,
         | 
| 33 | 
            -
            						:title => doc.title,
         | 
| 34 | 
            -
            						:res => res
         | 
| 35 | 
            -
            					}
         | 
| 36 27 | 
             
            				end
         | 
| 37 28 | 
             
            			end
         | 
| 38 29 | 
             
            		end # Plugins
         | 
| @@ -7,32 +7,23 @@ module Apollo | |
| 7 7 | 
             
            		module Plugins
         | 
| 8 8 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 9 9 | 
             
            			class Slashdot < Plugin
         | 
| 10 | 
            -
            				@@URL = "http://slashdot.org/"
         | 
| 11 | 
            -
             | 
| 12 10 | 
             
            				@@MATCHER_ITEM = "//article/header/h2/span/a"
         | 
| 13 11 |  | 
| 14 12 | 
             
            				def name
         | 
| 15 13 | 
             
            					return "Slashdot"
         | 
| 16 14 | 
             
            				end
         | 
| 17 15 |  | 
| 18 | 
            -
            				def  | 
| 19 | 
            -
            					 | 
| 20 | 
            -
             | 
| 21 | 
            -
            					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
         | 
| 22 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 16 | 
            +
            				def url()
         | 
| 17 | 
            +
            					return"http://slashdot.org/"
         | 
| 18 | 
            +
            				end
         | 
| 23 19 |  | 
| 20 | 
            +
            				def extract_data(doc)
         | 
| 24 21 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |i|
         | 
| 25 22 | 
             
            						{ 
         | 
| 26 23 | 
             
            							:text => i.text,
         | 
| 27 | 
            -
            							:link => URI.join( | 
| 24 | 
            +
            							:link => URI.join(self.url, i['href'])
         | 
| 28 25 | 
             
            						}
         | 
| 29 26 | 
             
            					}
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            					return { 
         | 
| 32 | 
            -
            						:plugin => self.class.name,
         | 
| 33 | 
            -
            						:title => doc.title,
         | 
| 34 | 
            -
            						:res => res
         | 
| 35 | 
            -
            					}
         | 
| 36 27 | 
             
            				end
         | 
| 37 28 | 
             
            			end
         | 
| 38 29 | 
             
            		end # Plugins
         | 
| @@ -7,32 +7,23 @@ module Apollo | |
| 7 7 | 
             
            		module Plugins
         | 
| 8 8 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 9 9 | 
             
            			class Xkcd < Plugin
         | 
| 10 | 
            -
            				@@URL = "http://xkcd.com/"
         | 
| 11 | 
            -
             | 
| 12 10 | 
             
            				@@MATCHER_ITEM = "//div[@id = 'comic']/img"
         | 
| 13 11 |  | 
| 14 12 | 
             
            				def name()
         | 
| 15 13 | 
             
            					return "Xkcd"
         | 
| 16 14 | 
             
            				end
         | 
| 17 15 |  | 
| 18 | 
            -
            				def  | 
| 19 | 
            -
            					 | 
| 16 | 
            +
            				def url()
         | 
| 17 | 
            +
            					return "http://xkcd.com/"
         | 
| 18 | 
            +
            				end
         | 
| 20 19 |  | 
| 21 | 
            -
             | 
| 22 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 23 | 
            -
            					
         | 
| 20 | 
            +
            				def extract_data(doc)
         | 
| 24 21 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |node|
         | 
| 25 22 | 
             
            						{ 
         | 
| 26 23 | 
             
            							:text => node['title'],
         | 
| 27 | 
            -
            							:link => URI.join( | 
| 24 | 
            +
            							:link => URI.join(self.url, node['src'])
         | 
| 28 25 | 
             
            						}
         | 
| 29 26 | 
             
            					}
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            					return { 
         | 
| 32 | 
            -
            						:plugin => self.class.name,
         | 
| 33 | 
            -
            						:title => doc.title,
         | 
| 34 | 
            -
            						:res => res
         | 
| 35 | 
            -
            					}
         | 
| 36 27 | 
             
            				end
         | 
| 37 28 | 
             
            			end
         | 
| 38 29 | 
             
            		end # Plugins
         | 
| @@ -7,32 +7,23 @@ module Apollo | |
| 7 7 | 
             
            		module Plugins
         | 
| 8 8 | 
             
            			# PARAMETRIZE: Plugin class name
         | 
| 9 9 | 
             
            			class HackerNews < Plugin
         | 
| 10 | 
            -
            				@@URL = "http://news.ycombinator.com/"
         | 
| 11 | 
            -
             | 
| 12 10 | 
             
            				@@MATCHER_ITEM = "//td[@class = 'title']/a"
         | 
| 13 11 |  | 
| 14 12 | 
             
            				def name
         | 
| 15 13 | 
             
            					return "Hacker News"
         | 
| 16 14 | 
             
            				end
         | 
| 17 15 |  | 
| 18 | 
            -
            				def  | 
| 19 | 
            -
            					 | 
| 20 | 
            -
             | 
| 21 | 
            -
            					# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
         | 
| 22 | 
            -
            					doc = Nokogiri::HTML(ic.iconv(open(@@URL).read))
         | 
| 16 | 
            +
            				def url()
         | 
| 17 | 
            +
            					return "http://news.ycombinator.com/"
         | 
| 18 | 
            +
            				end
         | 
| 23 19 |  | 
| 20 | 
            +
            				def extract_data(doc)
         | 
| 24 21 | 
             
            					res = doc.xpath(@@MATCHER_ITEM).map { |i|
         | 
| 25 22 | 
             
            						{ 
         | 
| 26 23 | 
             
            							:text => i.text,
         | 
| 27 | 
            -
            							:link => URI.join( | 
| 24 | 
            +
            							:link => URI.join(self.url, i['href'])
         | 
| 28 25 | 
             
            						}
         | 
| 29 26 | 
             
            					}
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            					return { 
         | 
| 32 | 
            -
            						:plugin => self.class.name,
         | 
| 33 | 
            -
            						:title => doc.title,
         | 
| 34 | 
            -
            						:res => res
         | 
| 35 | 
            -
            					}
         | 
| 36 27 | 
             
            				end
         | 
| 37 28 | 
             
            			end
         | 
| 38 29 | 
             
            		end # Plugins
         |