rubyretriever 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +3 -2
- data/lib/retriever/fetch.rb +22 -62
- data/lib/retriever/fetchfiles.rb +5 -4
- data/lib/retriever/fetchsitemap.rb +7 -7
- data/lib/retriever/link.rb +29 -0
- data/lib/retriever/target.rb +41 -0
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +7 -16
- data/readme.md +3 -2
- data/spec/link_spec.rb +66 -0
- data/spec/retriever_spec.rb +9 -22
- data/spec/target_spec.rb +39 -0
- metadata +5 -1
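
The headline change in 0.1.4 is a refactor: page fetching (the old EM-synchrony based Fetch#fetchPage) moves into a new Retriever::Target class built on open-uri, and href normalization moves into a new Retriever::Link class, with Retriever::Fetch holding a Target in @t. A rough sketch of how the pieces compose after this release (illustrative only; the URLs are the placeholders used in the gem's own specs, and the variable names are mine, not the gem's):

    require 'retriever'

    # Sketch of the 0.1.4 object model, based on the diffs below.
    target = Retriever::Target.new("www.cnet.com")       # adds the missing http:// prefix
    page   = target.source                               # page body fetched via open-uri
    fetch  = Retriever::Fetch.new("http://www.cnet.com/reviews/", {:maxpages => 10})
    links  = fetch.fetchLinks(page)                       # hrefs normalized through Retriever::Link
    internal = fetch.parseInternalVisitableLinks(links)   # same-host links, minus css/js/images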
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
+  data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
+  data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
data/bin/rr
CHANGED
@@ -1,5 +1,6 @@
 #! /usr/bin/env ruby
 require 'retriever'
+require 'optparse'
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
@@ -61,8 +62,8 @@ ARGV.each do|q|
   end
   puts "###############################"
   puts "### [RubyRetriever] go fetch #{q}"
-
-
+  Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
   puts "### [RubyRetriever] is done."
   puts "###############################"
   puts
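
The two added lines are the CLI's dispatch: each URL passed to rr goes to FetchFiles when file harvesting is requested and to FetchSitemap when a sitemap is requested. A minimal programmatic equivalent, assuming the option keys that appear elsewhere in this diff (the real hash is assembled by the OptionParser block earlier in the script, which this diff does not show):

    require 'retriever'

    url = "http://www.cnet.com/"
    options = { :sitemap => "CSV", :maxpages => 100 }    # illustrative values

    Retriever::FetchFiles.new(url, options)   if options[:fileharvest]
    Retriever::FetchSitemap.new(url, options) if options[:sitemap]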
data/lib/retriever/fetch.rb
CHANGED
@@ -1,19 +1,20 @@
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'csv'
+require 'bloomfilter-rb'
+
 module Retriever
   class Fetch
-    attr_reader :
+    attr_reader :maxPages, :t
     #constants
-    HTTP_RE = Regexp.new(/^http/i).freeze
     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
-    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
-    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
-    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(url,options)
-
-      @target = new_uri.to_s
-      @host = new_uri.host
+      @t = Retriever::Target.new(url)
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -24,14 +25,13 @@ module Retriever
       @s = options[:sitemap] ? options[:sitemap] : false
       @autodown = options[:autodown] ? true : false
       #
-      @host_re = Regexp.new(host).freeze
       if @fh
         tempExtStr = "."+@file_ext+'\z'
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
         if !@output
-          @output = "rr-#{@host.split('.')[1]}"
+          @output = "rr-#{@t.host.split('.')[1]}"
         end
       end
       if @prgrss
@@ -45,7 +45,7 @@ module Retriever
         @progressbar = ProgressBar.create(prgressVars)
       end
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-      @already_crawled.insert(@target)
+      @already_crawled.insert(@t.target)
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -56,10 +56,10 @@ module Retriever
     def dump(data)
       puts "###############################"
       if @s
-        puts "#{@target} Sitemap"
+        puts "#{@t.target} Sitemap"
         puts "Page Count: #{data.size}"
       elsif @fh
-        puts "Target URL: #{@target}"
+        puts "Target URL: #{@t.target}"
         puts "Filetype: #{@file_ext}"
         puts "File Count: #{data.size}"
       else
@@ -84,58 +84,20 @@ module Retriever
         puts
       end
     end
-    def fetchPage(url)
-      resp = false
-      EM.synchrony do
-        begin
-          resp = EventMachine::HttpRequest.new(url).get
-        rescue StandardError => e
-          #puts e.message + " ## " + url
-          #the trap abrt is nescessary to handle the SSL error
-          #for some ungodly reason it's the only way I found to handle it
-          trap("ABRT"){
-            puts "#{url} failed SSL Certification Verification"
-          }
-          return false
-        end
-        lg("URL Crawled: #{url}")
-        EventMachine.stop
-      end
-      if resp.response == ""
-        errlog("Domain is not working. Try the non-WWW version.")
-      end
-      return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
-    end
     #recieves page source as string
     #returns array of unique href links
     def fetchLinks(doc)
       return false if !doc
-
-
-      link
-
-      if (DUB_DUB_DUB_DOT_RE =~ link)
-        link = "http://#{link}"
-      elsif SINGLE_SLASH_RE =~ link #link uses relative path
-        link = "http://#{@host}"+link #appending hostname to relative paths
-      elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
-        link = "http:#{link}" #appending current url to relative paths
-      elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
-        link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
-      else
-        next
-      end
-      end
-      linkArray.push(link)
-      end
-      linkArray.uniq!
+      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
     end
     def parseInternalLinks(all_links)
-
-
-
-
-    end
+      all_links.select{ |linky| (@t.host_re =~ linky) }
+    end
+    def parseInternalVisitableLinks(all_links)
+      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
     end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
@@ -147,8 +109,6 @@ module Retriever
         end
         break;
       end
-      #puts "New loop"
-      #puts @linkStack
       new_links_arr = self.asyncGetWave()
       next if (new_links_arr.nil? || new_links_arr.empty?)
       new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
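
With this change, fetchLinks just scans the source with HREF_CONTENTS_RE and lets Retriever::Link build absolute URLs, while parseInternalLinks and the new parseInternalVisitableLinks are plain select filters over the result. A short sketch of the resulting behaviour, reusing the host and constructor arguments from the gem's specs (the HTML string is illustrative):

    require 'retriever'

    r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {:file_ext => "exe", :maxpages => "100"})
    html = "<a href='/test.html'>test</a> <a href='http://www.google.com/'>g</a>"

    links = r.fetchLinks(html)
    # => ["http://www.cnet.com/test.html", "http://www.google.com/"]
    r.parseInternalLinks(links)           # keeps only links matching the target host
    r.parseInternalVisitableLinks(links)  # additionally drops css/js/images and other non-page assets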
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -4,16 +4,17 @@ module Retriever
     def initialize(url,options)
       super
       @fileStack = []
-      all_links = self.fetchLinks(
-      @linkStack = self.
+      all_links = self.fetchLinks(@t.source)
+      @linkStack = self.parseInternalVisitableLinks(all_links)
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")

       tempFileCollection = self.parseFiles(all_links)
       @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
       self.lg("#{@fileStack.size} new files found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)

       self.async_crawl_and_collect()
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -3,12 +3,13 @@ module Retriever
     attr_reader :sitemap
     def initialize(url,options)
       super
-      @sitemap = [@target]
-      @linkStack = self.
+      @sitemap = [@t.target]
+      @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
       @sitemap.concat(@linkStack)

@@ -16,14 +17,13 @@ module Retriever

       @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
       @sitemap.uniq!
-      @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

       self.dump(self.sitemap)
       self.write(self.sitemap) if /CSV/i =~ @s
       self.gen_xml(self.sitemap) if /XML/i =~ @s
     end
     def gen_xml(data)
-      f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+      f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
       data.each do |url|
         f << "<url><loc>#{url}</loc></url>"
@@ -31,7 +31,7 @@ module Retriever
       f << "</urlset>"
       f.close
       puts "###############################"
-      puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
       puts "Object Count: #{@sitemap.size}"
       puts "###############################"
       puts
data/lib/retriever/link.rb
ADDED
@@ -0,0 +1,29 @@
+module Retriever
+  class Link
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(host, link)
+      @host = host
+      @link = link
+    end
+
+    def path
+      return link if HTTP_RE =~ link
+
+      return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
+
+      return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
+
+      return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+
+      return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+    end
+
+    private
+    attr_reader :host, :link
+  end
+end
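
Link#path is a straight port of the old branching in Fetch#fetchLinks: given the crawl host and a raw href it returns an absolute URL, or nil when no pattern matches. A quick sketch of each branch (hostnames are just examples):

    require 'retriever'

    host = "www.cnet.com"
    Retriever::Link.new(host, "http://www.cnet.com/reviews/").path  # => "http://www.cnet.com/reviews/"
    Retriever::Link.new(host, "www.cnet.com/download.exe").path     # => "http://www.cnet.com/download.exe"
    Retriever::Link.new(host, "/test.html").path                    # => "http://www.cnet.com/test.html"
    Retriever::Link.new(host, "//cdn.cnet.com/lib.js").path         # => "http://cdn.cnet.com/lib.js"
    Retriever::Link.new(host, "cpage_18").path                      # => "http://www.cnet.com/cpage_18"
    Retriever::Link.new(host, "mailto:joe@example.com").path        # => nil (no pattern matches)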
data/lib/retriever/target.rb
ADDED
@@ -0,0 +1,41 @@
+require 'open-uri'
+
+module Retriever
+  class Target
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    attr_reader :host, :target, :host_re, :source
+    def initialize(url)
+      url = "http://#{url}" if (!(HTTP_RE =~ url))
+      fail "Bad URL" if (!(/\./ =~ url))
+      new_uri = URI(url)
+      @target = new_uri.to_s
+      @host = new_uri.host
+      @host_re = Regexp.new(@host).freeze
+    end
+
+    def source
+      resp = false
+      begin
+        resp = open(@target)
+      rescue StandardError => e
+        #puts e.message + " ## " + url
+        #the trap abrt is nescessary to handle the SSL error
+        #for some ungodly reason it's the only way I found to handle it
+        trap("ABRT"){
+          puts "#{@target} failed SSL Certification Verification"
+        }
+        return false
+      end
+      if (@target != resp.base_uri.to_s)
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+      end
+      resp = resp.read
+      if resp == ""
+        fail "Domain is not working. Try the non-WWW version."
+      end
+      return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+    end
+
+  end
+end
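
Target wraps URL validation and page fetching: the constructor normalizes the URL and derives host and host_re, and #source reads the page over open-uri, returning false on network or SSL errors and raising when the page is empty or redirects to a different host. A brief sketch mirroring the expectations in spec/target_spec.rb:

    require 'retriever'

    t = Retriever::Target.new("cnet.com")
    t.target    # => "http://cnet.com"  (protocol added when missing)
    t.host      # => "cnet.com"
    t.host_re   # => /cnet.com/

    Retriever::Target.new("cnetcom")                      # raises "Bad URL" (no dot in the URL)
    Retriever::Target.new("http://www.cnet.com/").source  # => page body as a UTF-8 String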
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,18 +1,9 @@
-##################################################################
-#####RubyRetriever -- web crawler and file harvester
-#####created by Joe Norton
-#####http://softwarebyjoe.com
-##LICENSING: GNU GPLv3 License##################################
-#! usr/bin/ruby
-require 'em-synchrony'
-require 'em-synchrony/em-http'
-require 'em-synchrony/fiber_iterator'
-require 'ruby-progressbar'
-require 'open-uri'
-require 'optparse'
-require 'csv'
-require 'bloomfilter-rb'
-
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
-require 'retriever/fetchsitemap'
+require 'retriever/fetchsitemap'
+require 'retriever/link'
+require 'retriever/target'
+
+module Retriever
+
+end
data/readme.md
CHANGED
@@ -1,6 +1,7 @@
-[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
+[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
 ==============
-
+[](http://badge.fury.io/rb/rubyretriever) [](https://travis-ci.org/joenorton/rubyretriever)
+
 By Joe Norton

 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
data/spec/link_spec.rb
ADDED
@@ -0,0 +1,66 @@
+require 'retriever'
+
+describe "Link" do
+
+  r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
+  let(:links) { r.fetchLinks(@source) }
+
+  it "collects links in anchor tags" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "collects links in link tags" do
+    @source = (<<SOURCE).strip
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
+  end
+
+  it "does not collect bare links (ones not in an href)" do
+    @source = (<<SOURCE).strip
+http://www.google.com
+SOURCE
+
+    expect(links).to_not include('http://www.google.com')
+  end
+
+  it "collects only unique href links on the page" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets'>gadgets</a>
+<a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
+SOURCE
+
+    expect(links).to have(1).items
+  end
+
+  it "adds a protocol to urls missing them (www.)" do
+    @source = (<<SOURCE).strip
+<a href='www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+SOURCE
+
+    expect(links).to have(1).item
+  end
+
+  it "returns relative urls with full path based on hostname" do
+    @source = (<<SOURCE).strip
+<a href='/test.html'>test</a>
+<a href='cpage_18'>about</a>
+SOURCE
+
+    expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+  end
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,4 +1,4 @@
-
+require 'retriever'

 r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
 test_html = "<a href='www.cnet.com/download.exe'>download</a>
@@ -12,49 +12,36 @@ http://www.google.com
 <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 <a href='cpage_18'>about</a>"

-doc = r.fetchPage(r.target)
 links_collection = r.fetchLinks(test_html)
-filtered_links = r.parseInternalLinks(links_collection)
-file_list = r.parseFiles(links_collection)

 describe "Fetch" do

-  describe "#new" do
-    it "sets target, host, and max page vars" do
-      expect(r.target).to eq("http://www.cnet.com/reviews/")
-      expect(r.host).to eq("www.cnet.com")
-      expect(r.maxPages).to eq(100)
-    end
-  end
-
-  describe "#fetchPage" do
-    it "opens URL and returns source as String" do
-      expect(doc.class).to eq(String)
-    end
-  end
-
   describe "#fetchLinks" do
     it "collects all unique href links on the page" do
       expect(links_collection).to have(6).items
     end
-    it "returns relative urls with full path based on hostname" do
-      expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
-    end
   end

   describe "#parseInternalLinks" do
+    let (:filtered_links) {r.parseInternalLinks(links_collection)}
     it "filters links by host" do
       filtered_links.each do |link|
-        expect(link).to include(
+        expect(link).to include("www.cnet.com")
       end
     end
+  end
+
+  describe "#parseInternalVisitableLinks" do
+    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
       filtered_links.each do |link|
         expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
       end
     end
   end
+
   describe "#parseFiles" do
+    let(:file_list) {r.parseFiles(links_collection)}
     it "filters links by filetype" do
       file_list.each do |link|
         expect(link).to include(".exe")
data/spec/target_spec.rb
ADDED
@@ -0,0 +1,39 @@
+require 'retriever'
+require 'open-uri'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/")
+
+describe "Target" do
+
+  it "creates target var" do
+    expect(t.target).to eq("http://www.cnet.com/reviews/")
+  end
+
+  it "creates host var" do
+    expect(t.host).to eq("www.cnet.com")
+  end
+
+  it "creates host_re var" do
+    expect(t.host_re).to eq(/www.cnet.com/)
+  end
+
+  it "adds protocol to Target URL if none given" do
+    expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
+  end
+
+  it "fails if given URL has no dot in it" do
+    expect{Retriever::Target.new("cnetcom")}.to raise_error
+  end
+
+  describe "#source" do
+
+    it "opens URL and returns source as String" do
+      expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
+    end
+
+    it "fails if target redirects to new host" do
+      expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
+    end
+  end
+
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Joe Norton
@@ -122,10 +122,14 @@ files:
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
 - lib/retriever/fetchsitemap.rb
+- lib/retriever/link.rb
+- lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
+- spec/link_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
+- spec/target_spec.rb
 homepage: http://www.softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT