rubyretriever 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +8 -12
- data/lib/retriever/fetch.rb +7 -4
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +16 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +10 -11
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
+  data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
+  data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
data/bin/rr
CHANGED

@@ -1,5 +1,5 @@
 #! /usr/bin/env ruby
-
+require 'retriever'
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top

@@ -22,13 +22,13 @@ options = {}
   end

   options[:sitemap] = false
-  opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
-    options[:sitemap] = true
+  opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
+    options[:sitemap] = output_type
   end

   options[:fileharvest] = false
-  opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
-    options[:fileharvest] = true
+  opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
+    options[:fileharvest] = file_ext
   end

   options[:maxpages] = false

@@ -36,13 +36,8 @@ options = {}
     options[:maxpages] = maxpages
   end

-  options[:file_ext] = false
-  opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
-    options[:file_ext] = file_ext
-  end
-
   options[:autodown] = false
-  opts.on( '-a', '--
+  opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
     options[:autodown] = true
   end

@@ -64,8 +59,9 @@ ARGV.each do|q|
   puts "###############################"
   puts "### [RubyRetriever]"
   puts "### Creating Sitemap" if options[:sitemap]
+  puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
   puts "### Performing File Harvest" if options[:fileharvest]
-  puts "### Searching for file extension: #{options[:
+  puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
   puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
   puts "### Being verbose"
   puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
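The practical effect of the bin/rr change is that each mode switch now carries its value directly, replacing the old boolean switches and the removed -e/--ext option. A minimal, standalone sketch of the new parsing behavior, using only Ruby's stdlib OptionParser (the sample argument vector and the surrounding hash are assumptions for illustration, not part of the gem):

```ruby
require 'optparse'

options = { sitemap: false, fileharvest: false }
OptionParser.new do |opts|
  # '-s CSV' / '--sitemap XML': the format string is stored, not a bare true
  opts.on('-s', '--sitemap FORMAT', 'Crawl site and output sitemap') do |output_type|
    options[:sitemap] = output_type
  end
  # '-f pdf': the filetype rides on the mode flag, absorbing the old -e/--ext
  opts.on('-f', '--files FILETYPE', 'Crawl site and collect links for files found') do |file_ext|
    options[:fileharvest] = file_ext
  end
end.parse!(%w[--sitemap XML])

p options  #=> {:sitemap=>"XML", :fileharvest=>false}
```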
data/lib/retriever/fetch.rb
CHANGED

@@ -19,18 +19,20 @@ module Retriever
     @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
     @v= options[:verbose] ? true : false
     @output=options[:filename] ? options[:filename] : false
-    @fh = options[:fileharvest] ?
-    @
-    @
+    @fh = options[:fileharvest] ? options[:fileharvest] : false
+    @file_ext = @fh.to_s
+    @s = options[:sitemap] ? options[:sitemap] : false
     @autodown = options[:autodown] ? true : false
     #
     @host_re = Regexp.new(host).freeze
     if @fh
-      errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
       tempExtStr = "."+@file_ext+'\z'
       @file_re = Regexp.new(tempExtStr).freeze
     else
       errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+      if !@output
+        @output = "rr-#{@host.split('.')[1]}"
+      end
     end
     if @prgrss
       errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict

@@ -153,6 +155,7 @@ module Retriever
       @linkStack.concat(new_links_arr)
       @sitemap.concat(new_links_arr) if @s
     end
+    @progressbar.finish
   end
   def asyncGetWave() #send a new wave of GET requests, using current @linkStack
     new_stuff = []
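One consequence of the fetch.rb change worth spelling out: when no -o/--out filename is supplied, the output name is now derived from the target host. A worked example of that expression (the host value here is an assumption, matching the readme's example site):

```ruby
host = 'www.cnet.com'               # assumed crawl target
host.split('.')                     #=> ["www", "cnet", "com"]
output = "rr-#{host.split('.')[1]}" # index 1 picks the bare domain label
puts output                         #=> rr-cnet
```

Note that the index-1 pick presumes a conventional three-label host; a bare domain like cnet.com would yield rr-com.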
data/lib/retriever/fetchfiles.rb
CHANGED

data/lib/retriever/fetchsitemap.rb
CHANGED

@@ -19,7 +19,22 @@ module Retriever
     @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

     self.dump(self.sitemap)
-    self.write(self.sitemap) if @
+    self.write(self.sitemap) if /CSV/i =~ @s
+    self.gen_xml(self.sitemap) if /XML/i =~ @s
+  end
+  def gen_xml(data)
+    f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+    f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+    data.each do |url|
+      f << "<url><loc>#{url}</loc></url>"
+    end
+    f << "</urlset>"
+    f.close
+    puts "###############################"
+    puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+    puts "Object Count: #{@sitemap.size}"
+    puts "###############################"
+    puts
   end
   end
 end
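For a concrete picture of what gen_xml writes, here is a small sketch that mirrors its string-building for two assumed URLs; the real method streams the same markup into sitemap-<domain>.xml as one unformatted line:

```ruby
# Assumed sample data standing in for the crawled @sitemap array
urls = ['http://www.cnet.com/', 'http://www.cnet.com/news/']

xml = "<?xml version='1.0' encoding='UTF-8'?>" \
      "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
urls.each { |url| xml << "<url><loc>#{url}</loc></url>" }
xml << '</urlset>'
puts xml
```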
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED

@@ -19,14 +19,14 @@ gem install rubyretriever
 
 **Example: Sitemap mode**
 ```sh
-rr --sitemap --progress --limit 100
+rr --sitemap CSV --progress --limit 100 http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -s -p -l 100
+rr -s csv -p -l 100 http://www.cnet.com
 ```
 
-This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet.
+This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then write it out to a csv named cnet. Optionally, we can use the format XML instead, and RubyRetriever would output that same URL list as a valid XML sitemap that can be submitted to Google -- but that is not what this example would do.
 
 **Example: File Harvesting mode**
 ```sh

@@ -34,26 +34,25 @@ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot
 ```
 OR -- SAME COMMAND
 ```sh
-rr -f -e pdf -p -l 1000
+rr -f -e pdf -p -l 1000 http://www.hubspot.com
 ```
 
-This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then write out a list of filepaths to a csv named hubspot (based on the website host name). Optionally, we could have the script autodownload all the files by adding the -a/--auto flag -- without it, this example would just dump to stdout a list of all the PDFs found.
 
 
 command-line arguments
 -----------------------
-Usage: rr [MODE] [OPTIONS] Target_URL
+Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
-Where MODE FLAG is either:
-  -s, --sitemap
-  -f, --files
+Where MODE FLAG is required, and is either:
+  -s, --sitemap FORMAT
+  -f, --files FILETYPE
 
 and OPTIONS is the applicable:
-  -o, --out FILENAME *Dump output to selected filename*
+  -o, --out FILENAME *Dump output to selected filename --being phased out*
   -p, --progress *Outputs a progressbar*
   -v, --verbose *Output more information*
   -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
-  -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
   -h, --help *Display this screen*
 
 Current Requirements