rubyretriever 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 9741c673d910c34e27092d8bc76d42a945979425
- data.tar.gz: 4478c442e1523de1864519c67c7d0f9fd6c43d9f
+ metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
+ data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
  SHA512:
- metadata.gz: ec5ecde4d038a130cc50a705b41b3c8595156bd7706d3170be7055baf6cc63d1b864ebe41de36dab61c69b1edf13a1a4ab8577a9dcf558d9ff804f2bfd4e860a
- data.tar.gz: 0a22053d24a799a4273614ee1cae7522bda3771fa39f1a8c37ba6959e907f6a4b30bb9f8d85b8b0f4c9bdfac34c82f0b2ba7ae8d73017639d7783c841d8d2841
+ metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
+ data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
data/bin/rr CHANGED
@@ -1,5 +1,5 @@
  #! /usr/bin/env ruby
- require_relative('../lib/retriever.rb')
+ require 'retriever'
  options = {}
  optparse = OptionParser.new do|opts|
  # Set a banner, displayed at the top
@@ -22,13 +22,13 @@ options = {}
  end

  options[:sitemap] = false
- opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
- options[:sitemap] = true
+ opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
+ options[:sitemap] = output_type
  end

  options[:fileharvest] = false
- opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
- options[:fileharvest] = true
+ opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
+ options[:fileharvest] = file_ext
  end

  options[:maxpages] = false
@@ -36,13 +36,8 @@ options = {}
  options[:maxpages] = maxpages
  end

- options[:file_ext] = false
- opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
- options[:file_ext] = file_ext
- end
-
  options[:autodown] = false
- opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
+ opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
  options[:autodown] = true
  end

@@ -64,8 +59,9 @@ ARGV.each do|q|
  puts "###############################"
  puts "### [RubyRetriever]"
  puts "### Creating Sitemap" if options[:sitemap]
+ puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
  puts "### Performing File Harvest" if options[:fileharvest]
- puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
+ puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
  puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
  puts "### Being verbose"
  puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
@@ -19,18 +19,20 @@ module Retriever
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
  @v= options[:verbose] ? true : false
  @output=options[:filename] ? options[:filename] : false
- @fh = options[:fileharvest] ? true : false
- @s = options[:sitemap] ? true : false
- @file_ext = options[:file_ext] ? options[:file_ext] : false
+ @fh = options[:fileharvest] ? options[:fileharvest] : false
+ @file_ext = @fh.to_s
+ @s = options[:sitemap] ? options[:sitemap] : false
  @autodown = options[:autodown] ? true : false
  #
  @host_re = Regexp.new(host).freeze
  if @fh
- errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
  tempExtStr = "."+@file_ext+'\z'
  @file_re = Regexp.new(tempExtStr).freeze
  else
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+ if !@output
+ @output = "rr-#{@host.split('.')[1]}"
+ end
  end
  if @prgrss
  errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -153,6 +155,7 @@ module Retriever
  @linkStack.concat(new_links_arr)
  @sitemap.concat(new_links_arr) if @s
  end
+ @progressbar.finish
  end
  def asyncGetWave() #send a new wave of GET requests, using current @linkStack
  new_stuff = []
@@ -23,7 +23,7 @@ module Retriever

  self.dump(self.fileStack)
  self.write(@output,self.fileStack) if @output
- self.autodownload()
+ self.autodownload() if @autodown
  end
  def download_file(path)
  arr = path.split('/')
@@ -19,7 +19,22 @@ module Retriever
  @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

  self.dump(self.sitemap)
- self.write(self.sitemap) if @output
+ self.write(self.sitemap) if /CSV/i =~ @s
+ self.gen_xml(self.sitemap) if /XML/i =~ @s
+ end
+ def gen_xml(data)
+ f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+ f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+ data.each do |url|
+ f << "<url><loc>#{url}</loc></url>"
+ end
+ f << "</urlset>"
+ f.close
+ puts "###############################"
+ puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+ puts "Object Count: #{@sitemap.size}"
+ puts "###############################"
+ puts
  end
  end
  end
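For context, a standalone sketch of the sitemap XML shape the new `gen_xml` writes; the host and URL list here are made up, whereas the real method takes them from the crawl:

```ruby
# Minimal sketch of the XML sitemap gen_xml produces; host and URLs are hypothetical.
host = 'www.cnet.com'
urls = ['http://www.cnet.com/', 'http://www.cnet.com/news/']

File.open("sitemap-#{host.split('.')[1]}.xml", 'w+') do |f|
  f << "<?xml version='1.0' encoding='UTF-8'?>" \
       "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
  urls.each { |url| f << "<url><loc>#{url}</loc></url>" }
  f << "</urlset>"
end
# => writes sitemap-cnet.xml, a single-line XML document in the standard sitemaps.org format
```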
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '0.0.13'
+ VERSION = '0.1.0'
  end
data/readme.md CHANGED
@@ -19,14 +19,14 @@ gem install rubyretriever

  **Example: Sitemap mode**
  ```sh
- rr --sitemap --progress --limit 100 --output cnet http://www.cnet.com
+ rr --sitemap CSV --progress --limit 100 http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -s -p -l 100 -o cnet http://www.cnet.com
+ rr -s csv -p -l 100 http://www.cnet.com
  ```

- This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet.
+ This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet. Optionally, we can also use the format XML and then rubyretriever would output that same URL list into a valid XML sitemap that can be submitted to Google -- but that is not what this current example would do.

  **Example: File Harvesting mode**
  ```sh
@@ -34,26 +34,25 @@ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot
  ```
  OR -- SAME COMMAND
  ```sh
- rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
+ rr -f -e pdf -p -l 1000 http://www.hubspot.com
  ```

- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot, and then it would go ahead and try and download each of those files to a new 'rr-downloads' folder
+ This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.


  command-line arguments
  -----------------------
- Usage: rr [MODE] [OPTIONS] Target_URL
+ Usage: rr [MODE FLAG] [OPTIONS] Target_URL

- Where MODE FLAG is either:
- -s, --sitemap
- -f, --files
+ Where MODE FLAG is required, and is either:
+ -s, --sitemap FORMAT
+ -f, --files FILETYPE

  and OPTIONS is the applicable:
- -o, --out FILENAME *Dump output to selected filename*
+ -o, --out FILENAME *Dump output to selected filename --being phased out*
  -p, --progress *Outputs a progressbar*
  -v, --verbose *Output more information*
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
- -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
  -h, --help *Display this screen*

  Current Requirements
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 0.0.13
+ version: 0.1.0
  platform: ruby
  authors:
  - Joe Norton