rubyretriever 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9741c673d910c34e27092d8bc76d42a945979425
4
- data.tar.gz: 4478c442e1523de1864519c67c7d0f9fd6c43d9f
3
+ metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
4
+ data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
5
5
  SHA512:
6
- metadata.gz: ec5ecde4d038a130cc50a705b41b3c8595156bd7706d3170be7055baf6cc63d1b864ebe41de36dab61c69b1edf13a1a4ab8577a9dcf558d9ff804f2bfd4e860a
7
- data.tar.gz: 0a22053d24a799a4273614ee1cae7522bda3771fa39f1a8c37ba6959e907f6a4b30bb9f8d85b8b0f4c9bdfac34c82f0b2ba7ae8d73017639d7783c841d8d2841
6
+ metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
7
+ data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
data/bin/rr CHANGED
@@ -1,5 +1,5 @@
1
1
  #! /usr/bin/env ruby
2
- require_relative('../lib/retriever.rb')
2
+ require 'retriever'
3
3
  options = {}
4
4
  optparse = OptionParser.new do|opts|
5
5
  # Set a banner, displayed at the top
@@ -22,13 +22,13 @@ options = {}
22
22
  end
23
23
 
24
24
  options[:sitemap] = false
25
- opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
26
- options[:sitemap] = true
25
+ opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
26
+ options[:sitemap] = output_type
27
27
  end
28
28
 
29
29
  options[:fileharvest] = false
30
- opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
31
- options[:fileharvest] = true
30
+ opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
31
+ options[:fileharvest] = file_ext
32
32
  end
33
33
 
34
34
  options[:maxpages] = false
@@ -36,13 +36,8 @@ options = {}
36
36
  options[:maxpages] = maxpages
37
37
  end
38
38
 
39
- options[:file_ext] = false
40
- opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
41
- options[:file_ext] = file_ext
42
- end
43
-
44
39
  options[:autodown] = false
45
- opts.on( '-a', '--autodown', 'Automatically download all files of filetype located' ) do
40
+ opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
46
41
  options[:autodown] = true
47
42
  end
48
43
 
@@ -64,8 +59,9 @@ ARGV.each do|q|
64
59
  puts "###############################"
65
60
  puts "### [RubyRetriever]"
66
61
  puts "### Creating Sitemap" if options[:sitemap]
62
+ puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
67
63
  puts "### Performing File Harvest" if options[:fileharvest]
68
- puts "### Searching for file extension: #{options[:file_ext]} pages" if (options[:file_ext])
64
+ puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
69
65
  puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
70
66
  puts "### Being verbose"
71
67
  puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
@@ -19,18 +19,20 @@ module Retriever
19
19
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
20
20
  @v= options[:verbose] ? true : false
21
21
  @output=options[:filename] ? options[:filename] : false
22
- @fh = options[:fileharvest] ? true : false
23
- @s = options[:sitemap] ? true : false
24
- @file_ext = options[:file_ext] ? options[:file_ext] : false
22
+ @fh = options[:fileharvest] ? options[:fileharvest] : false
23
+ @file_ext = @fh.to_s
24
+ @s = options[:sitemap] ? options[:sitemap] : false
25
25
  @autodown = options[:autodown] ? true : false
26
26
  #
27
27
  @host_re = Regexp.new(host).freeze
28
28
  if @fh
29
- errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
30
29
  tempExtStr = "."+@file_ext+'\z'
31
30
  @file_re = Regexp.new(tempExtStr).freeze
32
31
  else
33
32
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
33
+ if !@output
34
+ @output = "rr-#{@host.split('.')[1]}"
35
+ end
34
36
  end
35
37
  if @prgrss
36
38
  errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -153,6 +155,7 @@ module Retriever
153
155
  @linkStack.concat(new_links_arr)
154
156
  @sitemap.concat(new_links_arr) if @s
155
157
  end
158
+ @progressbar.finish
156
159
  end
157
160
  def asyncGetWave() #send a new wave of GET requests, using current @linkStack
158
161
  new_stuff = []
@@ -23,7 +23,7 @@ module Retriever
23
23
 
24
24
  self.dump(self.fileStack)
25
25
  self.write(@output,self.fileStack) if @output
26
- self.autodownload()
26
+ self.autodownload() if @autodown
27
27
  end
28
28
  def download_file(path)
29
29
  arr = path.split('/')
@@ -19,7 +19,22 @@ module Retriever
19
19
  @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
20
20
 
21
21
  self.dump(self.sitemap)
22
- self.write(self.sitemap) if @output
22
+ self.write(self.sitemap) if /CSV/i =~ @s
23
+ self.gen_xml(self.sitemap) if /XML/i =~ @s
24
+ end
25
+ def gen_xml(data)
26
+ f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
27
+ f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
28
+ data.each do |url|
29
+ f << "<url><loc>#{url}</loc></url>"
30
+ end
31
+ f << "</urlset>"
32
+ f.close
33
+ puts "###############################"
34
+ puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
35
+ puts "Object Count: #{@sitemap.size}"
36
+ puts "###############################"
37
+ puts
23
38
  end
24
39
  end
25
40
  end
@@ -1,3 +1,3 @@
1
1
  module Retriever
2
- VERSION = '0.0.13'
2
+ VERSION = '0.1.0'
3
3
  end
data/readme.md CHANGED
@@ -19,14 +19,14 @@ gem install rubyretriever
19
19
 
20
20
  **Example: Sitemap mode**
21
21
  ```sh
22
- rr --sitemap --progress --limit 100 --output cnet http://www.cnet.com
22
+ rr --sitemap CSV --progress --limit 100 http://www.cnet.com
23
23
  ```
24
24
  OR -- SAME COMMAND
25
25
  ```sh
26
- rr -s -p -l 100 -o cnet http://www.cnet.com
26
+ rr -s csv -p -l 100 http://www.cnet.com
27
27
  ```
28
28
 
29
- This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet.
29
 + This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named rr-cnet (derived from the host name). Optionally, we can also use the format XML and then rubyretriever would output that same URL list into a valid XML sitemap that can be submitted to Google -- but that is not what this current example would do.
30
30
 
31
31
  **Example: File Harvesting mode**
32
32
  ```sh
@@ -34,26 +34,25 @@ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot
34
34
  ```
35
35
  OR -- SAME COMMAND
36
36
  ```sh
37
- rr -f -e pdf -p -l 1000 -o hubspot http://www.hubspot.com
37
 + rr -f pdf -p -l 1000 http://www.hubspot.com
38
38
  ```
39
39
 
40
- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot, and then it would go ahead and try and download each of those files to a new 'rr-downloads' folder
40
 + This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name). Optionally, we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDFs found.
41
41
 
42
42
 
43
43
  command-line arguments
44
44
  -----------------------
45
- Usage: rr [MODE] [OPTIONS] Target_URL
45
+ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
46
46
 
47
- Where MODE FLAG is either:
48
- -s, --sitemap
49
- -f, --files
47
+ Where MODE FLAG is required, and is either:
48
+ -s, --sitemap FORMAT
49
+ -f, --files FILETYPE
50
50
 
51
51
  and OPTIONS is the applicable:
52
- -o, --out FILENAME *Dump output to selected filename*
52
+ -o, --out FILENAME *Dump output to selected filename --being phased out*
53
53
  -p, --progress *Outputs a progressbar*
54
54
  -v, --verbose *Output more information*
55
55
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
56
- -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
57
56
  -h, --help *Display this screen*
58
57
 
59
58
  Current Requirements
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton