rubyretriever 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. checksums.yaml +4 -4
  2. data/bin/rr +10 -17
  3. data/lib/retriever/version.rb +1 -1
  4. data/readme.md +2 -2
  5. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
4
- data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
3
+ metadata.gz: c9f829e46e94b82625b0c0c67e492ee2433e25cd
4
+ data.tar.gz: 2caeb5719fe47661d29097c9ab9818b6f493710a
5
5
  SHA512:
6
- metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
7
- data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
6
+ metadata.gz: d031365939289932696c6483db762ad2ba8c5fd62234c2418f72b7548b18943aa94646e660c349b4a37cb4b00388b28bb6107ed8086d1264f1812dcd284c6343
7
+ data.tar.gz: ad75737457a5cf6ace00f0b83e60b961fde291d7b4a3c1a5e4b8af90c98a5b919e144ff85223175073ef2c6072e08d329ce5b2d10512ee8abbd1a314611134b7
data/bin/rr CHANGED
@@ -4,8 +4,15 @@ options = {}
4
4
  optparse = OptionParser.new do|opts|
5
5
  # Set a banner, displayed at the top
6
6
  # of the help screen.
7
- opts.banner = "Usage: rr [options] Target_URL"
8
-
7
+ opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
8
+ options[:sitemap] = false
9
+ opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode - Crawl site and output sitemap, format choices: CSV or XML' ) do |output_type|
10
+ options[:sitemap] = output_type
11
+ end
12
+ options[:fileharvest] = false
13
+ opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode - Crawl site and collect links for files found, extension for filetype' ) do |file_ext|
14
+ options[:fileharvest] = file_ext
15
+ end
9
16
  options[:filename] = nil
10
17
  opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
11
18
  options[:filename] = filename
@@ -15,32 +22,18 @@ options = {}
15
22
  opts.on( '-v', '--verbose', 'Output more information' ) do
16
23
  options[:verbose] = true
17
24
  end
18
-
19
25
  options[:progress] = false
20
- opts.on( '-p', '--progressbar', 'Output more information' ) do
26
+ opts.on( '-p', '--progress', 'Output progress bar' ) do
21
27
  options[:progress] = true
22
28
  end
23
-
24
- options[:sitemap] = false
25
- opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
26
- options[:sitemap] = output_type
27
- end
28
-
29
- options[:fileharvest] = false
30
- opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
31
- options[:fileharvest] = file_ext
32
- end
33
-
34
29
  options[:maxpages] = false
35
30
  opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
36
31
  options[:maxpages] = maxpages
37
32
  end
38
-
39
33
  options[:autodown] = false
40
34
  opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
41
35
  options[:autodown] = true
42
36
  end
43
-
44
37
  # This displays the help screen, all programs are
45
38
  # assumed to have this option.
46
39
  opts.on( '-h', '--help', 'Display this screen' ) do
@@ -1,3 +1,3 @@
1
1
  module Retriever
2
- VERSION = '0.1.0'
2
+ VERSION = '0.1.1'
3
3
  end
data/readme.md CHANGED
@@ -30,11 +30,11 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
30
30
 
31
31
  **Example: File Harvesting mode**
32
32
  ```sh
33
- rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
33
+ rr --files pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
34
34
  ```
35
35
  OR -- SAME COMMAND
36
36
  ```sh
37
- rr -f -e pdf -p -l 1000 http://www.hubspot.com
37
+ rr -f pdf -p -l 1000 http://www.hubspot.com
38
38
  ```
39
39
 
40
40
  This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDFs found.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton
@@ -108,7 +108,7 @@ dependencies:
108
108
  - - ~>
109
109
  - !ruby/object:Gem::Version
110
110
  version: '2.14'
111
- description: General purpose web crawler, site mapper, and file harvester
111
+ description: Asynchronous web crawler, file harvester & autodownloader
112
112
  email:
113
113
  - joe@softwarebyjoe.com
114
114
  executables: