rubyretriever 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/rr +10 -17
- data/lib/retriever/version.rb +1 -1
- data/readme.md +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9f829e46e94b82625b0c0c67e492ee2433e25cd
|
4
|
+
data.tar.gz: 2caeb5719fe47661d29097c9ab9818b6f493710a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d031365939289932696c6483db762ad2ba8c5fd62234c2418f72b7548b18943aa94646e660c349b4a37cb4b00388b28bb6107ed8086d1264f1812dcd284c6343
|
7
|
+
data.tar.gz: ad75737457a5cf6ace00f0b83e60b961fde291d7b4a3c1a5e4b8af90c98a5b919e144ff85223175073ef2c6072e08d329ce5b2d10512ee8abbd1a314611134b7
|
data/bin/rr
CHANGED
@@ -4,8 +4,15 @@ options = {}
|
|
4
4
|
optparse = OptionParser.new do|opts|
|
5
5
|
# Set a banner, displayed at the top
|
6
6
|
# of the help screen.
|
7
|
-
opts.banner = "Usage: rr [options] Target_URL"
|
8
|
-
|
7
|
+
opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
|
8
|
+
options[:sitemap] = false
|
9
|
+
opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode - Crawl site and output sitemap, format choices: CSV or XML' ) do |output_type|
|
10
|
+
options[:sitemap] = output_type
|
11
|
+
end
|
12
|
+
options[:fileharvest] = false
|
13
|
+
opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode - Crawl site and collect links for files found, extension for filetype' ) do |file_ext|
|
14
|
+
options[:fileharvest] = file_ext
|
15
|
+
end
|
9
16
|
options[:filename] = nil
|
10
17
|
opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
|
11
18
|
options[:filename] = filename
|
@@ -15,32 +22,18 @@ options = {}
|
|
15
22
|
opts.on( '-v', '--verbose', 'Output more information' ) do
|
16
23
|
options[:verbose] = true
|
17
24
|
end
|
18
|
-
|
19
25
|
options[:progress] = false
|
20
|
-
opts.on( '-p', '--
|
26
|
+
opts.on( '-p', '--progress', 'Output progress bar' ) do
|
21
27
|
options[:progress] = true
|
22
28
|
end
|
23
|
-
|
24
|
-
options[:sitemap] = false
|
25
|
-
opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
|
26
|
-
options[:sitemap] = output_type
|
27
|
-
end
|
28
|
-
|
29
|
-
options[:fileharvest] = false
|
30
|
-
opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
|
31
|
-
options[:fileharvest] = file_ext
|
32
|
-
end
|
33
|
-
|
34
29
|
options[:maxpages] = false
|
35
30
|
opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
|
36
31
|
options[:maxpages] = maxpages
|
37
32
|
end
|
38
|
-
|
39
33
|
options[:autodown] = false
|
40
34
|
opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
|
41
35
|
options[:autodown] = true
|
42
36
|
end
|
43
|
-
|
44
37
|
# This displays the help screen, all programs are
|
45
38
|
# assumed to have this option.
|
46
39
|
opts.on( '-h', '--help', 'Display this screen' ) do
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -30,11 +30,11 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
|
|
30
30
|
|
31
31
|
**Example: File Harvesting mode**
|
32
32
|
```sh
|
33
|
-
rr --files
|
33
|
+
rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
|
34
34
|
```
|
35
35
|
OR -- SAME COMMAND
|
36
36
|
```sh
|
37
|
-
rr -f
|
37
|
+
rr -f pdf -p -l 1000 http://www.hubspot.com
|
38
38
|
```
|
39
39
|
|
40
40
|
This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name). Optionally, we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however, this current example would just dump to stdout a list of all the PDFs found.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyretriever
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Norton
|
@@ -108,7 +108,7 @@ dependencies:
|
|
108
108
|
- - ~>
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '2.14'
|
111
|
-
description:
|
111
|
+
description: Asynchronous web crawler, file harvester & autodownloader
|
112
112
|
email:
|
113
113
|
- joe@softwarebyjoe.com
|
114
114
|
executables:
|