rubyretriever 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +10 -17
- data/lib/retriever/version.rb +1 -1
- data/readme.md +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c9f829e46e94b82625b0c0c67e492ee2433e25cd
|
4
|
+
data.tar.gz: 2caeb5719fe47661d29097c9ab9818b6f493710a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d031365939289932696c6483db762ad2ba8c5fd62234c2418f72b7548b18943aa94646e660c349b4a37cb4b00388b28bb6107ed8086d1264f1812dcd284c6343
|
7
|
+
data.tar.gz: ad75737457a5cf6ace00f0b83e60b961fde291d7b4a3c1a5e4b8af90c98a5b919e144ff85223175073ef2c6072e08d329ce5b2d10512ee8abbd1a314611134b7
|
data/bin/rr
CHANGED
@@ -4,8 +4,15 @@ options = {}
|
|
4
4
|
optparse = OptionParser.new do|opts|
|
5
5
|
# Set a banner, displayed at the top
|
6
6
|
# of the help screen.
|
7
|
-
opts.banner = "Usage: rr [options] Target_URL"
|
8
|
-
|
7
|
+
opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
|
8
|
+
options[:sitemap] = false
|
9
|
+
opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode - Crawl site and output sitemap, format choices: CSV or XML' ) do |output_type|
|
10
|
+
options[:sitemap] = output_type
|
11
|
+
end
|
12
|
+
options[:fileharvest] = false
|
13
|
+
opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode - Crawl site and collect links for files found, extension for filetype' ) do |file_ext|
|
14
|
+
options[:fileharvest] = file_ext
|
15
|
+
end
|
9
16
|
options[:filename] = nil
|
10
17
|
opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
|
11
18
|
options[:filename] = filename
|
@@ -15,32 +22,18 @@ options = {}
|
|
15
22
|
opts.on( '-v', '--verbose', 'Output more information' ) do
|
16
23
|
options[:verbose] = true
|
17
24
|
end
|
18
|
-
|
19
25
|
options[:progress] = false
|
20
|
-
opts.on( '-p', '--
|
26
|
+
opts.on( '-p', '--progress', 'Output progress bar' ) do
|
21
27
|
options[:progress] = true
|
22
28
|
end
|
23
|
-
|
24
|
-
options[:sitemap] = false
|
25
|
-
opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
|
26
|
-
options[:sitemap] = output_type
|
27
|
-
end
|
28
|
-
|
29
|
-
options[:fileharvest] = false
|
30
|
-
opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
|
31
|
-
options[:fileharvest] = file_ext
|
32
|
-
end
|
33
|
-
|
34
29
|
options[:maxpages] = false
|
35
30
|
opts.on( '-l', '--limit PAGE_LIMIT_#', 'set a max on the total number of crawled pages' ) do |maxpages|
|
36
31
|
options[:maxpages] = maxpages
|
37
32
|
end
|
38
|
-
|
39
33
|
options[:autodown] = false
|
40
34
|
opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
|
41
35
|
options[:autodown] = true
|
42
36
|
end
|
43
|
-
|
44
37
|
# This displays the help screen, all programs are
|
45
38
|
# assumed to have this option.
|
46
39
|
opts.on( '-h', '--help', 'Display this screen' ) do
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -30,11 +30,11 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
|
|
30
30
|
|
31
31
|
**Example: File Harvesting mode**
|
32
32
|
```sh
|
33
|
-
rr --files
|
33
|
+
rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
|
34
34
|
```
|
35
35
|
OR -- SAME COMMAND
|
36
36
|
```sh
|
37
|
-
rr -f
|
37
|
+
rr -f pdf -p -l 1000 -o hubspot http://www.hubspot.com
|
38
38
|
```
|
39
39
|
|
40
40
|
This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name). Optionally, we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however, this current example would just dump to stdout a list of all the PDFs found.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyretriever
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Norton
|
@@ -108,7 +108,7 @@ dependencies:
|
|
108
108
|
- - ~>
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '2.14'
|
111
|
-
description:
|
111
|
+
description: Asynchronous web crawler, file harvester & autodownloader
|
112
112
|
email:
|
113
113
|
- joe@softwarebyjoe.com
|
114
114
|
executables:
|