rubyretriever 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +8 -12
- data/lib/retriever/fetch.rb +7 -4
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +16 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +10 -11
- metadata +1 -1
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
+  data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
+  data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
data/bin/rr
CHANGED

@@ -1,5 +1,5 @@
 #! /usr/bin/env ruby
-
+require 'retriever'
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top

@@ -22,13 +22,13 @@ options = {}
   end

   options[:sitemap] = false
-  opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
-    options[:sitemap] = true
+  opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
+    options[:sitemap] = output_type
   end

   options[:fileharvest] = false
-  opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
-    options[:fileharvest] = true
+  opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
+    options[:fileharvest] = file_ext
   end

   options[:maxpages] = false

@@ -36,13 +36,8 @@ options = {}
     options[:maxpages] = maxpages
   end

-  options[:file_ext] = false
-  opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
-    options[:file_ext] = file_ext
-  end
-
   options[:autodown] = false
-  opts.on( '-a', '--
+  opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
     options[:autodown] = true
   end

@@ -64,8 +59,9 @@ ARGV.each do|q|
   puts "###############################"
   puts "### [RubyRetriever]"
   puts "### Creating Sitemap" if options[:sitemap]
+  puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
   puts "### Performing File Harvest" if options[:fileharvest]
-  puts "### Searching for file extension: #{options[:
+  puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
   puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
   puts "### Being verbose"
   puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
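The practical effect of the bin/rr change is that each mode switch now carries its value directly, replacing the old boolean switches and the removed -e/--ext option. A minimal, standalone sketch of the new parsing behavior, using only Ruby's stdlib OptionParser (the sample argument vector and the surrounding hash are assumptions for illustration, not part of the gem):

```ruby
require 'optparse'

options = { sitemap: false, fileharvest: false }
OptionParser.new do |opts|
  # '-s CSV' / '--sitemap XML': the format string is stored, not a bare true
  opts.on('-s', '--sitemap FORMAT', 'Crawl site and output sitemap') do |output_type|
    options[:sitemap] = output_type
  end
  # '-f pdf': the filetype rides on the mode flag, absorbing the old -e/--ext
  opts.on('-f', '--files FILETYPE', 'Crawl site and collect links for files found') do |file_ext|
    options[:fileharvest] = file_ext
  end
end.parse!(%w[--sitemap XML])

p options  #=> {:sitemap=>"XML", :fileharvest=>false}
```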
data/lib/retriever/fetch.rb
CHANGED

@@ -19,18 +19,20 @@ module Retriever
     @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
     @v= options[:verbose] ? true : false
     @output=options[:filename] ? options[:filename] : false
-    @fh = options[:fileharvest] ?
-    @
-    @
+    @fh = options[:fileharvest] ? options[:fileharvest] : false
+    @file_ext = @fh.to_s
+    @s = options[:sitemap] ? options[:sitemap] : false
     @autodown = options[:autodown] ? true : false
     #
     @host_re = Regexp.new(host).freeze
     if @fh
-      errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
       tempExtStr = "."+@file_ext+'\z'
       @file_re = Regexp.new(tempExtStr).freeze
     else
       errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+      if !@output
+        @output = "rr-#{@host.split('.')[1]}"
+      end
     end
     if @prgrss
       errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict

@@ -153,6 +155,7 @@ module Retriever
       @linkStack.concat(new_links_arr)
       @sitemap.concat(new_links_arr) if @s
     end
+    @progressbar.finish
   end
   def asyncGetWave() #send a new wave of GET requests, using current @linkStack
     new_stuff = []
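One consequence of the fetch.rb change worth spelling out: when no -o/--out filename is supplied, the output name is now derived from the target host. A worked example of that expression (the host value here is an assumption, matching the readme's example site):

```ruby
host = 'www.cnet.com'               # assumed crawl target
host.split('.')                     #=> ["www", "cnet", "com"]
output = "rr-#{host.split('.')[1]}" # index 1 picks the bare domain label
puts output                         #=> rr-cnet
```

Note that the index-1 pick presumes a conventional three-label host; a bare domain like cnet.com would yield rr-com.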
data/lib/retriever/fetchfiles.rb
CHANGED

data/lib/retriever/fetchsitemap.rb
CHANGED

@@ -19,7 +19,22 @@ module Retriever
     @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

     self.dump(self.sitemap)
-    self.write(self.sitemap) if @
+    self.write(self.sitemap) if /CSV/i =~ @s
+    self.gen_xml(self.sitemap) if /XML/i =~ @s
+  end
+  def gen_xml(data)
+    f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+    f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+    data.each do |url|
+      f << "<url><loc>#{url}</loc></url>"
+    end
+    f << "</urlset>"
+    f.close
+    puts "###############################"
+    puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+    puts "Object Count: #{@sitemap.size}"
+    puts "###############################"
+    puts
   end
   end
 end
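For a concrete picture of what gen_xml writes, here is a small sketch that mirrors its string-building for two assumed URLs; the real method streams the same markup into sitemap-<domain>.xml as one unformatted line:

```ruby
# Assumed sample data standing in for the crawled @sitemap array
urls = ['http://www.cnet.com/', 'http://www.cnet.com/news/']

xml = "<?xml version='1.0' encoding='UTF-8'?>" \
      "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
urls.each { |url| xml << "<url><loc>#{url}</loc></url>" }
xml << '</urlset>'
puts xml
```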
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED

@@ -19,14 +19,14 @@ gem install rubyretriever
 
 **Example: Sitemap mode**
 ```sh
-rr --sitemap --progress --limit 100
+rr --sitemap CSV --progress --limit 100 http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -s -p -l 100
+rr -s csv -p -l 100 http://www.cnet.com
 ```
 
-This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet.
+This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then write it out to a csv named cnet. Optionally, we can use the format XML instead, and RubyRetriever would output that same URL list as a valid XML sitemap that can be submitted to Google -- but that is not what this example would do.
 
 **Example: File Harvesting mode**
 ```sh

@@ -34,26 +34,25 @@ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot
 ```
 OR -- SAME COMMAND
 ```sh
-rr -f -e pdf -p -l 1000
+rr -f -e pdf -p -l 1000 http://www.hubspot.com
 ```
 
-This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then write out a list of filepaths to a csv named hubspot (based on the website host name). Optionally, we could have the script autodownload all the files by adding the -a/--auto flag -- without it, this example would just dump to stdout a list of all the PDFs found.
 
 
 command-line arguments
 -----------------------
-Usage: rr [MODE] [OPTIONS] Target_URL
+Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
-Where MODE FLAG is either:
-  -s, --sitemap
-  -f, --files
+Where MODE FLAG is required, and is either:
+  -s, --sitemap FORMAT
+  -f, --files FILETYPE
 
 and OPTIONS is the applicable:
-  -o, --out FILENAME *Dump output to selected filename*
+  -o, --out FILENAME *Dump output to selected filename --being phased out*
   -p, --progress *Outputs a progressbar*
   -v, --verbose *Output more information*
   -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
-  -e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
   -h, --help *Display this screen*
 
 Current Requirements