rubyretriever 0.0.13 → 0.1.0
- checksums.yaml +4 -4
- data/bin/rr +8 -12
- data/lib/retriever/fetch.rb +7 -4
- data/lib/retriever/fetchfiles.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +16 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +10 -11
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3eba5658a9b3aa77a522d46a9564acb2a8eea5d
+  data.tar.gz: d3315f447ca908cf14f31fe1ff7196f9ca5e6744
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 99c13086efcb81db33e48a1ebee4e7021059dddcacc3e5fa2ec6f8e6159a0d0840dbe3068fa35f98eca026d1f8d5c5f90ed5fade7e7e2abbbe111e70dc6dbdc8
+  data.tar.gz: 24c5de4333f44d0391d8bdca3325a820e42b37534a1005fc608150855eed852205c85d5687828cdc75475ca8788a6522d9ca82032b9df7b3c67a17c36ae184b1
data/bin/rr
CHANGED
@@ -1,5 +1,5 @@
 #! /usr/bin/env ruby
-
+require 'retriever'
 options = {}
 optparse = OptionParser.new do|opts|
 # Set a banner, displayed at the top
@@ -22,13 +22,13 @@ options = {}
 end

 options[:sitemap] = false
-opts.on( '-s', '--sitemap', 'Crawl site and output sitemap' ) do
-options[:sitemap] =
+opts.on( '-s', '--sitemap FORMAT', 'Crawl site and output sitemap' ) do |output_type|
+options[:sitemap] = output_type
 end

 options[:fileharvest] = false
-opts.on( '-f', '--files', 'Crawl site and collect links for files found' ) do
-options[:fileharvest] =
+opts.on( '-f', '--files FILETYPE', 'Crawl site and collect links for files found' ) do |file_ext|
+options[:fileharvest] = file_ext
 end

 options[:maxpages] = false
@@ -36,13 +36,8 @@ options = {}
 options[:maxpages] = maxpages
 end

-options[:file_ext] = false
-opts.on( '-e', '--ext FILE_EXTENSION', 'set a file extension to look for on crawled pages' ) do |file_ext|
-options[:file_ext] = file_ext
-end
-
 options[:autodown] = false
-opts.on( '-a', '--
+opts.on( '-a', '--auto', 'Automatically download all files of filetype located' ) do
 options[:autodown] = true
 end

@@ -64,8 +59,9 @@ ARGV.each do|q|
 puts "###############################"
 puts "### [RubyRetriever]"
 puts "### Creating Sitemap" if options[:sitemap]
+puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
 puts "### Performing File Harvest" if options[:fileharvest]
-puts "### Searching for file extension: #{options[:
+puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
 puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
 puts "### Being verbose"
 puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
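The net effect of the bin/rr changes is that the mode flags now carry their own argument: `--sitemap` takes a FORMAT and `--files` takes a FILETYPE, folding in the old `-e/--ext` flag. Below is a minimal, self-contained sketch of that OptionParser pattern. It is illustrative only, not the gem's actual bin/rr; only the flag names and descriptions are taken from the diff above.

```ruby
# Hypothetical, trimmed-down version of the new option handling (not the gem's
# actual bin/rr); flag names and descriptions mirror the diff above.
require 'optparse'

options = { sitemap: false, fileharvest: false }

OptionParser.new do |opts|
  opts.on('-s', '--sitemap FORMAT', 'Crawl site and output sitemap') do |output_type|
    options[:sitemap] = output_type    # stores "CSV" or "XML" rather than just true
  end
  opts.on('-f', '--files FILETYPE', 'Crawl site and collect links for files found') do |file_ext|
    options[:fileharvest] = file_ext   # stores the extension, replacing -e/--ext
  end
end.parse!(%w[-s XML])

p options   # e.g. {:sitemap=>"XML", :fileharvest=>false}
```

Because the flag's value is stored in the options hash instead of a bare boolean, fetch.rb can later reuse `options[:fileharvest]` as the file extension and `options[:sitemap]` as the output format, which is exactly what the next diff does.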
data/lib/retriever/fetch.rb
CHANGED
@@ -19,18 +19,20 @@ module Retriever
 @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
 @v= options[:verbose] ? true : false
 @output=options[:filename] ? options[:filename] : false
-@fh = options[:fileharvest] ?
-@
-@
+@fh = options[:fileharvest] ? options[:fileharvest] : false
+@file_ext = @fh.to_s
+@s = options[:sitemap] ? options[:sitemap] : false
 @autodown = options[:autodown] ? true : false
 #
 @host_re = Regexp.new(host).freeze
 if @fh
-errlog("Please provide a FILETYPE. It is required for file harvest mode.") if !@file_ext
 tempExtStr = "."+@file_ext+'\z'
 @file_re = Regexp.new(tempExtStr).freeze
 else
 errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
+if !@output
+@output = "rr-#{@host.split('.')[1]}"
+end
 end
 if @prgrss
 errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -153,6 +155,7 @@ module Retriever
 @linkStack.concat(new_links_arr)
 @sitemap.concat(new_links_arr) if @s
 end
+@progressbar.finish
 end
 def asyncGetWave() #send a new wave of GET requests, using current @linkStack
 new_stuff = []
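Two of those fetch.rb changes are easy to check in isolation: the extension regexp built from the harvest filetype, and the default output name derived from the target host when no `-o/--out` filename is supplied. A rough standalone approximation follows; the local variables are stand-ins for the gem's instance variables, and the URLs are invented for the example.

```ruby
# Illustrative sketch only; the expressions mirror the diff above.
file_ext = 'pdf'
file_re  = Regexp.new('.' + file_ext + '\z').freeze   # same construction as tempExtStr

puts file_re.match?('http://example.com/whitepaper.pdf')  # true
puts file_re.match?('http://example.com/index.html')      # false

host   = 'www.cnet.com'
output = "rr-#{host.split('.')[1]}"   # default when -o/--out is not given
puts output                           # "rr-cnet"
```

Note that the leading `.` in the pattern is an unescaped regexp dot (it matches any character); that is how the gem's own expression is written, not something this release changes.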
data/lib/retriever/fetchfiles.rb
CHANGED

data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -19,7 +19,22 @@ module Retriever
 @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

 self.dump(self.sitemap)
-self.write(self.sitemap) if @
+self.write(self.sitemap) if /CSV/i =~ @s
+self.gen_xml(self.sitemap) if /XML/i =~ @s
+end
+def gen_xml(data)
+f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+data.each do |url|
+f << "<url><loc>#{url}</loc></url>"
+end
+f << "</urlset>"
+f.close
+puts "###############################"
+puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+puts "Object Count: #{@sitemap.size}"
+puts "###############################"
+puts
 end
 end
 end
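The new gen_xml method is what backs the XML sitemap format: it serializes every collected URL into a single urlset document. Here is a runnable approximation of just that step, using an invented URL list and output filename (the real method names the file after the crawled host and prints a summary afterwards).

```ruby
# Standalone sketch of the XML sitemap writer added above; the URLs and the
# filename are placeholders for this example.
urls = ['http://www.cnet.com/', 'http://www.cnet.com/news/']

File.open('sitemap-example.xml', 'w+') do |f|
  f << "<?xml version='1.0' encoding='UTF-8'?>" \
       "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
  urls.each { |url| f << "<url><loc>#{url}</loc></url>" }
  f << '</urlset>'
end
```

Whether write (CSV) or gen_xml (XML) runs is decided by the case-insensitive matches `/CSV/i =~ @s` and `/XML/i =~ @s` against the value passed to `-s/--sitemap`, so both `rr -s csv` and `rr -s XML` work.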
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -19,14 +19,14 @@ gem install rubyretriever

 **Example: Sitemap mode**
 ```sh
-rr --sitemap --progress --limit 100
+rr --sitemap CSV --progress --limit 100 http://www.cnet.com
 ```
 OR -- SAME COMMAND
 ```sh
-rr -s -p -l 100
+rr -s csv -p -l 100 http://www.cnet.com
 ```

-This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet.
+This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet. Optionally, we can also use the format XML and then rubyretriever would output that same URL list into a valid XML sitemap that can be submitted to Google -- but that is not what this current example would do.

 **Example: File Harvesting mode**
 ```sh
@@ -34,26 +34,25 @@ rr --files --ext pdf --progress --limit 1000 --output hubspot http://www.hubspot
 ```
 OR -- SAME COMMAND
 ```sh
-rr -f -e pdf -p -l 1000
+rr -f -e pdf -p -l 1000 http://www.hubspot.com
 ```

-This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot
+This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.


 command-line arguments
 -----------------------
-Usage: rr [MODE] [OPTIONS] Target_URL
+Usage: rr [MODE FLAG] [OPTIONS] Target_URL

-Where MODE FLAG is either:
--s, --sitemap
--f, --files
+Where MODE FLAG is required, and is either:
+-s, --sitemap FORMAT
+-f, --files FILETYPE

 and OPTIONS is the applicable:
--o, --out FILENAME *Dump output to selected filename*
+-o, --out FILENAME *Dump output to selected filename --being phased out*
 -p, --progress *Outputs a progressbar*
 -v, --verbose *Output more information*
 -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
--e, --ext FILE_EXTENSION *set a file extension to look for on crawled pages*
 -h, --help *Display this screen*

 Current Requirements