rubyretriever 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +3 -2
- data/lib/retriever/fetch.rb +22 -62
- data/lib/retriever/fetchfiles.rb +5 -4
- data/lib/retriever/fetchsitemap.rb +7 -7
- data/lib/retriever/link.rb +29 -0
- data/lib/retriever/target.rb +41 -0
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +7 -16
- data/readme.md +3 -2
- data/spec/link_spec.rb +66 -0
- data/spec/retriever_spec.rb +9 -22
- data/spec/target_spec.rb +39 -0
- metadata +5 -1
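
The headline change in 0.1.4 is a refactor: page fetching (the old EM-synchrony based Fetch#fetchPage) moves into a new Retriever::Target class built on open-uri, and href normalization moves into a new Retriever::Link class, with Retriever::Fetch holding a Target in @t. A rough sketch of how the pieces compose after this release (illustrative only; the URLs are the placeholders used in the gem's own specs, and the variable names are mine, not the gem's):

    require 'retriever'

    # Sketch of the 0.1.4 object model, based on the diffs below.
    target = Retriever::Target.new("www.cnet.com")       # adds the missing http:// prefix
    page   = target.source                               # page body fetched via open-uri
    fetch  = Retriever::Fetch.new("http://www.cnet.com/reviews/", {:maxpages => 10})
    links  = fetch.fetchLinks(page)                       # hrefs normalized through Retriever::Link
    internal = fetch.parseInternalVisitableLinks(links)   # same-host links, minus css/js/images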
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
+  data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
+  data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
data/bin/rr
CHANGED
@@ -1,5 +1,6 @@
 #! /usr/bin/env ruby
 require 'retriever'
+require 'optparse'
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
@@ -61,8 +62,8 @@ ARGV.each do|q|
   end
   puts "###############################"
   puts "### [RubyRetriever] go fetch #{q}"
-
-
+  Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
   puts "### [RubyRetriever] is done."
   puts "###############################"
   puts
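
The two added lines are the CLI's dispatch: each URL passed to rr goes to FetchFiles when file harvesting is requested and to FetchSitemap when a sitemap is requested. A minimal programmatic equivalent, assuming the option keys that appear elsewhere in this diff (the real hash is assembled by the OptionParser block earlier in the script, which this diff does not show):

    require 'retriever'

    url = "http://www.cnet.com/"
    options = { :sitemap => "CSV", :maxpages => 100 }    # illustrative values

    Retriever::FetchFiles.new(url, options)   if options[:fileharvest]
    Retriever::FetchSitemap.new(url, options) if options[:sitemap]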
data/lib/retriever/fetch.rb
CHANGED
@@ -1,19 +1,20 @@
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'csv'
+require 'bloomfilter-rb'
+
 module Retriever
   class Fetch
-    attr_reader :
+    attr_reader :maxPages, :t
     #constants
-    HTTP_RE = Regexp.new(/^http/i).freeze
     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
-    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
-    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
-    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(url,options)
-
-      @target = new_uri.to_s
-      @host = new_uri.host
+      @t = Retriever::Target.new(url)
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -24,14 +25,13 @@ module Retriever
       @s = options[:sitemap] ? options[:sitemap] : false
       @autodown = options[:autodown] ? true : false
       #
-      @host_re = Regexp.new(host).freeze
       if @fh
         tempExtStr = "."+@file_ext+'\z'
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
         if !@output
-          @output = "rr-#{@host.split('.')[1]}"
+          @output = "rr-#{@t.host.split('.')[1]}"
         end
       end
       if @prgrss
@@ -45,7 +45,7 @@ module Retriever
         @progressbar = ProgressBar.create(prgressVars)
       end
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-      @already_crawled.insert(@target)
+      @already_crawled.insert(@t.target)
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -56,10 +56,10 @@ module Retriever
     def dump(data)
       puts "###############################"
       if @s
-        puts "#{@target} Sitemap"
+        puts "#{@t.target} Sitemap"
         puts "Page Count: #{data.size}"
       elsif @fh
-        puts "Target URL: #{@target}"
+        puts "Target URL: #{@t.target}"
         puts "Filetype: #{@file_ext}"
         puts "File Count: #{data.size}"
       else
@@ -84,58 +84,20 @@ module Retriever
         puts
       end
     end
-    def fetchPage(url)
-      resp = false
-      EM.synchrony do
-        begin
-          resp = EventMachine::HttpRequest.new(url).get
-        rescue StandardError => e
-          #puts e.message + " ## " + url
-          #the trap abrt is nescessary to handle the SSL error
-          #for some ungodly reason it's the only way I found to handle it
-          trap("ABRT"){
-            puts "#{url} failed SSL Certification Verification"
-          }
-          return false
-        end
-        lg("URL Crawled: #{url}")
-        EventMachine.stop
-      end
-      if resp.response == ""
-        errlog("Domain is not working. Try the non-WWW version.")
-      end
-      return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
-    end
     #recieves page source as string
     #returns array of unique href links
     def fetchLinks(doc)
       return false if !doc
-
-
-      link
-
-      if (DUB_DUB_DUB_DOT_RE =~ link)
-        link = "http://#{link}"
-      elsif SINGLE_SLASH_RE =~ link #link uses relative path
-        link = "http://#{@host}"+link #appending hostname to relative paths
-      elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
-        link = "http:#{link}" #appending current url to relative paths
-      elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
-        link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
-      else
-        next
-      end
-      end
-      linkArray.push(link)
-      end
-      linkArray.uniq!
+      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
     end
     def parseInternalLinks(all_links)
-
-
-
-
-    end
+      all_links.select{ |linky| (@t.host_re =~ linky) }
+    end
+    def parseInternalVisitableLinks(all_links)
+      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
     end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
@@ -147,8 +109,6 @@ module Retriever
         end
         break;
       end
-      #puts "New loop"
-      #puts @linkStack
       new_links_arr = self.asyncGetWave()
       next if (new_links_arr.nil? || new_links_arr.empty?)
       new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
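
With this change, fetchLinks just scans the source with HREF_CONTENTS_RE and lets Retriever::Link build absolute URLs, while parseInternalLinks and the new parseInternalVisitableLinks are plain select filters over the result. A short sketch of the resulting behaviour, reusing the host and constructor arguments from the gem's specs (the HTML string is illustrative):

    require 'retriever'

    r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {:file_ext => "exe", :maxpages => "100"})
    html = "<a href='/test.html'>test</a> <a href='http://www.google.com/'>g</a>"

    links = r.fetchLinks(html)
    # => ["http://www.cnet.com/test.html", "http://www.google.com/"]
    r.parseInternalLinks(links)           # keeps only links matching the target host
    r.parseInternalVisitableLinks(links)  # additionally drops css/js/images and other non-page assets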
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -4,16 +4,17 @@ module Retriever
     def initialize(url,options)
       super
       @fileStack = []
-      all_links = self.fetchLinks(
-      @linkStack = self.
+      all_links = self.fetchLinks(@t.source)
+      @linkStack = self.parseInternalVisitableLinks(all_links)
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")

       tempFileCollection = self.parseFiles(all_links)
       @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
       self.lg("#{@fileStack.size} new files found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)

       self.async_crawl_and_collect()
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -3,12 +3,13 @@ module Retriever
     attr_reader :sitemap
     def initialize(url,options)
       super
-      @sitemap = [@target]
-      @linkStack = self.
+      @sitemap = [@t.target]
+      @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
       @sitemap.concat(@linkStack)

@@ -16,14 +17,13 @@ module Retriever

       @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
       @sitemap.uniq!
-      @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

       self.dump(self.sitemap)
       self.write(self.sitemap) if /CSV/i =~ @s
       self.gen_xml(self.sitemap) if /XML/i =~ @s
     end
     def gen_xml(data)
-      f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+      f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
       data.each do |url|
         f << "<url><loc>#{url}</loc></url>"
@@ -31,7 +31,7 @@ module Retriever
       f << "</urlset>"
       f.close
       puts "###############################"
-      puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
       puts "Object Count: #{@sitemap.size}"
       puts "###############################"
       puts
data/lib/retriever/link.rb
ADDED
@@ -0,0 +1,29 @@
+module Retriever
+  class Link
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(host, link)
+      @host = host
+      @link = link
+    end
+
+    def path
+      return link if HTTP_RE =~ link
+
+      return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
+
+      return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
+
+      return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+
+      return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+    end
+
+    private
+    attr_reader :host, :link
+  end
+end
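
Link#path is a straight port of the old branching in Fetch#fetchLinks: given the crawl host and a raw href it returns an absolute URL, or nil when no pattern matches. A quick sketch of each branch (hostnames are just examples):

    require 'retriever'

    host = "www.cnet.com"
    Retriever::Link.new(host, "http://www.cnet.com/reviews/").path  # => "http://www.cnet.com/reviews/"
    Retriever::Link.new(host, "www.cnet.com/download.exe").path     # => "http://www.cnet.com/download.exe"
    Retriever::Link.new(host, "/test.html").path                    # => "http://www.cnet.com/test.html"
    Retriever::Link.new(host, "//cdn.cnet.com/lib.js").path         # => "http://cdn.cnet.com/lib.js"
    Retriever::Link.new(host, "cpage_18").path                      # => "http://www.cnet.com/cpage_18"
    Retriever::Link.new(host, "mailto:joe@example.com").path        # => nil (no pattern matches)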
data/lib/retriever/target.rb
ADDED
@@ -0,0 +1,41 @@
+require 'open-uri'
+
+module Retriever
+  class Target
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    attr_reader :host, :target, :host_re, :source
+    def initialize(url)
+      url = "http://#{url}" if (!(HTTP_RE =~ url))
+      fail "Bad URL" if (!(/\./ =~ url))
+      new_uri = URI(url)
+      @target = new_uri.to_s
+      @host = new_uri.host
+      @host_re = Regexp.new(@host).freeze
+    end
+
+    def source
+      resp = false
+      begin
+        resp = open(@target)
+      rescue StandardError => e
+        #puts e.message + " ## " + url
+        #the trap abrt is nescessary to handle the SSL error
+        #for some ungodly reason it's the only way I found to handle it
+        trap("ABRT"){
+          puts "#{@target} failed SSL Certification Verification"
+        }
+        return false
+      end
+      if (@target != resp.base_uri.to_s)
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+      end
+      resp = resp.read
+      if resp == ""
+        fail "Domain is not working. Try the non-WWW version."
+      end
+      return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+    end
+
+  end
+end
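
Target wraps URL validation and page fetching: the constructor normalizes the URL and derives host and host_re, and #source reads the page over open-uri, returning false on network or SSL errors and raising when the page is empty or redirects to a different host. A brief sketch mirroring the expectations in spec/target_spec.rb:

    require 'retriever'

    t = Retriever::Target.new("cnet.com")
    t.target    # => "http://cnet.com"  (protocol added when missing)
    t.host      # => "cnet.com"
    t.host_re   # => /cnet.com/

    Retriever::Target.new("cnetcom")                      # raises "Bad URL" (no dot in the URL)
    Retriever::Target.new("http://www.cnet.com/").source  # => page body as a UTF-8 String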
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,18 +1,9 @@
-##################################################################
-#####RubyRetriever -- web crawler and file harvester
-#####created by Joe Norton
-#####http://softwarebyjoe.com
-##LICENSING: GNU GPLv3 License##################################
-#! usr/bin/ruby
-require 'em-synchrony'
-require 'em-synchrony/em-http'
-require 'em-synchrony/fiber_iterator'
-require 'ruby-progressbar'
-require 'open-uri'
-require 'optparse'
-require 'csv'
-require 'bloomfilter-rb'
-
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
-require 'retriever/fetchsitemap'
+require 'retriever/fetchsitemap'
+require 'retriever/link'
+require 'retriever/target'
+
+module Retriever
+
+end
data/readme.md
CHANGED
@@ -1,6 +1,7 @@
-[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
+[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
 ==============
-
+[](http://badge.fury.io/rb/rubyretriever) [](https://travis-ci.org/joenorton/rubyretriever)
+
 By Joe Norton

 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
data/spec/link_spec.rb
ADDED
@@ -0,0 +1,66 @@
+require 'retriever'
+
+describe "Link" do
+
+  r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
+  let(:links) { r.fetchLinks(@source) }
+
+  it "collects links in anchor tags" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "collects links in link tags" do
+    @source = (<<SOURCE).strip
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
+  end
+
+  it "does not collect bare links (ones not in an href)" do
+    @source = (<<SOURCE).strip
+http://www.google.com
+SOURCE
+
+    expect(links).to_not include('http://www.google.com')
+  end
+
+  it "collects only unique href links on the page" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets'>gadgets</a>
+<a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
+SOURCE
+
+    expect(links).to have(1).items
+  end
+
+  it "adds a protocol to urls missing them (www.)" do
+    @source = (<<SOURCE).strip
+<a href='www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+SOURCE
+
+    expect(links).to have(1).item
+  end
+
+  it "returns relative urls with full path based on hostname" do
+    @source = (<<SOURCE).strip
+<a href='/test.html'>test</a>
+<a href='cpage_18'>about</a>
+SOURCE
+
+    expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+  end
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,4 +1,4 @@
-
+require 'retriever'

 r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
 test_html = "<a href='www.cnet.com/download.exe'>download</a>
@@ -12,49 +12,36 @@ http://www.google.com
 <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 <a href='cpage_18'>about</a>"

-doc = r.fetchPage(r.target)
 links_collection = r.fetchLinks(test_html)
-filtered_links = r.parseInternalLinks(links_collection)
-file_list = r.parseFiles(links_collection)

 describe "Fetch" do

-  describe "#new" do
-    it "sets target, host, and max page vars" do
-      expect(r.target).to eq("http://www.cnet.com/reviews/")
-      expect(r.host).to eq("www.cnet.com")
-      expect(r.maxPages).to eq(100)
-    end
-  end
-
-  describe "#fetchPage" do
-    it "opens URL and returns source as String" do
-      expect(doc.class).to eq(String)
-    end
-  end
-
   describe "#fetchLinks" do
     it "collects all unique href links on the page" do
       expect(links_collection).to have(6).items
     end
-    it "returns relative urls with full path based on hostname" do
-      expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
-    end
   end

   describe "#parseInternalLinks" do
+    let (:filtered_links) {r.parseInternalLinks(links_collection)}
     it "filters links by host" do
       filtered_links.each do |link|
-        expect(link).to include(
+        expect(link).to include("www.cnet.com")
       end
     end
+  end
+
+  describe "#parseInternalVisitableLinks" do
+    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
       filtered_links.each do |link|
         expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
       end
     end
   end
+
   describe "#parseFiles" do
+    let(:file_list) {r.parseFiles(links_collection)}
     it "filters links by filetype" do
       file_list.each do |link|
         expect(link).to include(".exe")
data/spec/target_spec.rb
ADDED
@@ -0,0 +1,39 @@
+require 'retriever'
+require 'open-uri'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/")
+
+describe "Target" do
+
+  it "creates target var" do
+    expect(t.target).to eq("http://www.cnet.com/reviews/")
+  end
+
+  it "creates host var" do
+    expect(t.host).to eq("www.cnet.com")
+  end
+
+  it "creates host_re var" do
+    expect(t.host_re).to eq(/www.cnet.com/)
+  end
+
+  it "adds protocol to Target URL if none given" do
+    expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
+  end
+
+  it "fails if given URL has no dot in it" do
+    expect{Retriever::Target.new("cnetcom")}.to raise_error
+  end
+
+  describe "#source" do
+
+    it "opens URL and returns source as String" do
+      expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
+    end
+
+    it "fails if target redirects to new host" do
+      expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
+    end
+  end
+
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Joe Norton
@@ -122,10 +122,14 @@ files:
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
 - lib/retriever/fetchsitemap.rb
+- lib/retriever/link.rb
+- lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
+- spec/link_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
+- spec/target_spec.rb
 homepage: http://www.softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT