rubyretriever 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/bin/rr +3 -2
- data/lib/retriever/fetch.rb +22 -62
- data/lib/retriever/fetchfiles.rb +5 -4
- data/lib/retriever/fetchsitemap.rb +7 -7
- data/lib/retriever/link.rb +29 -0
- data/lib/retriever/target.rb +41 -0
- data/lib/retriever/version.rb +1 -1
- data/lib/retriever.rb +7 -16
- data/readme.md +3 -2
- data/spec/link_spec.rb +66 -0
- data/spec/retriever_spec.rb +9 -22
- data/spec/target_spec.rb +39 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
+  data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
+  data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
data/bin/rr
CHANGED
@@ -1,5 +1,6 @@
 #! /usr/bin/env ruby
 require 'retriever'
+require 'optparse'
 options = {}
 optparse = OptionParser.new do|opts|
   # Set a banner, displayed at the top
@@ -61,8 +62,8 @@ ARGV.each do|q|
   end
   puts "###############################"
   puts "### [RubyRetriever] go fetch #{q}"
-
-
+  Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+  Retriever::FetchSitemap.new(q, options) if options[:sitemap]
   puts "### [RubyRetriever] is done."
   puts "###############################"
   puts
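The binary now requires optparse itself and hands each queued URL straight to the matching fetcher. A rough sketch of that dispatch outside the CLI; the :fileharvest and :sitemap keys appear in this diff, but the values below are assumptions since the OptionParser flag definitions are not part of this hunk:

    require 'retriever'

    # Hypothetical, hand-built options hash; in the real binary it is filled in by OptionParser.
    options = { :sitemap => 'xml', :maxpages => '50' }

    url = 'www.cnet.com'
    Retriever::FetchFiles.new(url, options)   if options[:fileharvest]  # harvest files of a given type
    Retriever::FetchSitemap.new(url, options) if options[:sitemap]      # crawl and emit a sitemap (CSV or XML)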
data/lib/retriever/fetch.rb
CHANGED
@@ -1,19 +1,20 @@
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'csv'
+require 'bloomfilter-rb'
+
 module Retriever
   class Fetch
-    attr_reader :
+    attr_reader :maxPages, :t
     #constants
-    HTTP_RE = Regexp.new(/^http/i).freeze
     HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
     NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
-    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
-    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
-    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
-    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(url,options)
-
-      @target = new_uri.to_s
-      @host = new_uri.host
+      @t = Retriever::Target.new(url)
       #OPTIONS
       @prgrss = options[:progress] ? options[:progress] : false
       @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -24,14 +25,13 @@ module Retriever
       @s = options[:sitemap] ? options[:sitemap] : false
       @autodown = options[:autodown] ? true : false
       #
-      @host_re = Regexp.new(host).freeze
       if @fh
         tempExtStr = "."+@file_ext+'\z'
         @file_re = Regexp.new(tempExtStr).freeze
       else
         errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
         if !@output
-          @output = "rr-#{@host.split('.')[1]}"
+          @output = "rr-#{@t.host.split('.')[1]}"
         end
       end
       if @prgrss
@@ -45,7 +45,7 @@ module Retriever
         @progressbar = ProgressBar.create(prgressVars)
       end
       @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-      @already_crawled.insert(@target)
+      @already_crawled.insert(@t.target)
     end
     def errlog(msg)
       raise "ERROR: #{msg}"
@@ -56,10 +56,10 @@ module Retriever
     def dump(data)
       puts "###############################"
       if @s
-        puts "#{@target} Sitemap"
+        puts "#{@t.target} Sitemap"
         puts "Page Count: #{data.size}"
       elsif @fh
-        puts "Target URL: #{@target}"
+        puts "Target URL: #{@t.target}"
         puts "Filetype: #{@file_ext}"
         puts "File Count: #{data.size}"
       else
@@ -84,58 +84,20 @@ module Retriever
         puts
       end
     end
-    def fetchPage(url)
-      resp = false
-      EM.synchrony do
-        begin
-          resp = EventMachine::HttpRequest.new(url).get
-        rescue StandardError => e
-          #puts e.message + " ## " + url
-          #the trap abrt is nescessary to handle the SSL error
-          #for some ungodly reason it's the only way I found to handle it
-          trap("ABRT"){
-            puts "#{url} failed SSL Certification Verification"
-          }
-          return false
-        end
-        lg("URL Crawled: #{url}")
-        EventMachine.stop
-      end
-      if resp.response == ""
-        errlog("Domain is not working. Try the non-WWW version.")
-      end
-      return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
-    end
     #recieves page source as string
     #returns array of unique href links
     def fetchLinks(doc)
       return false if !doc
-
-
-      link
-
-      if (DUB_DUB_DUB_DOT_RE =~ link)
-        link = "http://#{link}"
-      elsif SINGLE_SLASH_RE =~ link #link uses relative path
-        link = "http://#{@host}"+link #appending hostname to relative paths
-      elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
-        link = "http:#{link}" #appending current url to relative paths
-      elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
-        link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
-      else
-        next
-      end
-      end
-      linkArray.push(link)
-      end
-      linkArray.uniq!
+      doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        Link.new(@t.host, link).path
+      end.uniq
     end
     def parseInternalLinks(all_links)
-
-
-
-
-    end
+      all_links.select{ |linky| (@t.host_re =~ linky) }
+    end
+    def parseInternalVisitableLinks(all_links)
+      parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
     end
     def async_crawl_and_collect()
       while (@already_crawled.size < @maxPages)
@@ -147,8 +109,6 @@ module Retriever
        end
        break;
       end
-      #puts "New loop"
-      #puts @linkStack
      new_links_arr = self.asyncGetWave()
      next if (new_links_arr.nil? || new_links_arr.empty?)
      new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
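Net effect on Fetch: page retrieval moves to Target (the old fetchPage is deleted), href normalization moves to Link, and internal-link filtering splits into parseInternalLinks plus the new parseInternalVisitableLinks. A rough sketch of the resulting call flow, using a made-up HTML snippet modeled on the specs; the expected values in comments follow from the code above, not from running the gem:

    require 'retriever'

    r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})

    html = "<a href='/test.html'>test</a> " \
           "<a href='http://www.cnet.com/style.css'>css</a> " \
           "<a href='http://example.org/page'>external</a>"

    all_links = r.fetchLinks(html)
    # => ["http://www.cnet.com/test.html", "http://www.cnet.com/style.css", "http://example.org/page"]

    r.parseInternalLinks(all_links)          # keeps links matching the target host (drops example.org)
    r.parseInternalVisitableLinks(all_links) # additionally drops non-page assets such as .css/.js/images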
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -4,16 +4,17 @@ module Retriever
     def initialize(url,options)
       super
       @fileStack = []
-      all_links = self.fetchLinks(
-      @linkStack = self.
+      all_links = self.fetchLinks(@t.source)
+      @linkStack = self.parseInternalVisitableLinks(all_links)
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")

       tempFileCollection = self.parseFiles(all_links)
       @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
       self.lg("#{@fileStack.size} new files found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)

       self.async_crawl_and_collect()
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -3,12 +3,13 @@ module Retriever
     attr_reader :sitemap
     def initialize(url,options)
       super
-      @sitemap = [@target]
-      @linkStack = self.
+      @sitemap = [@t.target]
+      @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+      lg("URL Crawled: #{@t.target}")
       self.lg("#{@linkStack.size-1} new links found")
-      errlog("Bad URL -- #{@target}") if !@linkStack
+      errlog("Bad URL -- #{@t.target}") if !@linkStack

-      @linkStack.delete(@target) if @linkStack.include?(@target)
+      @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
       @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
       @sitemap.concat(@linkStack)

@@ -16,14 +17,13 @@ module Retriever

       @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
       @sitemap.uniq!
-      @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

       self.dump(self.sitemap)
       self.write(self.sitemap) if /CSV/i =~ @s
       self.gen_xml(self.sitemap) if /XML/i =~ @s
     end
     def gen_xml(data)
-      f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+      f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
       data.each do |url|
         f << "<url><loc>#{url}</loc></url>"
@@ -31,7 +31,7 @@ module Retriever
       f << "</urlset>"
       f.close
       puts "###############################"
-      puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+      puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
       puts "Object Count: #{@sitemap.size}"
       puts "###############################"
       puts
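gen_xml writes every collected URL into a single urlset document named after the second label of the host (e.g. sitemap-cnet.xml for www.cnet.com). A hedged sketch of driving the sitemap crawl directly; the option values are illustrative assumptions, not documented flags:

    require 'retriever'

    # :sitemap matches /XML/i, so gen_xml runs after the crawl and writes sitemap-cnet.xml;
    # a value matching /CSV/i would trigger write() instead.
    Retriever::FetchSitemap.new("www.cnet.com", :sitemap => "xml", :maxpages => "10")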
data/lib/retriever/link.rb
ADDED
@@ -0,0 +1,29 @@
+module Retriever
+  class Link
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+    DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+    NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(host, link)
+      @host = host
+      @link = link
+    end
+
+    def path
+      return link if HTTP_RE =~ link
+
+      return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
+
+      return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
+
+      return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+
+      return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+    end
+
+    private
+    attr_reader :host, :link
+  end
+end
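The new Link class centralizes the href-normalization rules that previously lived inline in Fetch#fetchLinks. An illustrative sketch of how Link#path resolves the different href forms against a host; the return values in comments are inferred from the code above, and the sample host mirrors the specs:

    require 'retriever'

    host = "www.cnet.com"
    Retriever::Link.new(host, "http://www.cnet.com/reviews/").path  # => "http://www.cnet.com/reviews/"   (already absolute)
    Retriever::Link.new(host, "www.cnet.com/download.exe").path     # => "http://www.cnet.com/download.exe"  (protocol added)
    Retriever::Link.new(host, "/test.html").path                    # => "http://www.cnet.com/test.html"  (single-slash relative path)
    Retriever::Link.new(host, "//cdn.cnet.com/lib.js").path         # => "http://cdn.cnet.com/lib.js"     (protocol-relative '//' link)
    Retriever::Link.new(host, "cpage_18").path                      # => "http://www.cnet.com/cpage_18"   (bare relative page)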
data/lib/retriever/target.rb
ADDED
@@ -0,0 +1,41 @@
+require 'open-uri'
+
+module Retriever
+  class Target
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+    attr_reader :host, :target, :host_re, :source
+    def initialize(url)
+      url = "http://#{url}" if (!(HTTP_RE =~ url))
+      fail "Bad URL" if (!(/\./ =~ url))
+      new_uri = URI(url)
+      @target = new_uri.to_s
+      @host = new_uri.host
+      @host_re = Regexp.new(@host).freeze
+    end
+
+    def source
+      resp = false
+      begin
+        resp = open(@target)
+      rescue StandardError => e
+        #puts e.message + " ## " + url
+        #the trap abrt is nescessary to handle the SSL error
+        #for some ungodly reason it's the only way I found to handle it
+        trap("ABRT"){
+          puts "#{@target} failed SSL Certification Verification"
+        }
+        return false
+      end
+      if (@target != resp.base_uri.to_s)
+        fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+      end
+      resp = resp.read
+      if resp == ""
+        fail "Domain is not working. Try the non-WWW version."
+      end
+      return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+    end
+
+  end
+end
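Target now owns URL validation and page retrieval via open-uri, replacing the EventMachine-based Fetch#fetchPage removed above. A minimal usage sketch based on this class and target_spec.rb; note that calling .source performs a live HTTP request:

    require 'retriever'

    t = Retriever::Target.new("cnet.com")  # protocol is prepended when missing
    t.target    # => "http://cnet.com"
    t.host      # => "cnet.com"
    t.host_re   # => /cnet.com/

    html = Retriever::Target.new("http://www.cnet.com/reviews/").source
    # => page source as a UTF-8 String; fails if the body is empty or the request redirects to a different host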
data/lib/retriever/version.rb
CHANGED
data/lib/retriever.rb
CHANGED
@@ -1,18 +1,9 @@
-##################################################################
-#####RubyRetriever -- web crawler and file harvester
-#####created by Joe Norton
-#####http://softwarebyjoe.com
-##LICENSING: GNU GPLv3 License##################################
-#! usr/bin/ruby
-require 'em-synchrony'
-require 'em-synchrony/em-http'
-require 'em-synchrony/fiber_iterator'
-require 'ruby-progressbar'
-require 'open-uri'
-require 'optparse'
-require 'csv'
-require 'bloomfilter-rb'
-
 require 'retriever/fetch'
 require 'retriever/fetchfiles'
-require 'retriever/fetchsitemap'
+require 'retriever/fetchsitemap'
+require 'retriever/link'
+require 'retriever/target'
+
+module Retriever
+
+end
data/readme.md
CHANGED
@@ -1,6 +1,7 @@
-[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
+[RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
 ==============
-
+[![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
+
 By Joe Norton

 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
data/spec/link_spec.rb
ADDED
@@ -0,0 +1,66 @@
+require 'retriever'
+
+describe "Link" do
+
+  r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
+  let(:links) { r.fetchLinks(@source) }
+
+  it "collects links in anchor tags" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "collects links in link tags" do
+    @source = (<<SOURCE).strip
+<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
+  end
+
+  it "does not collect bare links (ones not in an href)" do
+    @source = (<<SOURCE).strip
+http://www.google.com
+SOURCE
+
+    expect(links).to_not include('http://www.google.com')
+  end
+
+  it "collects only unique href links on the page" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets'>gadgets</a>
+<a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
+SOURCE
+
+    expect(links).to have(1).items
+  end
+
+  it "adds a protocol to urls missing them (www.)" do
+    @source = (<<SOURCE).strip
+<a href='www.cnet.com/download.exe'>download</a>
+SOURCE
+
+    expect(links).to include('http://www.cnet.com/download.exe')
+  end
+
+  it "doesn't care about any extra attributes on the anchor tag" do
+    @source = (<<SOURCE).strip
+<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+SOURCE
+
+    expect(links).to have(1).item
+  end
+
+  it "returns relative urls with full path based on hostname" do
+    @source = (<<SOURCE).strip
+<a href='/test.html'>test</a>
+<a href='cpage_18'>about</a>
+SOURCE
+
+    expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+  end
+end
data/spec/retriever_spec.rb
CHANGED
@@ -1,4 +1,4 @@
-
+require 'retriever'

 r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
 test_html = "<a href='www.cnet.com/download.exe'>download</a>
@@ -12,49 +12,36 @@ http://www.google.com
 <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
 <a href='cpage_18'>about</a>"

-doc = r.fetchPage(r.target)
 links_collection = r.fetchLinks(test_html)
-filtered_links = r.parseInternalLinks(links_collection)
-file_list = r.parseFiles(links_collection)

 describe "Fetch" do

-  describe "#new" do
-    it "sets target, host, and max page vars" do
-      expect(r.target).to eq("http://www.cnet.com/reviews/")
-      expect(r.host).to eq("www.cnet.com")
-      expect(r.maxPages).to eq(100)
-    end
-  end
-
-  describe "#fetchPage" do
-    it "opens URL and returns source as String" do
-      expect(doc.class).to eq(String)
-    end
-  end
-
   describe "#fetchLinks" do
     it "collects all unique href links on the page" do
       expect(links_collection).to have(6).items
     end
-    it "returns relative urls with full path based on hostname" do
-      expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
-    end
   end

   describe "#parseInternalLinks" do
+    let (:filtered_links) {r.parseInternalLinks(links_collection)}
     it "filters links by host" do
       filtered_links.each do |link|
-        expect(link).to include(
+        expect(link).to include("www.cnet.com")
       end
     end
+  end
+
+  describe "#parseInternalVisitableLinks" do
+    let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
       filtered_links.each do |link|
        expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
       end
     end
   end
+
   describe "#parseFiles" do
+    let(:file_list) {r.parseFiles(links_collection)}
     it "filters links by filetype" do
       file_list.each do |link|
         expect(link).to include(".exe")
data/spec/target_spec.rb
ADDED
@@ -0,0 +1,39 @@
+require 'retriever'
+require 'open-uri'
+
+t = Retriever::Target.new("http://www.cnet.com/reviews/")
+
+describe "Target" do
+
+  it "creates target var" do
+    expect(t.target).to eq("http://www.cnet.com/reviews/")
+  end
+
+  it "creates host var" do
+    expect(t.host).to eq("www.cnet.com")
+  end
+
+  it "creates host_re var" do
+    expect(t.host_re).to eq(/www.cnet.com/)
+  end
+
+  it "adds protocol to Target URL if none given" do
+    expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
+  end
+
+  it "fails if given URL has no dot in it" do
+    expect{Retriever::Target.new("cnetcom")}.to raise_error
+  end
+
+  describe "#source" do
+
+    it "opens URL and returns source as String" do
+      expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
+    end
+
+    it "fails if target redirects to new host" do
+      expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
+    end
+  end
+
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Joe Norton
@@ -122,10 +122,14 @@ files:
 - lib/retriever/fetch.rb
 - lib/retriever/fetchfiles.rb
 - lib/retriever/fetchsitemap.rb
+- lib/retriever/link.rb
+- lib/retriever/target.rb
 - lib/retriever/version.rb
 - readme.md
+- spec/link_spec.rb
 - spec/retriever_spec.rb
 - spec/spec_helper.rb
+- spec/target_spec.rb
 homepage: http://www.softwarebyjoe.com/rubyretriever/
 licenses:
 - MIT