rubyretriever 0.1.3 → 0.1.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: f27793b1294bd489c3338aa5bc739dc6058d479a
- data.tar.gz: e48491765a741087ba7bf708d62a8bbf8aa3cf80
+ metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
+ data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
  SHA512:
- metadata.gz: 60c017cf5dda8c659b12c3146f565df743dc4f6cf0e8436889bebb86cdccd5b128cca233d76b1413f526c21d1d99478c8149e6c43772e5dd0db1067f8dea5263
- data.tar.gz: ec8142f3a8cbd75861c74322dd0358da51ee45388376ca0118d0998b81cf302b182eef93c39d88ef17dd35ef80a6253bf7a058e2e2052ea60ea3266d401892ab
+ metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
+ data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
data/bin/rr CHANGED
@@ -1,5 +1,6 @@
  #! /usr/bin/env ruby
  require 'retriever'
+ require 'optparse'
  options = {}
  optparse = OptionParser.new do|opts|
  # Set a banner, displayed at the top
@@ -61,8 +62,8 @@ ARGV.each do|q|
  end
  puts "###############################"
  puts "### [RubyRetriever] go fetch #{q}"
- test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
- test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+ Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+ Retriever::FetchSitemap.new(q, options) if options[:sitemap]
  puts "### [RubyRetriever] is done."
  puts "###############################"
  puts
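The dropped `test =` assignments reflect that these constructors perform the crawl as a side effect, so their return value is never used. Roughly equivalent direct library calls, shown only as an illustrative sketch (the URLs and option values here are made up; the option keys are the ones visible elsewhere in this diff):

    require 'retriever'

    opts = { :sitemap => 'CSV', :maxpages => '100' }            # illustrative values
    Retriever::FetchSitemap.new('http://www.cnet.com/', opts)   # crawls and writes the sitemap

    opts = { :fileharvest => true, :file_ext => 'exe' }         # illustrative values
    Retriever::FetchFiles.new('http://www.cnet.com/', opts)     # crawls and collects matching files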
data/lib/retriever/fetch.rb CHANGED
@@ -1,19 +1,20 @@
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'ruby-progressbar'
+ require 'open-uri'
+ require 'csv'
+ require 'bloomfilter-rb'
+
  module Retriever
  class Fetch
- attr_reader :target, :host, :host_re, :maxPages
+ attr_reader :maxPages, :t
  #constants
- HTTP_RE = Regexp.new(/^http/i).freeze
  HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
  NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
- SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
- DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
- NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze

  def initialize(url,options)
- new_uri = URI(url)
- @target = new_uri.to_s
- @host = new_uri.host
+ @t = Retriever::Target.new(url)
  #OPTIONS
  @prgrss = options[:progress] ? options[:progress] : false
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -24,14 +25,13 @@ module Retriever
  @s = options[:sitemap] ? options[:sitemap] : false
  @autodown = options[:autodown] ? true : false
  #
- @host_re = Regexp.new(host).freeze
  if @fh
  tempExtStr = "."+@file_ext+'\z'
  @file_re = Regexp.new(tempExtStr).freeze
  else
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
  if !@output
- @output = "rr-#{@host.split('.')[1]}"
+ @output = "rr-#{@t.host.split('.')[1]}"
  end
  end
  if @prgrss
@@ -45,7 +45,7 @@ module Retriever
  @progressbar = ProgressBar.create(prgressVars)
  end
  @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @already_crawled.insert(@target)
+ @already_crawled.insert(@t.target)
  end
  def errlog(msg)
  raise "ERROR: #{msg}"
@@ -56,10 +56,10 @@ module Retriever
  def dump(data)
  puts "###############################"
  if @s
- puts "#{@target} Sitemap"
+ puts "#{@t.target} Sitemap"
  puts "Page Count: #{data.size}"
  elsif @fh
- puts "Target URL: #{@target}"
+ puts "Target URL: #{@t.target}"
  puts "Filetype: #{@file_ext}"
  puts "File Count: #{data.size}"
  else
@@ -84,58 +84,20 @@ module Retriever
  puts
  end
  end
- def fetchPage(url)
- resp = false
- EM.synchrony do
- begin
- resp = EventMachine::HttpRequest.new(url).get
- rescue StandardError => e
- #puts e.message + " ## " + url
- #the trap abrt is nescessary to handle the SSL error
- #for some ungodly reason it's the only way I found to handle it
- trap("ABRT"){
- puts "#{url} failed SSL Certification Verification"
- }
- return false
- end
- lg("URL Crawled: #{url}")
- EventMachine.stop
- end
- if resp.response == ""
- errlog("Domain is not working. Try the non-WWW version.")
- end
- return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
- end
  #recieves page source as string
  #returns array of unique href links
  def fetchLinks(doc)
  return false if !doc
- linkArray = []
- doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
- link = arr[0]
- if (!(HTTP_RE =~ link))
- if (DUB_DUB_DUB_DOT_RE =~ link)
- link = "http://#{link}"
- elsif SINGLE_SLASH_RE =~ link #link uses relative path
- link = "http://#{@host}"+link #appending hostname to relative paths
- elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
- link = "http:#{link}" #appending current url to relative paths
- elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
- link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
- else
- next
- end
- end
- linkArray.push(link)
- end
- linkArray.uniq!
+ doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ link = match[0]
+ Link.new(@t.host, link).path
+ end.uniq
  end
  def parseInternalLinks(all_links)
- if all_links
- all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~linky)))}
- else
- return false
- end
+ all_links.select{ |linky| (@t.host_re =~ linky) }
+ end
+ def parseInternalVisitableLinks(all_links)
+ parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
  end
  def async_crawl_and_collect()
  while (@already_crawled.size < @maxPages)
@@ -147,8 +109,6 @@ module Retriever
  end
  break;
  end
- #puts "New loop"
- #puts @linkStack
  new_links_arr = self.asyncGetWave()
  next if (new_links_arr.nil? || new_links_arr.empty?)
  new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
data/lib/retriever/fetchfiles.rb CHANGED
@@ -4,16 +4,17 @@ module Retriever
  def initialize(url,options)
  super
  @fileStack = []
- all_links = self.fetchLinks(fetchPage(@target))
- @linkStack = self.parseInternalLinks(all_links)
+ all_links = self.fetchLinks(@t.source)
+ @linkStack = self.parseInternalVisitableLinks(all_links)
+ lg("URL Crawled: #{@t.target}")
  self.lg("#{@linkStack.size-1} new links found")

  tempFileCollection = self.parseFiles(all_links)
  @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
  self.lg("#{@fileStack.size} new files found")
- errlog("Bad URL -- #{@target}") if !@linkStack
+ errlog("Bad URL -- #{@t.target}") if !@linkStack

- @linkStack.delete(@target) if @linkStack.include?(@target)
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)

  self.async_crawl_and_collect()
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -3,12 +3,13 @@ module Retriever
  attr_reader :sitemap
  def initialize(url,options)
  super
- @sitemap = [@target]
- @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+ @sitemap = [@t.target]
+ @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+ lg("URL Crawled: #{@t.target}")
  self.lg("#{@linkStack.size-1} new links found")
- errlog("Bad URL -- #{@target}") if !@linkStack
+ errlog("Bad URL -- #{@t.target}") if !@linkStack

- @linkStack.delete(@target) if @linkStack.include?(@target)
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
  @sitemap.concat(@linkStack)

@@ -16,14 +17,13 @@ module Retriever

  @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
  @sitemap.uniq!
- @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)

  self.dump(self.sitemap)
  self.write(self.sitemap) if /CSV/i =~ @s
  self.gen_xml(self.sitemap) if /XML/i =~ @s
  end
  def gen_xml(data)
- f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+ f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
  data.each do |url|
  f << "<url><loc>#{url}</loc></url>"
@@ -31,7 +31,7 @@ module Retriever
  f << "</urlset>"
  f.close
  puts "###############################"
- puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+ puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
  puts "Object Count: #{@sitemap.size}"
  puts "###############################"
  puts
data/lib/retriever/link.rb ADDED
@@ -0,0 +1,29 @@
+ module Retriever
+ class Link
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+ DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+ NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+ def initialize(host, link)
+ @host = host
+ @link = link
+ end
+
+ def path
+ return link if HTTP_RE =~ link
+
+ return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
+
+ return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
+
+ return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+
+ return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+ end
+
+ private
+ attr_reader :host, :link
+ end
+ end
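The new Link class extracts the URL-normalization branches that previously lived inline in Fetch#fetchLinks. A minimal sketch of how each branch resolves a raw href against a host (illustrative calls, not taken from the gem; the expected values follow the regexes above and the cases covered in spec/link_spec.rb below):

    require 'retriever'

    Retriever::Link.new('www.cnet.com', 'http://www.cnet.com/reviews/').path
    # => "http://www.cnet.com/reviews/"       (already absolute: HTTP_RE)

    Retriever::Link.new('www.cnet.com', 'www.cnet.com/download.exe').path
    # => "http://www.cnet.com/download.exe"   (protocol added: DUB_DUB_DUB_DOT_RE)

    Retriever::Link.new('www.cnet.com', '/test.html').path
    # => "http://www.cnet.com/test.html"      (host prepended: SINGLE_SLASH_RE)

    Retriever::Link.new('www.cnet.com', 'cpage_18').path
    # => "http://www.cnet.com/cpage_18"       (host and slash prepended: NO_SLASH_PAGE_RE)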
data/lib/retriever/target.rb ADDED
@@ -0,0 +1,41 @@
+ require 'open-uri'
+
+ module Retriever
+ class Target
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+ attr_reader :host, :target, :host_re, :source
+ def initialize(url)
+ url = "http://#{url}" if (!(HTTP_RE =~ url))
+ fail "Bad URL" if (!(/\./ =~ url))
+ new_uri = URI(url)
+ @target = new_uri.to_s
+ @host = new_uri.host
+ @host_re = Regexp.new(@host).freeze
+ end
+
+ def source
+ resp = false
+ begin
+ resp = open(@target)
+ rescue StandardError => e
+ #puts e.message + " ## " + url
+ #the trap abrt is nescessary to handle the SSL error
+ #for some ungodly reason it's the only way I found to handle it
+ trap("ABRT"){
+ puts "#{@target} failed SSL Certification Verification"
+ }
+ return false
+ end
+ if (@target != resp.base_uri.to_s)
+ fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+ end
+ resp = resp.read
+ if resp == ""
+ fail "Domain is not working. Try the non-WWW version."
+ end
+ return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+ end
+
+ end
+ end
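Target now owns the URL parsing and page fetching that Fetch previously did itself, replacing the old EventMachine-based fetchPage with open-uri. A brief usage sketch, mirroring the expectations in spec/target_spec.rb further down (illustrative calls, not part of the gem):

    require 'retriever'

    t = Retriever::Target.new('cnet.com')   # a missing protocol gets "http://" prepended
    t.target                                # => "http://cnet.com"
    t.host                                  # => "cnet.com"
    t.host_re                               # => /cnet.com/
    t.source                                # fetches the page via open-uri and returns UTF-8 source,
                                            # raising if the site redirects to a different host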
data/lib/retriever/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '0.1.3'
+ VERSION = '0.1.4'
  end
data/lib/retriever.rb CHANGED
@@ -1,18 +1,9 @@
- ##################################################################
- #####RubyRetriever -- web crawler and file harvester
- #####created by Joe Norton
- #####http://softwarebyjoe.com
- ##LICENSING: GNU GPLv3 License##################################
- #! usr/bin/ruby
- require 'em-synchrony'
- require 'em-synchrony/em-http'
- require 'em-synchrony/fiber_iterator'
- require 'ruby-progressbar'
- require 'open-uri'
- require 'optparse'
- require 'csv'
- require 'bloomfilter-rb'
-
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
- require 'retriever/fetchsitemap'
+ require 'retriever/fetchsitemap'
+ require 'retriever/link'
+ require 'retriever/target'
+
+ module Retriever
+
+ end
data/readme.md CHANGED
@@ -1,6 +1,7 @@
- [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/) [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever)
+ [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
  ==============
-
+ [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
+
  By Joe Norton

  RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
data/spec/link_spec.rb ADDED
@@ -0,0 +1,66 @@
+ require 'retriever'
+
+ describe "Link" do
+
+ r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
+ let(:links) { r.fetchLinks(@source) }
+
+ it "collects links in anchor tags" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/download.exe'>download</a>
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/download.exe')
+ end
+
+ it "collects links in link tags" do
+ @source = (<<SOURCE).strip
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
+ end
+
+ it "does not collect bare links (ones not in an href)" do
+ @source = (<<SOURCE).strip
+ http://www.google.com
+ SOURCE
+
+ expect(links).to_not include('http://www.google.com')
+ end
+
+ it "collects only unique href links on the page" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
+ <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
+ SOURCE
+
+ expect(links).to have(1).items
+ end
+
+ it "adds a protocol to urls missing them (www.)" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/download.exe')
+ end
+
+ it "doesn't care about any extra attributes on the anchor tag" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ SOURCE
+
+ expect(links).to have(1).item
+ end
+
+ it "returns relative urls with full path based on hostname" do
+ @source = (<<SOURCE).strip
+ <a href='/test.html'>test</a>
+ <a href='cpage_18'>about</a>
+ SOURCE
+
+ expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+ end
+ end
data/spec/retriever_spec.rb CHANGED
@@ -1,4 +1,4 @@
- require_relative '../lib/retriever'
+ require 'retriever'

  r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
  test_html = "<a href='www.cnet.com/download.exe'>download</a>
@@ -12,49 +12,36 @@ http://www.google.com
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
  <a href='cpage_18'>about</a>"

- doc = r.fetchPage(r.target)
  links_collection = r.fetchLinks(test_html)
- filtered_links = r.parseInternalLinks(links_collection)
- file_list = r.parseFiles(links_collection)

  describe "Fetch" do

- describe "#new" do
- it "sets target, host, and max page vars" do
- expect(r.target).to eq("http://www.cnet.com/reviews/")
- expect(r.host).to eq("www.cnet.com")
- expect(r.maxPages).to eq(100)
- end
- end
-
- describe "#fetchPage" do
- it "opens URL and returns source as String" do
- expect(doc.class).to eq(String)
- end
- end
-
  describe "#fetchLinks" do
  it "collects all unique href links on the page" do
  expect(links_collection).to have(6).items
  end
- it "returns relative urls with full path based on hostname" do
- expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
- end
  end

  describe "#parseInternalLinks" do
+ let (:filtered_links) {r.parseInternalLinks(links_collection)}
  it "filters links by host" do
  filtered_links.each do |link|
- expect(link).to include(r.host)
+ expect(link).to include("www.cnet.com")
  end
  end
+ end
+
+ describe "#parseInternalVisitableLinks" do
+ let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  filtered_links.each do |link|
  expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
  end
  end
  end
+
  describe "#parseFiles" do
+ let(:file_list) {r.parseFiles(links_collection)}
  it "filters links by filetype" do
  file_list.each do |link|
  expect(link).to include(".exe")
data/spec/target_spec.rb ADDED
@@ -0,0 +1,39 @@
+ require 'retriever'
+ require 'open-uri'
+
+ t = Retriever::Target.new("http://www.cnet.com/reviews/")
+
+ describe "Target" do
+
+ it "creates target var" do
+ expect(t.target).to eq("http://www.cnet.com/reviews/")
+ end
+
+ it "creates host var" do
+ expect(t.host).to eq("www.cnet.com")
+ end
+
+ it "creates host_re var" do
+ expect(t.host_re).to eq(/www.cnet.com/)
+ end
+
+ it "adds protocol to Target URL if none given" do
+ expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
+ end
+
+ it "fails if given URL has no dot in it" do
+ expect{Retriever::Target.new("cnetcom")}.to raise_error
+ end
+
+ describe "#source" do
+
+ it "opens URL and returns source as String" do
+ expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
+ end
+
+ it "fails if target redirects to new host" do
+ expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
+ end
+ end
+
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 0.1.3
+ version: 0.1.4
  platform: ruby
  authors:
  - Joe Norton
@@ -122,10 +122,14 @@ files:
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
  - lib/retriever/fetchsitemap.rb
+ - lib/retriever/link.rb
+ - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
+ - spec/link_spec.rb
  - spec/retriever_spec.rb
  - spec/spec_helper.rb
+ - spec/target_spec.rb
  homepage: http://www.softwarebyjoe.com/rubyretriever/
  licenses:
  - MIT