rubyretriever 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: f27793b1294bd489c3338aa5bc739dc6058d479a
- data.tar.gz: e48491765a741087ba7bf708d62a8bbf8aa3cf80
+ metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
+ data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
  SHA512:
- metadata.gz: 60c017cf5dda8c659b12c3146f565df743dc4f6cf0e8436889bebb86cdccd5b128cca233d76b1413f526c21d1d99478c8149e6c43772e5dd0db1067f8dea5263
- data.tar.gz: ec8142f3a8cbd75861c74322dd0358da51ee45388376ca0118d0998b81cf302b182eef93c39d88ef17dd35ef80a6253bf7a058e2e2052ea60ea3266d401892ab
+ metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
+ data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
data/bin/rr CHANGED
@@ -1,5 +1,6 @@
  #! /usr/bin/env ruby
  require 'retriever'
+ require 'optparse'
  options = {}
  optparse = OptionParser.new do|opts|
  # Set a banner, displayed at the top
@@ -61,8 +62,8 @@ ARGV.each do|q|
  end
  puts "###############################"
  puts "### [RubyRetriever] go fetch #{q}"
- test = Retriever::FetchFiles.new(q, options) if options[:fileharvest]
- test = Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+ Retriever::FetchFiles.new(q, options) if options[:fileharvest]
+ Retriever::FetchSitemap.new(q, options) if options[:sitemap]
  puts "### [RubyRetriever] is done."
  puts "###############################"
  puts
data/lib/retriever/fetch.rb CHANGED
@@ -1,19 +1,20 @@
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'ruby-progressbar'
+ require 'open-uri'
+ require 'csv'
+ require 'bloomfilter-rb'
+
  module Retriever
  class Fetch
- attr_reader :target, :host, :host_re, :maxPages
+ attr_reader :maxPages, :t
  #constants
- HTTP_RE = Regexp.new(/^http/i).freeze
  HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
  NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
- SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
- DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
- NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
- DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
 
  def initialize(url,options)
- new_uri = URI(url)
- @target = new_uri.to_s
- @host = new_uri.host
+ @t = Retriever::Target.new(url)
  #OPTIONS
  @prgrss = options[:progress] ? options[:progress] : false
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -24,14 +25,13 @@ module Retriever
  @s = options[:sitemap] ? options[:sitemap] : false
  @autodown = options[:autodown] ? true : false
  #
- @host_re = Regexp.new(host).freeze
  if @fh
  tempExtStr = "."+@file_ext+'\z'
  @file_re = Regexp.new(tempExtStr).freeze
  else
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
  if !@output
- @output = "rr-#{@host.split('.')[1]}"
+ @output = "rr-#{@t.host.split('.')[1]}"
  end
  end
  if @prgrss
@@ -45,7 +45,7 @@ module Retriever
  @progressbar = ProgressBar.create(prgressVars)
  end
  @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @already_crawled.insert(@target)
+ @already_crawled.insert(@t.target)
  end
  def errlog(msg)
  raise "ERROR: #{msg}"
@@ -56,10 +56,10 @@ module Retriever
  def dump(data)
  puts "###############################"
  if @s
- puts "#{@target} Sitemap"
+ puts "#{@t.target} Sitemap"
  puts "Page Count: #{data.size}"
  elsif @fh
- puts "Target URL: #{@target}"
+ puts "Target URL: #{@t.target}"
  puts "Filetype: #{@file_ext}"
  puts "File Count: #{data.size}"
  else
@@ -84,58 +84,20 @@ module Retriever
  puts
  end
  end
- def fetchPage(url)
- resp = false
- EM.synchrony do
- begin
- resp = EventMachine::HttpRequest.new(url).get
- rescue StandardError => e
- #puts e.message + " ## " + url
- #the trap abrt is nescessary to handle the SSL error
- #for some ungodly reason it's the only way I found to handle it
- trap("ABRT"){
- puts "#{url} failed SSL Certification Verification"
- }
- return false
- end
- lg("URL Crawled: #{url}")
- EventMachine.stop
- end
- if resp.response == ""
- errlog("Domain is not working. Try the non-WWW version.")
- end
- return resp.response.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
- end
  #recieves page source as string
  #returns array of unique href links
  def fetchLinks(doc)
  return false if !doc
- linkArray = []
- doc.scan(HREF_CONTENTS_RE) do |arr| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
- link = arr[0]
- if (!(HTTP_RE =~ link))
- if (DUB_DUB_DUB_DOT_RE =~ link)
- link = "http://#{link}"
- elsif SINGLE_SLASH_RE =~ link #link uses relative path
- link = "http://#{@host}"+link #appending hostname to relative paths
- elsif DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
- link = "http:#{link}" #appending current url to relative paths
- elsif (NO_SLASH_PAGE_RE =~ link) #link uses relative path with no slashes at all, people actually this - imagine that.
- link = "http://#{@host}"+"/"+link #appending hostname and slashy to create full paths
- else
- next
- end
- end
- linkArray.push(link)
- end
- linkArray.uniq!
+ doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ link = match[0]
+ Link.new(@t.host, link).path
+ end.uniq
  end
  def parseInternalLinks(all_links)
- if all_links
- all_links.select{ |linky| (@host_re =~ linky && (!(NONPAGE_EXT_RE =~linky)))}
- else
- return false
- end
+ all_links.select{ |linky| (@t.host_re =~ linky) }
+ end
+ def parseInternalVisitableLinks(all_links)
+ parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
  end
  def async_crawl_and_collect()
  while (@already_crawled.size < @maxPages)
@@ -147,8 +109,6 @@ module Retriever
  end
  break;
  end
- #puts "New loop"
- #puts @linkStack
  new_links_arr = self.asyncGetWave()
  next if (new_links_arr.nil? || new_links_arr.empty?)
  new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
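
In short, fetchLinks now delegates URL normalization to the new Link class, and the old parseInternalLinks is split into a plain host filter plus a separate parseInternalVisitableLinks step that drops non-page assets. A rough sketch of how the reworked helpers fit together (the sample HTML is illustrative):

  r = Retriever::Fetch.new('http://www.cnet.com/reviews/', {})
  links = r.fetchLinks("<a href='/test.html'>test</a> <a href='http://www.cnet.com/style.css'>css</a>")
  # => ["http://www.cnet.com/test.html", "http://www.cnet.com/style.css"]
  r.parseInternalLinks(links)           # keeps everything matching the target host, stylesheets included
  r.parseInternalVisitableLinks(links)  # additionally drops NONPAGE_EXT_RE matches such as the .css link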
data/lib/retriever/fetchfiles.rb CHANGED
@@ -4,16 +4,17 @@ module Retriever
  def initialize(url,options)
  super
  @fileStack = []
- all_links = self.fetchLinks(fetchPage(@target))
- @linkStack = self.parseInternalLinks(all_links)
+ all_links = self.fetchLinks(@t.source)
+ @linkStack = self.parseInternalVisitableLinks(all_links)
+ lg("URL Crawled: #{@t.target}")
  self.lg("#{@linkStack.size-1} new links found")
 
  tempFileCollection = self.parseFiles(all_links)
  @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
  self.lg("#{@fileStack.size} new files found")
- errlog("Bad URL -- #{@target}") if !@linkStack
+ errlog("Bad URL -- #{@t.target}") if !@linkStack
 
- @linkStack.delete(@target) if @linkStack.include?(@target)
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
 
  self.async_crawl_and_collect()
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -3,12 +3,13 @@ module Retriever
  attr_reader :sitemap
  def initialize(url,options)
  super
- @sitemap = [@target]
- @linkStack = self.parseInternalLinks(self.fetchLinks(fetchPage(@target)))
+ @sitemap = [@t.target]
+ @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+ lg("URL Crawled: #{@t.target}")
  self.lg("#{@linkStack.size-1} new links found")
- errlog("Bad URL -- #{@target}") if !@linkStack
+ errlog("Bad URL -- #{@t.target}") if !@linkStack
 
- @linkStack.delete(@target) if @linkStack.include?(@target)
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
  @sitemap.concat(@linkStack)
 
@@ -16,14 +17,13 @@ module Retriever
 
  @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
  @sitemap.uniq!
- @sitemap = @sitemap.take(@maxPages) if (@sitemap.size+1 > @maxPages)
 
  self.dump(self.sitemap)
  self.write(self.sitemap) if /CSV/i =~ @s
  self.gen_xml(self.sitemap) if /XML/i =~ @s
  end
  def gen_xml(data)
- f = File.open("sitemap-#{@host.split('.')[1]}.xml", 'w+')
+ f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
  data.each do |url|
  f << "<url><loc>#{url}</loc></url>"
@@ -31,7 +31,7 @@ module Retriever
  f << "</urlset>"
  f.close
  puts "###############################"
- puts "File Created: sitemap-#{@host.split('.')[1]}.xml"
+ puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
  puts "Object Count: #{@sitemap.size}"
  puts "###############################"
  puts
data/lib/retriever/link.rb ADDED
@@ -0,0 +1,29 @@
+ module Retriever
+ class Link
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ SINGLE_SLASH_RE = Regexp.new(/^\/{1}[^\/]/).freeze
+ DOUBLE_SLASH_RE = Regexp.new(/^\/{2}[^\/]/).freeze
+ NO_SLASH_PAGE_RE = Regexp.new(/^[a-z0-9\-\_\=\?\.]+\z/ix).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+ def initialize(host, link)
+ @host = host
+ @link = link
+ end
+
+ def path
+ return link if HTTP_RE =~ link
+
+ return "http://#{link}" if DUB_DUB_DUB_DOT_RE =~ link
+
+ return "http://#{host}#{link}" if SINGLE_SLASH_RE =~ link
+
+ return "http:#{link}" if DOUBLE_SLASH_RE =~ link #link begins with '//' (maybe a messed up link?)
+
+ return "http://#{host}/#{link}" if NO_SLASH_PAGE_RE =~ link #link uses relative path with no slashes at all, people actually this - imagine that.
+ end
+
+ private
+ attr_reader :host, :link
+ end
+ end
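
The new Link#path centralizes the URL normalization that fetchLinks previously did inline. A minimal sketch of its behavior, case by case (the host and hrefs are illustrative, and the '//' example assumes a hypothetical CDN URL):

  host = 'www.cnet.com'
  Retriever::Link.new(host, 'http://www.cnet.com/reviews/').path  # => "http://www.cnet.com/reviews/" (already absolute, returned as-is)
  Retriever::Link.new(host, 'www.cnet.com/download.exe').path     # => "http://www.cnet.com/download.exe" (www. link, protocol prepended)
  Retriever::Link.new(host, '/test.html').path                    # => "http://www.cnet.com/test.html" (single-slash relative path, host prepended)
  Retriever::Link.new(host, '//cdn.example.com/app.js').path      # => "http://cdn.example.com/app.js" (protocol-relative '//' link)
  Retriever::Link.new(host, 'cpage_18').path                      # => "http://www.cnet.com/cpage_18" (bare relative page name)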
data/lib/retriever/target.rb ADDED
@@ -0,0 +1,41 @@
+ require 'open-uri'
+
+ module Retriever
+ class Target
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+ attr_reader :host, :target, :host_re, :source
+ def initialize(url)
+ url = "http://#{url}" if (!(HTTP_RE =~ url))
+ fail "Bad URL" if (!(/\./ =~ url))
+ new_uri = URI(url)
+ @target = new_uri.to_s
+ @host = new_uri.host
+ @host_re = Regexp.new(@host).freeze
+ end
+
+ def source
+ resp = false
+ begin
+ resp = open(@target)
+ rescue StandardError => e
+ #puts e.message + " ## " + url
+ #the trap abrt is nescessary to handle the SSL error
+ #for some ungodly reason it's the only way I found to handle it
+ trap("ABRT"){
+ puts "#{@target} failed SSL Certification Verification"
+ }
+ return false
+ end
+ if (@target != resp.base_uri.to_s)
+ fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+ end
+ resp = resp.read
+ if resp == ""
+ fail "Domain is not working. Try the non-WWW version."
+ end
+ return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+ end
+
+ end
+ end
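
Target takes over the initial page fetch from the removed Fetch#fetchPage, using open-uri instead of EventMachine. A quick sketch of the interface, mirroring the expectations in spec/target_spec.rb:

  Retriever::Target.new('cnetcom')        # raises "Bad URL" -- no dot in the URL
  t = Retriever::Target.new('cnet.com')   # bare host: "http://" is prepended automatically
  t.target   # => "http://cnet.com"
  t.host     # => "cnet.com"
  t.host_re  # => /cnet.com/
  html = t.source  # fetches the page via open-uri and returns the body as a UTF-8 String;
                   # fails if the request redirects to a different host or the body comes back empty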
data/lib/retriever/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '0.1.3'
+ VERSION = '0.1.4'
  end
data/lib/retriever.rb CHANGED
@@ -1,18 +1,9 @@
- ##################################################################
- #####RubyRetriever -- web crawler and file harvester
- #####created by Joe Norton
- #####http://softwarebyjoe.com
- ##LICENSING: GNU GPLv3 License##################################
- #! usr/bin/ruby
- require 'em-synchrony'
- require 'em-synchrony/em-http'
- require 'em-synchrony/fiber_iterator'
- require 'ruby-progressbar'
- require 'open-uri'
- require 'optparse'
- require 'csv'
- require 'bloomfilter-rb'
-
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
- require 'retriever/fetchsitemap'
+ require 'retriever/fetchsitemap'
+ require 'retriever/link'
+ require 'retriever/target'
+
+ module Retriever
+
+ end
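
With the gem-level requires now living in lib/retriever/fetch.rb, a plain require 'retriever' is all a script (or bin/rr) needs. An illustrative sketch of driving a sitemap crawl programmatically, using the same option keys Fetch#initialize reads:

  require 'retriever'

  # :sitemap selects CSV or XML output; :maxpages caps the crawl
  Retriever::FetchSitemap.new('http://www.cnet.com/', :sitemap => 'CSV', :maxpages => '100')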
data/readme.md CHANGED
@@ -1,6 +1,7 @@
- [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/) [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever)
+ [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
  ==============
-
+ [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
+
  By Joe Norton
 
  RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader, and all around nice buddy to have around.
data/spec/link_spec.rb ADDED
@@ -0,0 +1,66 @@
+ require 'retriever'
+
+ describe "Link" do
+
+ r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
+ let(:links) { r.fetchLinks(@source) }
+
+ it "collects links in anchor tags" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/download.exe'>download</a>
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/download.exe')
+ end
+
+ it "collects links in link tags" do
+ @source = (<<SOURCE).strip
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12')
+ end
+
+ it "does not collect bare links (ones not in an href)" do
+ @source = (<<SOURCE).strip
+ http://www.google.com
+ SOURCE
+
+ expect(links).to_not include('http://www.google.com')
+ end
+
+ it "collects only unique href links on the page" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
+ <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
+ SOURCE
+
+ expect(links).to have(1).items
+ end
+
+ it "adds a protocol to urls missing them (www.)" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ SOURCE
+
+ expect(links).to include('http://www.cnet.com/download.exe')
+ end
+
+ it "doesn't care about any extra attributes on the anchor tag" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ SOURCE
+
+ expect(links).to have(1).item
+ end
+
+ it "returns relative urls with full path based on hostname" do
+ @source = (<<SOURCE).strip
+ <a href='/test.html'>test</a>
+ <a href='cpage_18'>about</a>
+ SOURCE
+
+ expect(links).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
+ end
+ end
data/spec/retriever_spec.rb CHANGED
@@ -1,4 +1,4 @@
- require_relative '../lib/retriever'
+ require 'retriever'
 
  r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
  test_html = "<a href='www.cnet.com/download.exe'>download</a>
@@ -12,49 +12,36 @@ http://www.google.com
  <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
  <a href='cpage_18'>about</a>"
 
- doc = r.fetchPage(r.target)
  links_collection = r.fetchLinks(test_html)
- filtered_links = r.parseInternalLinks(links_collection)
- file_list = r.parseFiles(links_collection)
 
  describe "Fetch" do
 
- describe "#new" do
- it "sets target, host, and max page vars" do
- expect(r.target).to eq("http://www.cnet.com/reviews/")
- expect(r.host).to eq("www.cnet.com")
- expect(r.maxPages).to eq(100)
- end
- end
-
- describe "#fetchPage" do
- it "opens URL and returns source as String" do
- expect(doc.class).to eq(String)
- end
- end
-
  describe "#fetchLinks" do
  it "collects all unique href links on the page" do
  expect(links_collection).to have(6).items
  end
- it "returns relative urls with full path based on hostname" do
- expect(links_collection).to include("http://www.cnet.com/test.html","http://www.cnet.com/cpage_18")
- end
  end
 
  describe "#parseInternalLinks" do
+ let (:filtered_links) {r.parseInternalLinks(links_collection)}
  it "filters links by host" do
  filtered_links.each do |link|
- expect(link).to include(r.host)
+ expect(link).to include("www.cnet.com")
  end
  end
+ end
+
+ describe "#parseInternalVisitableLinks" do
+ let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  filtered_links.each do |link|
  expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
  end
  end
  end
+
  describe "#parseFiles" do
+ let(:file_list) {r.parseFiles(links_collection)}
  it "filters links by filetype" do
  file_list.each do |link|
  expect(link).to include(".exe")
data/spec/target_spec.rb ADDED
@@ -0,0 +1,39 @@
+ require 'retriever'
+ require 'open-uri'
+
+ t = Retriever::Target.new("http://www.cnet.com/reviews/")
+
+ describe "Target" do
+
+ it "creates target var" do
+ expect(t.target).to eq("http://www.cnet.com/reviews/")
+ end
+
+ it "creates host var" do
+ expect(t.host).to eq("www.cnet.com")
+ end
+
+ it "creates host_re var" do
+ expect(t.host_re).to eq(/www.cnet.com/)
+ end
+
+ it "adds protocol to Target URL if none given" do
+ expect(Retriever::Target.new("cnet.com").target).to eq("http://cnet.com")
+ end
+
+ it "fails if given URL has no dot in it" do
+ expect{Retriever::Target.new("cnetcom")}.to raise_error
+ end
+
+ describe "#source" do
+
+ it "opens URL and returns source as String" do
+ expect(Retriever::Target.new("http://techcrunch.com/").source.class).to eq(String)
+ end
+
+ it "fails if target redirects to new host" do
+ expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
+ end
+ end
+
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 0.1.3
+ version: 0.1.4
  platform: ruby
  authors:
  - Joe Norton
@@ -122,10 +122,14 @@ files:
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
  - lib/retriever/fetchsitemap.rb
+ - lib/retriever/link.rb
+ - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
+ - spec/link_spec.rb
  - spec/retriever_spec.rb
  - spec/spec_helper.rb
+ - spec/target_spec.rb
  homepage: http://www.softwarebyjoe.com/rubyretriever/
  licenses:
  - MIT