arachnid 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/arachnid.rb +28 -8
  2. metadata +1 -1
data/lib/arachnid.rb CHANGED
@@ -4,6 +4,7 @@ require 'typhoeus'
4
4
  require 'bloomfilter-rb'
5
5
  require 'nokogiri'
6
6
  require 'domainatrix'
7
+ require 'uri'
7
8
 
8
9
  class Arachnid
9
10
 
@@ -44,7 +45,15 @@ class Arachnid
44
45
 
45
46
  links.each do |link|
46
47
  if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
47
- @global_queue << sanitize_link(split_url_at_hash(link))
48
+
49
+ sanitized_link = sanitize_link(split_url_at_hash(link))
50
+ if(sanitized_link)
51
+
52
+ absolute_link = make_absolute(sanitized_link, response.effective_url)
53
+ if(absolute_link)
54
+ @global_queue << absolute_link
55
+ end
56
+ end
48
57
  end
49
58
  end
50
59
 
@@ -52,13 +61,12 @@ class Arachnid
52
61
 
53
62
  @hydra.queue request
54
63
 
55
- @global_visited.insert(q)
56
- @global_queue.delete(q)
57
-
58
- rescue URI::InvalidURIError => e
59
- @global_visited.insert(q)
60
- @global_queue.delete(q)
64
+ rescue URI::InvalidURIError, NoMethodError => e
65
+ puts "Exception caught: #{e}" if @debug == true
61
66
  end
67
+
68
+ @global_visited.insert(q)
69
+ @global_queue.delete(q)
62
70
  end
63
71
 
64
72
  @hydra.run
@@ -121,7 +129,19 @@ class Arachnid
121
129
  end
122
130
 
123
131
  def sanitize_link(url)
124
- return url.gsub(/\s+/, "%20")
132
+ begin
133
+ return url.gsub(/\s+/, "%20")
134
+ rescue
135
+ return false
136
+ end
137
+ end
138
+
139
+ def make_absolute( href, root )
140
+ begin
141
+ URI.parse(root).merge(URI.parse(href)).to_s
142
+ rescue URI::InvalidURIError, URI::InvalidComponentError => e
143
+ return false
144
+ end
125
145
  end
126
146
 
127
147
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: arachnid
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.1.1
5
+ version: 0.1.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - dchuk