arachnid 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/arachnid.rb +28 -8
  2. metadata +1 -1
data/lib/arachnid.rb CHANGED
@@ -4,6 +4,7 @@ require 'typhoeus'
4
4
  require 'bloomfilter-rb'
5
5
  require 'nokogiri'
6
6
  require 'domainatrix'
7
+ require 'uri'
7
8
 
8
9
  class Arachnid
9
10
 
@@ -44,7 +45,15 @@ class Arachnid
44
45
 
45
46
  links.each do |link|
46
47
  if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
47
- @global_queue << sanitize_link(split_url_at_hash(link))
48
+
49
+ sanitized_link = sanitize_link(split_url_at_hash(link))
50
+ if(sanitized_link)
51
+
52
+ absolute_link = make_absolute(sanitized_link, response.effective_url)
53
+ if(absolute_link)
54
+ @global_queue << absolute_link
55
+ end
56
+ end
48
57
  end
49
58
  end
50
59
 
@@ -52,13 +61,12 @@ class Arachnid
52
61
 
53
62
  @hydra.queue request
54
63
 
55
- @global_visited.insert(q)
56
- @global_queue.delete(q)
57
-
58
- rescue URI::InvalidURIError => e
59
- @global_visited.insert(q)
60
- @global_queue.delete(q)
64
+ rescue URI::InvalidURIError, NoMethodError => e
65
+ puts "Exception caught: #{e}" if @debug == true
61
66
  end
67
+
68
+ @global_visited.insert(q)
69
+ @global_queue.delete(q)
62
70
  end
63
71
 
64
72
  @hydra.run
@@ -121,7 +129,19 @@ class Arachnid
121
129
  end
122
130
 
123
131
  def sanitize_link(url)
124
- return url.gsub(/\s+/, "%20")
132
+ begin
133
+ return url.gsub(/\s+/, "%20")
134
+ rescue
135
+ return false
136
+ end
137
+ end
138
+
139
+ def make_absolute( href, root )
140
+ begin
141
+ URI.parse(root).merge(URI.parse(href)).to_s
142
+ rescue URI::InvalidURIError, URI::InvalidComponentError => e
143
+ return false
144
+ end
125
145
  end
126
146
 
127
147
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: arachnid
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.1.1
5
+ version: 0.1.2
6
6
  platform: ruby
7
7
  authors:
8
8
  - dchuk