arachnid 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/arachnid.rb +28 -8
- metadata +1 -1
data/lib/arachnid.rb
CHANGED
@@ -4,6 +4,7 @@ require 'typhoeus'
|
|
4
4
|
require 'bloomfilter-rb'
|
5
5
|
require 'nokogiri'
|
6
6
|
require 'domainatrix'
|
7
|
+
require 'uri'
|
7
8
|
|
8
9
|
class Arachnid
|
9
10
|
|
@@ -44,7 +45,15 @@ class Arachnid
|
|
44
45
|
|
45
46
|
links.each do |link|
|
46
47
|
if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
|
47
|
-
|
48
|
+
|
49
|
+
sanitized_link = sanitize_link(split_url_at_hash(link))
|
50
|
+
if(sanitized_link)
|
51
|
+
|
52
|
+
absolute_link = make_absolute(sanitized_link, response.effective_url)
|
53
|
+
if(absolute_link)
|
54
|
+
@global_queue << absolute_link
|
55
|
+
end
|
56
|
+
end
|
48
57
|
end
|
49
58
|
end
|
50
59
|
|
@@ -52,13 +61,12 @@ class Arachnid
|
|
52
61
|
|
53
62
|
@hydra.queue request
|
54
63
|
|
55
|
-
|
56
|
-
@
|
57
|
-
|
58
|
-
rescue URI::InvalidURIError => e
|
59
|
-
@global_visited.insert(q)
|
60
|
-
@global_queue.delete(q)
|
64
|
+
rescue URI::InvalidURIError, NoMethodError => e
|
65
|
+
puts "Exception caught: #{e}" if @debug == true
|
61
66
|
end
|
67
|
+
|
68
|
+
@global_visited.insert(q)
|
69
|
+
@global_queue.delete(q)
|
62
70
|
end
|
63
71
|
|
64
72
|
@hydra.run
|
@@ -121,7 +129,19 @@ class Arachnid
|
|
121
129
|
end
|
122
130
|
|
123
131
|
def sanitize_link(url)
|
124
|
-
|
132
|
+
begin
|
133
|
+
return url.gsub(/\s+/, "%20")
|
134
|
+
rescue
|
135
|
+
return false
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
def make_absolute( href, root )
|
140
|
+
begin
|
141
|
+
URI.parse(root).merge(URI.parse(href)).to_s
|
142
|
+
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
143
|
+
return false
|
144
|
+
end
|
125
145
|
end
|
126
146
|
|
127
147
|
end
|