arachnid 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/arachnid.rb +22 -1
- metadata +1 -1
data/lib/arachnid.rb
CHANGED
@@ -15,6 +15,7 @@ class Arachnid
|
|
15
15
|
@split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
|
16
16
|
@exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
|
17
17
|
@exclude_urls_with_images = options[:exclude_urls_with_images] ? options[:exclude_urls_with_images] : false
|
18
|
+
@proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
|
18
19
|
|
19
20
|
@debug = options[:debug] ? options[:debug] : false
|
20
21
|
end
|
@@ -26,6 +27,8 @@ class Arachnid
|
|
26
27
|
#defaults to nil so it will always keep running until it runs out of urls
|
27
28
|
max_urls = options[:max_urls] ? options[:max_urls] : nil
|
28
29
|
|
30
|
+
|
31
|
+
|
29
32
|
@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
|
30
33
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
|
31
34
|
@global_queue = []
|
@@ -38,7 +41,11 @@ class Arachnid
|
|
38
41
|
temp_queue.each do |q|
|
39
42
|
|
40
43
|
begin
|
41
|
-
|
44
|
+
ip,port,user,pass = grab_proxy
|
45
|
+
|
46
|
+
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
|
47
|
+
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
|
48
|
+
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
|
42
49
|
|
43
50
|
request.on_complete do |response|
|
44
51
|
|
@@ -95,6 +102,20 @@ class Arachnid
|
|
95
102
|
end
|
96
103
|
end
|
97
104
|
|
105
|
+
def internal_link?(url, effective_url)
|
106
|
+
|
107
|
+
absolute_url = make_absolute(url, effective_url)
|
108
|
+
|
109
|
+
parsed_url = parse_domain(absolute_url)
|
110
|
+
|
111
|
+
# Picks one proxy at random from @proxy_list for the next request.
#
# Entries are colon-separated strings; the caller destructures the result as
# ip, port, user, pass, so "ip:port" and "ip:port:user:pass" are both usable
# (missing trailing fields simply come back as nil on destructuring).
#
# @return [Array<String>, nil] the colon-separated fields of the chosen
#   entry, or nil when no proxy list is configured or the list is empty.
def grab_proxy
  # [].sample is nil, so an empty list has to be treated the same as "no
  # proxies configured" instead of crashing on nil.split.
  return nil if !@proxy_list || @proxy_list.empty?

  @proxy_list.sample.split(':')
end
|
118
|
+
|
98
119
|
def internal_link?(url, effective_url)
|
99
120
|
|
100
121
|
absolute_url = make_absolute(url, effective_url)
|