arachnid 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/arachnid.rb +22 -1
  2. metadata +1 -1
data/lib/arachnid.rb CHANGED
@@ -15,6 +15,7 @@ class Arachnid
15
15
  @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
16
16
  @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
17
17
  @exclude_urls_with_images = options[:exclude_urls_with_images] ? options[:exclude_urls_with_images] : false
18
+ @proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
18
19
 
19
20
  @debug = options[:debug] ? options[:debug] : false
20
21
  end
@@ -26,6 +27,8 @@ class Arachnid
26
27
  #defaults to -1 so it will always keep running until it runs out of urls
27
28
  max_urls = options[:max_urls] ? options[:max_urls] : nil
28
29
 
30
+
31
+
29
32
  @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
30
33
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
31
34
  @global_queue = []
@@ -38,7 +41,11 @@ class Arachnid
38
41
  temp_queue.each do |q|
39
42
 
40
43
  begin
41
- request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
44
+ ip,port,user,pass = grab_proxy
45
+
46
+ request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
47
+ request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
48
+ request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
42
49
 
43
50
  request.on_complete do |response|
44
51
 
@@ -95,6 +102,20 @@ class Arachnid
95
102
  end
96
103
  end
97
104
 
105
+ def internal_link?(url, effective_url)
106
+
107
+ absolute_url = make_absolute(url, effective_url)
108
+
109
+ parsed_url = parse_domain(absolute_url)
110
+
111
+ def grab_proxy
112
+
113
+ return nil unless @proxy_list
114
+
115
+ return @proxy_list.sample.split(':')
116
+
117
+ end
118
+
98
119
  def internal_link?(url, effective_url)
99
120
 
100
121
  absolute_url = make_absolute(url, effective_url)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: