arachnid 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/arachnid.rb +13 -26
- metadata +5 -5
data/lib/arachnid.rb
CHANGED
@@ -14,8 +14,7 @@ class Arachnid
|
|
14
14
|
|
15
15
|
@split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
|
16
16
|
@exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
|
17
|
-
@
|
18
|
-
@proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
|
17
|
+
@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
|
19
18
|
|
20
19
|
@debug = options[:debug] ? options[:debug] : false
|
21
20
|
end
|
@@ -27,8 +26,6 @@ class Arachnid
|
|
27
26
|
#defaults to -1 so it will always keep running until it runs out of urls
|
28
27
|
max_urls = options[:max_urls] ? options[:max_urls] : nil
|
29
28
|
|
30
|
-
|
31
|
-
|
32
29
|
@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
|
33
30
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
|
34
31
|
@global_queue = []
|
@@ -41,11 +38,7 @@ class Arachnid
|
|
41
38
|
temp_queue.each do |q|
|
42
39
|
|
43
40
|
begin
|
44
|
-
|
45
|
-
|
46
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
|
47
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
|
48
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
|
41
|
+
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
|
49
42
|
|
50
43
|
request.on_complete do |response|
|
51
44
|
|
@@ -54,7 +47,7 @@ class Arachnid
|
|
54
47
|
links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
|
55
48
|
|
56
49
|
links.each do |link|
|
57
|
-
if(internal_link?(link, response.effective_url) && !@global_visited.include?(
|
50
|
+
if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
|
58
51
|
|
59
52
|
sanitized_link = sanitize_link(split_url_at_hash(link))
|
60
53
|
if(sanitized_link)
|
@@ -77,6 +70,7 @@ class Arachnid
|
|
77
70
|
|
78
71
|
@global_visited.insert(q)
|
79
72
|
@global_queue.delete(q)
|
73
|
+
|
80
74
|
end
|
81
75
|
|
82
76
|
@hydra.run
|
@@ -86,7 +80,7 @@ class Arachnid
|
|
86
80
|
end
|
87
81
|
|
88
82
|
def parse_domain(url)
|
89
|
-
puts "Parsing URL: #{url}" if @debug
|
83
|
+
puts "Parsing URL: #{url}" if @debug
|
90
84
|
|
91
85
|
begin
|
92
86
|
parsed_domain = Domainatrix.parse(url)
|
@@ -97,19 +91,11 @@ class Arachnid
|
|
97
91
|
parsed_domain.domain + '.' + parsed_domain.public_suffix
|
98
92
|
end
|
99
93
|
rescue NoMethodError, Addressable::URI::InvalidURIError => e
|
100
|
-
puts "URL Parsing Exception (#{url}): #{e}"
|
94
|
+
puts "URL Parsing Exception (#{url}): #{e}"
|
101
95
|
return nil
|
102
96
|
end
|
103
97
|
end
|
104
98
|
|
105
|
-
def grab_proxy
|
106
|
-
|
107
|
-
return nil unless @proxy_list
|
108
|
-
|
109
|
-
return @proxy_list.sample.split(':')
|
110
|
-
|
111
|
-
end
|
112
|
-
|
113
99
|
def internal_link?(url, effective_url)
|
114
100
|
|
115
101
|
absolute_url = make_absolute(url, effective_url)
|
@@ -139,16 +125,16 @@ class Arachnid
|
|
139
125
|
end
|
140
126
|
end
|
141
127
|
|
142
|
-
def
|
128
|
+
def ignore_extensions(url)
|
143
129
|
return true if url.to_s.length == 0
|
144
|
-
return true unless @
|
130
|
+
return true unless @exclude_urls_with_extensions
|
145
131
|
|
146
|
-
extensions = ['.jpg', '.gif', '.png', '.jpeg']
|
147
132
|
not_found = true
|
148
133
|
|
149
|
-
|
150
|
-
if(url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
|
134
|
+
@exclude_urls_with_extensions.each do |e|
|
135
|
+
if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
|
151
136
|
not_found = false
|
137
|
+
puts "#{e} Found At URL: #{url}" if @debug
|
152
138
|
end
|
153
139
|
end
|
154
140
|
|
@@ -164,8 +150,9 @@ class Arachnid
|
|
164
150
|
end
|
165
151
|
|
166
152
|
def make_absolute( href, root )
|
153
|
+
|
167
154
|
begin
|
168
|
-
URI.parse(root).merge(URI.parse(href)).to_s
|
155
|
+
URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
|
169
156
|
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
170
157
|
return false
|
171
158
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -64,17 +64,17 @@ dependencies:
|
|
64
64
|
requirement: !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
|
-
- - '
|
67
|
+
- - ! '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0
|
69
|
+
version: '0'
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
none: false
|
74
74
|
requirements:
|
75
|
-
- - '
|
75
|
+
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 0
|
77
|
+
version: '0'
|
78
78
|
description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
|
79
79
|
store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
|
80
80
|
every page on a domain.
|