arachnid 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/arachnid.rb +13 -26
- metadata +5 -5
data/lib/arachnid.rb
CHANGED
@@ -14,8 +14,7 @@ class Arachnid
|
|
14
14
|
|
15
15
|
@split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
|
16
16
|
@exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
|
17
|
-
@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
|
18
|
-
@proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
|
17
|
+
@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
|
19
18
|
|
20
19
|
@debug = options[:debug] ? options[:debug] : false
|
21
20
|
end
|
@@ -27,8 +26,6 @@ class Arachnid
|
|
27
26
|
#defaults to -1 so it will always keep running until it runs out of urls
|
28
27
|
max_urls = options[:max_urls] ? options[:max_urls] : nil
|
29
28
|
|
30
|
-
|
31
|
-
|
32
29
|
@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
|
33
30
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
|
34
31
|
@global_queue = []
|
@@ -41,11 +38,7 @@ class Arachnid
|
|
41
38
|
temp_queue.each do |q|
|
42
39
|
|
43
40
|
begin
|
44
|
-
|
45
|
-
|
46
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
|
47
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
|
48
|
-
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
|
41
|
+
request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
|
49
42
|
|
50
43
|
request.on_complete do |response|
|
51
44
|
|
@@ -54,7 +47,7 @@ class Arachnid
|
|
54
47
|
links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
|
55
48
|
|
56
49
|
links.each do |link|
|
57
|
-
if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link))
|
50
|
+
if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
|
58
51
|
|
59
52
|
sanitized_link = sanitize_link(split_url_at_hash(link))
|
60
53
|
if(sanitized_link)
|
@@ -77,6 +70,7 @@ class Arachnid
|
|
77
70
|
|
78
71
|
@global_visited.insert(q)
|
79
72
|
@global_queue.delete(q)
|
73
|
+
|
80
74
|
end
|
81
75
|
|
82
76
|
@hydra.run
|
@@ -86,7 +80,7 @@ class Arachnid
|
|
86
80
|
end
|
87
81
|
|
88
82
|
def parse_domain(url)
|
89
|
-
puts "Parsing URL: #{url}" if @debug
|
83
|
+
puts "Parsing URL: #{url}" if @debug
|
90
84
|
|
91
85
|
begin
|
92
86
|
parsed_domain = Domainatrix.parse(url)
|
@@ -97,19 +91,11 @@ class Arachnid
|
|
97
91
|
parsed_domain.domain + '.' + parsed_domain.public_suffix
|
98
92
|
end
|
99
93
|
rescue NoMethodError, Addressable::URI::InvalidURIError => e
|
100
|
-
puts "URL Parsing Exception (#{url}): #{e}"
|
94
|
+
puts "URL Parsing Exception (#{url}): #{e}"
|
101
95
|
return nil
|
102
96
|
end
|
103
97
|
end
|
104
98
|
|
105
|
-
def grab_proxy
|
106
|
-
|
107
|
-
return nil unless @proxy_list
|
108
|
-
|
109
|
-
return @proxy_list.sample.split(':')
|
110
|
-
|
111
|
-
end
|
112
|
-
|
113
99
|
def internal_link?(url, effective_url)
|
114
100
|
|
115
101
|
absolute_url = make_absolute(url, effective_url)
|
@@ -139,16 +125,16 @@ class Arachnid
|
|
139
125
|
end
|
140
126
|
end
|
141
127
|
|
142
|
-
def ignore_extensions(url)
|
128
|
+
def ignore_extensions(url)
|
143
129
|
return true if url.to_s.length == 0
|
144
|
-
return true unless @exclude_urls_with_extensions
|
130
|
+
return true unless @exclude_urls_with_extensions
|
145
131
|
|
146
|
-
extensions = ['.jpg', '.gif', '.png', '.jpeg']
|
147
132
|
not_found = true
|
148
133
|
|
149
|
-
|
150
|
-
if(url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
|
134
|
+
@exclude_urls_with_extensions.each do |e|
|
135
|
+
if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
|
151
136
|
not_found = false
|
137
|
+
puts "#{e} Found At URL: #{url}" if @debug
|
152
138
|
end
|
153
139
|
end
|
154
140
|
|
@@ -164,8 +150,9 @@ class Arachnid
|
|
164
150
|
end
|
165
151
|
|
166
152
|
def make_absolute( href, root )
|
153
|
+
|
167
154
|
begin
|
168
|
-
URI.parse(root).merge(URI.parse(href)).to_s
|
155
|
+
URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
|
169
156
|
rescue URI::InvalidURIError, URI::InvalidComponentError => e
|
170
157
|
return false
|
171
158
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -64,17 +64,17 @@ dependencies:
|
|
64
64
|
requirement: !ruby/object:Gem::Requirement
|
65
65
|
none: false
|
66
66
|
requirements:
|
67
|
-
- - '>='
|
67
|
+
- - ! '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0
|
69
|
+
version: '0'
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
none: false
|
74
74
|
requirements:
|
75
|
-
- - '>='
|
75
|
+
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 0
|
77
|
+
version: '0'
|
78
78
|
description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
|
79
79
|
store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
|
80
80
|
every page on a domain.
|