arachnid 0.3.2 → 0.4.0

Files changed (2):
  1. data/lib/arachnid.rb +13 -26
  2. metadata +5 -5
data/lib/arachnid.rb CHANGED
@@ -14,8 +14,7 @@ class Arachnid
 
     @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
     @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
-    @exclude_urls_with_images = options[:exclude_urls_with_images] ? options[:exclude_urls_with_images] : false
-    @proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
+    @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
 
     @debug = options[:debug] ? options[:debug] : false
   end
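The :exclude_urls_with_images and :proxy_list options are removed; a single :exclude_urls_with_extensions option now takes a caller-supplied list of extensions to skip. A minimal sketch of the new options hash, assuming an Arachnid.new(start_url, options) signature to match the option handling above (the URL and extension list are illustrative):

    # Sketch only: the start URL and extension list are illustrative, and the
    # Arachnid.new(start_url, options) signature is assumed from the diff above.
    spider = Arachnid.new('http://www.example.com', {
      :split_url_at_hash            => true,
      :exclude_urls_with_hash       => false,
      :exclude_urls_with_extensions => ['.pdf', '.jpg', '.jpeg', '.png', '.gif'],
      :debug                        => true
    })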
@@ -27,8 +26,6 @@ class Arachnid
     #defaults to -1 so it will always keep running until it runs out of urls
     max_urls = options[:max_urls] ? options[:max_urls] : nil
 
-
-
     @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
     @global_queue = []
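For context on the unchanged lines: visited URLs go into a Bloom filter (bloomfilter-rb) rather than a plain Set, which keeps memory bounded at the cost of a small false-positive rate, so a never-visited URL can occasionally be skipped. The same calls in isolation:

    require 'bloomfilter-rb'

    # Same parameters the crawler passes above: ~1M slots, 5 hash functions.
    visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)

    visited.insert('http://www.example.com/')
    visited.include?('http://www.example.com/')     # => true
    visited.include?('http://www.example.com/page') # => false (usually; false positives are possible)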
@@ -41,11 +38,7 @@ class Arachnid
       temp_queue.each do |q|
 
         begin
-          ip,port,user,pass = grab_proxy
-
-          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
-          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
-          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
+          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
 
           request.on_complete do |response|
 
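Proxy support is gone entirely: grab_proxy and the three conditional request builds collapse into a single plain Typhoeus::Request. The queue/callback/run pattern the crawler uses, reduced to a standalone sketch (the URL and concurrency are illustrative; hydra.queue is the standard Typhoeus way to enqueue a request):

    require 'typhoeus'

    hydra = Typhoeus::Hydra.new(:max_concurrency => 10)

    request = Typhoeus::Request.new('http://www.example.com', :timeout => 10000, :follow_location => true)
    request.on_complete do |response|
      # effective_url reflects any redirects followed via :follow_location
      puts "#{response.effective_url} => #{response.code}"
    end

    hydra.queue(request)
    hydra.run # blocks until every queued request has completed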
@@ -54,7 +47,7 @@ class Arachnid
             links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
 
             links.each do |link|
-              if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
+              if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
 
                 sanitized_link = sanitize_link(split_url_at_hash(link))
                 if(sanitized_link)
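The duplicate check now keys on the absolutized link (make_absolute(link, response.effective_url)) rather than the raw href. That matters because the same relative href names different pages depending on where it appears, as a quick stdlib check shows:

    require 'uri'

    # De-duplicating on the raw href 'about.html' would conflate these two pages:
    URI.parse('http://www.example.com/a/').merge(URI.parse('about.html')).to_s
    # => "http://www.example.com/a/about.html"
    URI.parse('http://www.example.com/b/').merge(URI.parse('about.html')).to_s
    # => "http://www.example.com/b/about.html"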
@@ -77,6 +70,7 @@ class Arachnid
 
         @global_visited.insert(q)
         @global_queue.delete(q)
+
       end
 
       @hydra.run
@@ -86,7 +80,7 @@ class Arachnid
   end
 
   def parse_domain(url)
-    puts "Parsing URL: #{url}" if @debug == true
+    puts "Parsing URL: #{url}" if @debug
 
     begin
       parsed_domain = Domainatrix.parse(url)
@@ -97,19 +91,11 @@ class Arachnid
         parsed_domain.domain + '.' + parsed_domain.public_suffix
       end
     rescue NoMethodError, Addressable::URI::InvalidURIError => e
-      puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
+      puts "URL Parsing Exception (#{url}): #{e}"
       return nil
     end
   end
 
-  def grab_proxy
-
-    return nil unless @proxy_list
-
-    return @proxy_list.sample.split(':')
-
-  end
-
   def internal_link?(url, effective_url)
 
     absolute_url = make_absolute(url, effective_url)
@@ -139,16 +125,16 @@ class Arachnid
     end
   end
 
-  def no_image_in_url?(url)
+  def ignore_extensions(url)
     return true if url.to_s.length == 0
-    return true unless @exclude_urls_with_images
+    return true unless @exclude_urls_with_extensions
 
-    extensions = ['.jpg', '.gif', '.png', '.jpeg']
     not_found = true
 
-    extensions.each do |e|
-      if(url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+    @exclude_urls_with_extensions.each do |e|
+      if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
         not_found = false
+        puts "#{e} Found At URL: #{url}" if @debug
       end
     end
 
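no_image_in_url? and its hard-coded image list become ignore_extensions, driven by whatever the caller passed as :exclude_urls_with_extensions. The added length guard also fixes a crash: when the URL is shorter than the extension, url.to_s[-e.size .. -1] is nil in Ruby, and 0.3.2 called .downcase on it, raising NoMethodError. The suffix test in isolation (the helper name here is hypothetical):

    # Hypothetical standalone version of the suffix test used above.
    def ends_with_extension?(url, extensions)
      extensions.any? do |e|
        url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase
      end
    end

    ends_with_extension?('http://www.example.com/logo.PNG', ['.jpg', '.png']) # => true
    ends_with_extension?('a.io', ['.json']) # => false; without the length guard, nil.downcase would raise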
@@ -164,8 +150,9 @@ class Arachnid
   end
 
   def make_absolute( href, root )
+
     begin
-      URI.parse(root).merge(URI.parse(href)).to_s
+      URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
     rescue URI::InvalidURIError, URI::InvalidComponentError => e
       return false
     end
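make_absolute previously fed the raw href straight into URI.parse, so any link containing whitespace raised URI::InvalidURIError and the rescue returned false, dropping the link. 0.4.0 percent-encodes whitespace (and runs the href through split_url_at_hash) before merging. A before/after sketch using only the stdlib:

    require 'uri'

    href = 'my page.html'

    begin
      URI.parse(href) # the 0.3.2 path: whitespace makes this raise
    rescue URI::InvalidURIError
      # 0.3.2 rescued this and returned false, losing the link
    end

    escaped = href.gsub(/\s+/, '%20') # the substitution 0.4.0 applies first
    URI.parse('http://www.example.com/docs/').merge(URI.parse(escaped)).to_s
    # => "http://www.example.com/docs/my%20page.html"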
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: arachnid
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.4.0
 prerelease:
 platform: ruby
 authors:
@@ -64,17 +64,17 @@ dependencies:
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - '='
+    - - ! '>='
       - !ruby/object:Gem::Version
-        version: 0.0.10
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - '='
+    - - ! '>='
      - !ruby/object:Gem::Version
-        version: 0.0.10
+        version: '0'
 description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
   store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
   every page on a domain.