arachnid 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/arachnid.rb +13 -26
  2. metadata +5 -5
data/lib/arachnid.rb CHANGED
@@ -14,8 +14,7 @@ class Arachnid
14
14
 
15
15
  @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
16
16
  @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
17
- @exclude_urls_with_images = options[:exclude_urls_with_images] ? options[:exclude_urls_with_images] : false
18
- @proxy_list = options[:proxy_list] ? options[:proxy_list] : nil
17
+ @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
19
18
 
20
19
  @debug = options[:debug] ? options[:debug] : false
21
20
  end
@@ -27,8 +26,6 @@ class Arachnid
27
26
  #defaults to -1 so it will always keep running until it runs out of urls
28
27
  max_urls = options[:max_urls] ? options[:max_urls] : nil
29
28
 
30
-
31
-
32
29
  @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
33
30
  @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
34
31
  @global_queue = []
@@ -41,11 +38,7 @@ class Arachnid
41
38
  temp_queue.each do |q|
42
39
 
43
40
  begin
44
- ip,port,user,pass = grab_proxy
45
-
46
- request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
47
- request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
48
- request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
41
+ request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
49
42
 
50
43
  request.on_complete do |response|
51
44
 
@@ -54,7 +47,7 @@ class Arachnid
54
47
  links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
55
48
 
56
49
  links.each do |link|
57
- if(internal_link?(link, response.effective_url) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
50
+ if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
58
51
 
59
52
  sanitized_link = sanitize_link(split_url_at_hash(link))
60
53
  if(sanitized_link)
@@ -77,6 +70,7 @@ class Arachnid
77
70
 
78
71
  @global_visited.insert(q)
79
72
  @global_queue.delete(q)
73
+
80
74
  end
81
75
 
82
76
  @hydra.run
@@ -86,7 +80,7 @@ class Arachnid
86
80
  end
87
81
 
88
82
  def parse_domain(url)
89
- puts "Parsing URL: #{url}" if @debug == true
83
+ puts "Parsing URL: #{url}" if @debug
90
84
 
91
85
  begin
92
86
  parsed_domain = Domainatrix.parse(url)
@@ -97,19 +91,11 @@ class Arachnid
97
91
  parsed_domain.domain + '.' + parsed_domain.public_suffix
98
92
  end
99
93
  rescue NoMethodError, Addressable::URI::InvalidURIError => e
100
- puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
94
+ puts "URL Parsing Exception (#{url}): #{e}"
101
95
  return nil
102
96
  end
103
97
  end
104
98
 
105
- def grab_proxy
106
-
107
- return nil unless @proxy_list
108
-
109
- return @proxy_list.sample.split(':')
110
-
111
- end
112
-
113
99
  def internal_link?(url, effective_url)
114
100
 
115
101
  absolute_url = make_absolute(url, effective_url)
@@ -139,16 +125,16 @@ class Arachnid
139
125
  end
140
126
  end
141
127
 
142
- def no_image_in_url?(url)
128
+ def ignore_extensions(url)
143
129
  return true if url.to_s.length == 0
144
- return true unless @exclude_urls_with_images
130
+ return true unless @exclude_urls_with_extensions
145
131
 
146
- extensions = ['.jpg', '.gif', '.png', '.jpeg']
147
132
  not_found = true
148
133
 
149
- extensions.each do |e|
150
- if(url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
134
+ @exclude_urls_with_extensions.each do |e|
135
+ if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
151
136
  not_found = false
137
+ puts "#{e} Found At URL: #{url}" if @debug
152
138
  end
153
139
  end
154
140
 
@@ -164,8 +150,9 @@ class Arachnid
164
150
  end
165
151
 
166
152
  def make_absolute( href, root )
153
+
167
154
  begin
168
- URI.parse(root).merge(URI.parse(href)).to_s
155
+ URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
169
156
  rescue URI::InvalidURIError, URI::InvalidComponentError => e
170
157
  return false
171
158
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -64,17 +64,17 @@ dependencies:
64
64
  requirement: !ruby/object:Gem::Requirement
65
65
  none: false
66
66
  requirements:
67
- - - '='
67
+ - - ! '>='
68
68
  - !ruby/object:Gem::Version
69
- version: 0.0.10
69
+ version: '0'
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  none: false
74
74
  requirements:
75
- - - '='
75
+ - - ! '>='
76
76
  - !ruby/object:Gem::Version
77
- version: 0.0.10
77
+ version: '0'
78
78
  description: Arachnid is a web crawler that relies on Bloom Filters to efficiently
79
79
  store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
80
80
  every page on a domain.