arachnidish 0.0.1
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- checksums.yaml +7 -0
- data/lib/arachnidish.rb +174 -0
- metadata +102 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: cbba3f96eda4cf0065a0b7d65c373bbac54b55bf
+  data.tar.gz: ebc7dd1e5ae6432c83534f94f77057b0cb2716f3
+SHA512:
+  metadata.gz: 3cf72eae18e2c7c9c43a8405f9ad0dc88f131df092d35fb550348cea3e3a84bae4f37bc666e7683c0a459ab19b09cbda380c9fe859b70edfa5870500fa838099
+  data.tar.gz: b7e0f862dd41359c9e994c8b6c4f993f0b95e0a0feeb3afe62d3aa9e08697c140d63948b21be57cbcd615aab5895a4a94e5fbaa0657b900f6ebcf674941e6675
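These digests cover the two archives inside the packaged gem (a .gem file is a plain tar archive containing metadata.gz, data.tar.gz, and the checksums file). As a minimal illustration, not part of the package, the following Ruby sketch recomputes and compares the SHA512 digests; it assumes the gem has already been unpacked into the current directory (for example with `tar -xf arachnidish-0.0.1.gem`, an assumed step):

require 'digest'
require 'yaml'

# Expected digests as published in checksums.yaml (extracted from the .gem archive).
expected = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |file|
  # Digest::SHA512.file streams the file, so large archives are fine.
  actual = Digest::SHA512.file(file).hexdigest
  status = actual == expected['SHA512'][file] ? 'OK' : 'MISMATCH'
  puts "#{file}: #{status}"
end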
data/lib/arachnidish.rb
ADDED
@@ -0,0 +1,174 @@
+# encoding: utf-8
+
+require 'typhoeus'
+require 'bloomfilter-rb'
+require 'nokogiri'
+require 'domainatrix'
+require 'uri'
+
+class Arachnidish
+
+  def initialize(url, options = {})
+    @start_url = url
+    @domain = parse_domain(url)
+
+    @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
+    @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
+    @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
+    @proxy_list = options[:proxy_list] ? options[:proxy_list] : false
+
+    @debug = options[:debug] ? options[:debug] : false
+  end
+
+  def crawl(options = {})
+
+    # defaults to 1 thread so people don't do a stupid amount of crawling on unsuspecting domains
+    threads = options[:threads] ? options[:threads] : 1
+    # defaults to nil so it keeps running until it runs out of urls
+    max_urls = options[:max_urls] ? options[:max_urls] : nil
+
+    @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+    @global_queue = []
+
+    @global_queue << @start_url
+
+    while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
+      temp_queue = @global_queue
+
+      temp_queue.each do |q|
+
+        begin
+          ip, port, user, pass = grab_proxy
+
+          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
+          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
+          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
+
+          request.on_complete do |response|
+
+            yield response
+
+            links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+
+            links.each do |link|
+              if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+
+                sanitized_link = sanitize_link(split_url_at_hash(link))
+                if(sanitized_link)
+
+                  absolute_link = make_absolute(sanitized_link, response.effective_url)
+                  if(absolute_link)
+                    @global_queue << absolute_link
+                  end
+                end
+              end
+            end
+
+          end
+
+          @hydra.queue request
+
+        rescue URI::InvalidURIError, NoMethodError => e
+          puts "Exception caught: #{e}" if @debug == true
+        end
+
+        @global_visited.insert(q)
+        @global_queue.delete(q)
+
+      end
+
+      @hydra.run
+
+    end
+
+  end
+
+  def grab_proxy
+
+    return nil unless @proxy_list
+
+    return @proxy_list.sample.split(':')
+
+  end
+
+  def parse_domain(url)
+    puts "Parsing URL: #{url}" if @debug
+
+    begin
+      parsed_domain = Domainatrix.parse(url)
+
+      if(parsed_domain.subdomain != "")
+        parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+      else
+        parsed_domain.domain + '.' + parsed_domain.public_suffix
+      end
+    rescue NoMethodError, Addressable::URI::InvalidURIError => e
+      puts "URL Parsing Exception (#{url}): #{e}"
+      return nil
+    end
+  end
+
+  def internal_link?(url, effective_url)
+
+    absolute_url = make_absolute(url, effective_url)
+
+    parsed_url = parse_domain(absolute_url)
+    if(@domain == parsed_url)
+      return true
+    else
+      return false
+    end
+  end
+
+  def split_url_at_hash(url)
+    return url.to_s unless @split_url_at_hash
+
+    return url.to_s.split('#')[0]
+
+  end
+
+  def no_hash_in_url?(url)
+    return true unless @exclude_urls_with_hash
+
+    if(url.to_s.scan(/#/).size > 0)
+      return false
+    else
+      return true
+    end
+  end
+
+  def ignore_extensions(url)
+    return true if url.to_s.length == 0
+    return true unless @exclude_urls_with_extensions
+
+    not_found = true
+
+    @exclude_urls_with_extensions.each do |e|
+      if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+        not_found = false
+        puts "#{e} Found At URL: #{url}" if @debug
+      end
+    end
+
+    return not_found
+  end
+
+  def sanitize_link(url)
+    begin
+      return url.gsub(/\s+/, "%20")
+    rescue
+      return false
+    end
+  end
+
+  def make_absolute(href, root)
+
+    begin
+      URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
+    rescue URI::InvalidURIError, URI::InvalidComponentError => e
+      return false
+    end
+  end
+
+end
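The class above is driven entirely through initialize and crawl: the constructor takes the start url plus the option flags set in #initialize, and crawl yields each fetched page to the caller's block as a Typhoeus response. A minimal usage sketch, not part of the package, assuming the gem and its dependencies are installed; the example URL and extension list are illustrative:

require 'arachnidish'

# Options mirror the flags read in #initialize above.
spider = Arachnidish.new('http://www.example.com',
                         :split_url_at_hash => true,
                         :exclude_urls_with_extensions => ['.pdf', '.jpg'],
                         :debug => false)

# threads and max_urls map to the defaults documented in #crawl
# (1 thread, unlimited urls); each response is a Typhoeus response object.
spider.crawl(:threads => 2, :max_urls => 100) do |response|
  puts "#{response.code} #{response.effective_url}"
end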
metadata
ADDED
@@ -0,0 +1,102 @@
+--- !ruby/object:Gem::Specification
+name: arachnidish
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- csphere
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-01-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: typhoeus
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.3.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.3.2
+- !ruby/object:Gem::Dependency
+  name: bloomfilter-rb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.1.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.1.1
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: domainatrix
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Arachnidish is a web crawler that relies on Bloom Filters to efficiently
+  store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
+  every page on a domain.
+email: cspheregreen@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/arachnidish.rb
+homepage: https://github.com/csphere/Arachnid
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.1
+signing_key:
+specification_version: 4
+summary: Extremely fast and efficient domain spider, that isn't quite so strict
+test_files: []
+has_rdoc:
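The spec pins typhoeus and bloomfilter-rb to exact versions while leaving nokogiri and domainatrix loosely constrained. A minimal Gemfile sketch for consuming this release, not part of the package; the source line is the standard rubygems.org default and an assumption here:

# Gemfile
source 'https://rubygems.org'

# Installing the gem pulls in the runtime dependencies declared above:
# typhoeus = 0.3.2, bloomfilter-rb = 2.1.1, nokogiri ~> 1.6, domainatrix >= 0.
gem 'arachnidish', '0.0.1'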