arachnid 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/arachnid.rb +127 -0
- metadata +99 -0
data/lib/arachnid.rb
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'typhoeus'
|
4
|
+
require 'bloomfilter-rb'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'domainatrix'
|
7
|
+
|
8
|
+
# Single-domain web spider. Pages are fetched with Typhoeus, anchor hrefs are
# extracted with Nokogiri, and links whose parsed domain matches the start
# URL's are followed. A Bloom filter records which URLs were already requested.
class Arachnid
  # URL suffixes treated as image links by #no_image_in_url?.
  IMAGE_EXTENSIONS = ['.jpg', '.gif', '.png', '.jpeg'].freeze

  # Builds a spider rooted at +url+.
  #
  # url     - String URL the crawl starts from; its parsed domain decides
  #           which links count as internal.
  # options - Hash of optional flags, each defaulting to false:
  #           :split_url_at_hash        - drop the "#fragment" part of links
  #           :exclude_urls_with_hash   - skip links containing "#"
  #           :exclude_urls_with_images - skip links ending in an image extension
  #           :debug                    - print URL parsing diagnostics
  def initialize(url, options = {})
    # Assigned first because parse_domain consults @debug; assigning it last
    # silently suppressed the diagnostics for the start URL.
    @debug = options[:debug] || false

    @start_url = url
    @domain = parse_domain(url)

    @split_url_at_hash = options[:split_url_at_hash] || false
    @exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
    @exclude_urls_with_images = options[:exclude_urls_with_images] || false
  end

  # Crawls outward from the start URL until no unvisited internal link remains.
  #
  # options - Hash; :threads sets Hydra's :max_concurrency (default 1).
  #
  # Yields every response to the caller's block when one is given. The block
  # is optional: crawling without one no longer raises LocalJumpError.
  def crawl(options = {})
    threads = options[:threads] || 1

    @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
    @global_queue = [@start_url]

    until @global_queue.empty?
      # Work on a de-duplicated snapshot and empty the shared queue in place.
      # Deleting from the array that is being iterated skipped entries, and
      # the completion callbacks append to @global_queue during @hydra.run.
      batch = @global_queue.uniq
      @global_queue.clear

      batch.each do |q|
        # Marked up front so the URL is never retried, whether or not a
        # request can be built for it.
        @global_visited.insert(q)

        begin
          request = Typhoeus::Request.new(q, :timeout => 10000)

          request.on_complete do |response|
            yield response if block_given?
            enqueue_links(response)
          end

          @hydra.queue request
        rescue URI::InvalidURIError
          # Unrequestable URL: skip it; it is already marked visited above.
          next
        end
      end

      @hydra.run
    end
  end

  # Returns "subdomain.domain.public_suffix" for +url+, or nil when the URL
  # cannot be parsed (a NoMethodError or Addressable::URI::InvalidURIError
  # raised while parsing or joining the parts).
  def parse_domain(url)
    puts "Parsing URL: #{url}" if @debug

    parsed_domain = Domainatrix.parse(url)
    # Kept as concatenation on purpose: a nil part must raise (and yield nil
    # below) rather than be interpolated as an empty string.
    parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
  rescue NoMethodError, Addressable::URI::InvalidURIError => e
    puts "URL Parsing Exception (#{url}): #{e}" if @debug
    nil
  end

  # True when +url+ parses to the same domain string as the start URL.
  def internal_link?(url)
    @domain == parse_domain(url)
  end

  # Returns +url+ as a String, cut before the first "#" when the
  # :split_url_at_hash option is enabled. Always returns a String: an empty
  # or fragment-only URL gives "" (previously nil).
  def split_url_at_hash(url)
    return url.to_s unless @split_url_at_hash

    url.to_s.partition('#').first
  end

  # False only when :exclude_urls_with_hash is enabled and +url+ contains "#".
  def no_hash_in_url?(url)
    return true unless @exclude_urls_with_hash

    !url.to_s.include?('#')
  end

  # False only when :exclude_urls_with_images is enabled and +url+ ends in
  # one of IMAGE_EXTENSIONS. The comparison ignores case, so ".JPG" is
  # treated like ".jpg".
  def no_image_in_url?(url)
    return true unless @exclude_urls_with_images

    candidate = url.to_s.downcase
    IMAGE_EXTENSIONS.none? { |ext| candidate.end_with?(ext) }
  end

  # Replaces each run of whitespace in +url+ with "%20".
  def sanitize_link(url)
    url.gsub(/\s+/, "%20")
  end

  private

  # Queues every not-yet-visited internal link found in +response+'s body.
  def enqueue_links(response)
    Nokogiri::HTML.parse(response.body).xpath('.//a/@href').each do |link|
      href = link.to_s
      next unless internal_link?(href) && no_hash_in_url?(href) && no_image_in_url?(href)

      # The visited check must use the exact string that gets queued and later
      # inserted into the filter; checking the unsanitized form meant links
      # containing whitespace were never recognised as visited.
      target = sanitize_link(split_url_at_hash(href))
      @global_queue << target unless @global_visited.include?(target)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: arachnid
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- dchuk
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-11-11 00:00:00 -08:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: typhoeus
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - "="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.3.2
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bloomfilter-rb
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - "="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 2.1.1
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: nokogiri
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - "="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.5.0
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: domainatrix
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - "="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 0.0.10
|
58
|
+
type: :runtime
|
59
|
+
version_requirements: *id004
|
60
|
+
description: Arachnid relies on Bloom Filters to efficiently store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling.
|
61
|
+
email: me@dchuk.com
|
62
|
+
executables: []
|
63
|
+
|
64
|
+
extensions: []
|
65
|
+
|
66
|
+
extra_rdoc_files: []
|
67
|
+
|
68
|
+
files:
|
69
|
+
- lib/arachnid.rb
|
70
|
+
has_rdoc: true
|
71
|
+
homepage: https://github.com/dchuk/Arachnid
|
72
|
+
licenses: []
|
73
|
+
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
|
77
|
+
require_paths:
|
78
|
+
- lib
|
79
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: "0"
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: "0"
|
91
|
+
requirements: []
|
92
|
+
|
93
|
+
rubyforge_project:
|
94
|
+
rubygems_version: 1.6.2
|
95
|
+
signing_key:
|
96
|
+
specification_version: 3
|
97
|
+
summary: Extremely fast and efficient domain spider
|
98
|
+
test_files: []
|
99
|
+
|