arachnid 0.1.0

Files changed (2)
  1. data/lib/arachnid.rb +127 -0
  2. metadata +99 -0
data/lib/arachnid.rb ADDED
@@ -0,0 +1,127 @@
+ # encoding: utf-8
+
+ require 'typhoeus'
+ require 'bloomfilter-rb'
+ require 'nokogiri'
+ require 'domainatrix'
+
+ class Arachnid
+
+   def initialize(url, options = {})
+     @start_url = url
+     @domain = parse_domain(url)
+
+     @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
+     @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
+     @exclude_urls_with_images = options[:exclude_urls_with_images] ? options[:exclude_urls_with_images] : false
+
+     @debug = options[:debug] ? options[:debug] : false
+   end
+
+   def crawl(options = {})
+
+     threads = options[:threads] ? options[:threads] : 1
+
+     @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+     @global_queue = []
+
+     @global_queue << @start_url
+
+     while(@global_queue.size > 0)
+       temp_queue = @global_queue
+
+       temp_queue.each do |q|
+
+         begin
+           request = Typhoeus::Request.new(q, :timeout => 10000)
+
+           request.on_complete do |response|
+
+             yield response
+
+             links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+
+             links.each do |link|
+               if(internal_link?(link) && !@global_visited.include?(split_url_at_hash(link)) && no_hash_in_url?(link) && no_image_in_url?(link))
+                 @global_queue << sanitize_link(split_url_at_hash(link))
+               end
+             end
+
+           end
+
+           @hydra.queue request
+
+           @global_visited.insert(q)
+           @global_queue.delete(q)
+
+         rescue URI::InvalidURIError => e
+           @global_visited.insert(q)
+           @global_queue.delete(q)
+         end
+       end
+
+       @hydra.run
+
+     end
+
+   end
+
+   def parse_domain(url)
+     puts "Parsing URL: #{url}" if @debug == true
+
+     begin
+       parsed_domain = Domainatrix.parse(url)
+       parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+     rescue NoMethodError, Addressable::URI::InvalidURIError => e
+       puts "URL Parsing Exception (#{url}): #{e}" if @debug == true
+       return nil
+     end
+   end
+
+   def internal_link?(url)
+     parsed_url = parse_domain(url)
+     if(@domain == parsed_url)
+       return true
+     else
+       return false
+     end
+   end
+
+   def split_url_at_hash(url)
+     return url.to_s unless @split_url_at_hash
+
+     return url.to_s.split('#')[0]
+
+   end
+
+   def no_hash_in_url?(url)
+     return true unless @exclude_urls_with_hash
+
+     if(url.to_s.scan(/#/).size > 0)
+       return false
+     else
+       return true
+     end
+   end
+
+   def no_image_in_url?(url)
+     return true unless @exclude_urls_with_images
+
+     extensions = ['.jpg', '.gif', '.png', '.jpeg']
+     not_found = true
+
+     extensions.each do |e|
+       if(url.to_s[-e.size .. -1] == e.to_s)
+         not_found = false
+       end
+     end
+
+     return not_found
+   end
+
+   def sanitize_link(url)
+     return url.gsub(/\s+/, "%20")
+   end
+
+ end
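
For orientation, a minimal usage sketch of the class added above (not part of this release). The start URL is a placeholder; crawl fetches pages with the requested number of concurrent Typhoeus requests and yields each raw response to the block, so the block below simply reports the size of every page fetched.

require 'arachnid'

spider = Arachnid.new('http://www.example.com', :exclude_urls_with_hash => true, :debug => true)

spider.crawl(:threads => 5) do |response|
  # Arachnid itself only reads response.body (to extract links with Nokogiri),
  # so that is the field this sketch relies on as well.
  puts "fetched a page of #{response.body.size} bytes"
end
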
metadata ADDED
@@ -0,0 +1,99 @@
+ --- !ruby/object:Gem::Specification
+ name: arachnid
+ version: !ruby/object:Gem::Version
+   prerelease:
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - dchuk
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-11-11 00:00:00 -08:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: typhoeus
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - "="
+       - !ruby/object:Gem::Version
+         version: 0.3.2
+   type: :runtime
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: bloomfilter-rb
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - "="
+       - !ruby/object:Gem::Version
+         version: 2.1.1
+   type: :runtime
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - "="
+       - !ruby/object:Gem::Version
+         version: 1.5.0
+   type: :runtime
+   version_requirements: *id003
+ - !ruby/object:Gem::Dependency
+   name: domainatrix
+   prerelease: false
+   requirement: &id004 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - "="
+       - !ruby/object:Gem::Version
+         version: 0.0.10
+   type: :runtime
+   version_requirements: *id004
+ description: Arachnid relies on Bloom Filters to efficiently store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling.
+ email: me@dchuk.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - lib/arachnid.rb
+ has_rdoc: true
+ homepage: https://github.com/dchuk/Arachnid
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.6.2
+ signing_key:
+ specification_version: 3
+ summary: Extremely fast and efficient domain spider
+ test_files: []
+
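
The gem description above sums up the core design choice: visited URLs go into a Bloom filter rather than an ever-growing array or hash, so membership checks stay fast and memory stays bounded, at the cost of occasional false positives (a URL wrongly reported as already seen is simply skipped). A standalone sketch of that pattern, using the same bloomfilter-rb constructor arguments that crawl passes above; the URLs are placeholders.

require 'bloomfilter-rb'

# Same parameters Arachnid#crawl uses for its visited-URL set.
visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)

visited.insert('http://www.example.com/page')
visited.include?('http://www.example.com/page')    # => true
visited.include?('http://www.example.com/missing') # => false (barring a rare false positive)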