arachnid 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/arachnid.rb +127 -0
  2. metadata +99 -0
data/lib/arachnid.rb ADDED
@@ -0,0 +1,127 @@
1
+ # encoding: utf-8
2
+
3
+ require 'typhoeus'
4
+ require 'bloomfilter-rb'
5
+ require 'nokogiri'
6
+ require 'domainatrix'
7
+
8
# Single-domain web spider.
#
# Starting from one URL, Arachnid fetches pages through a Typhoeus::Hydra,
# yields every response to the caller's block, and queues the links found on
# each page whose parsed domain equals the start URL's. URLs that have already
# been requested are remembered in a BloomFilter::Native.
class Arachnid

  # Suffixes (lower-case) that #no_image_in_url? treats as image links.
  IMAGE_EXTENSIONS = ['.jpg', '.gif', '.png', '.jpeg'].freeze

  # @param url [String] URL the crawl starts from; its parsed domain decides
  #   which links count as internal.
  # @param options [Hash] optional flags, each defaulting to false:
  #   :split_url_at_hash        - drop the "#fragment" part of links before queueing
  #   :exclude_urls_with_hash   - ignore links containing "#"
  #   :exclude_urls_with_images - ignore links ending in an image extension
  #   :debug                    - print URL parsing diagnostics to stdout
  def initialize(url, options = {})
    # Set first so that parsing the start URL below is covered by debug output.
    @debug = options[:debug] || false

    @start_url = url
    @domain = parse_domain(url)

    @split_url_at_hash = options[:split_url_at_hash] || false
    @exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
    @exclude_urls_with_images = options[:exclude_urls_with_images] || false
  end

  # Crawls the start URL's domain batch by batch until no unvisited internal
  # links remain.
  #
  # @param options [Hash] :threads is passed to Typhoeus::Hydra as
  #   :max_concurrency (default 1).
  # @yield [response] every completed Typhoeus response; the block is optional.
  def crawl(options = {})
    threads = options[:threads] || 1

    @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
    @global_queue = [@start_url]

    until @global_queue.empty?
      # Take the pending URLs as a de-duplicated batch and start a fresh queue.
      # Iterating the live queue while deleting from it skipped entries; the
      # completion callbacks below append to the new array instead.
      batch = @global_queue.uniq
      @global_queue = []

      batch.each do |q|
        # Mark as visited up front so the URL is never queued again, even when
        # building the request fails below.
        @global_visited.insert(q)

        begin
          # NOTE(review): the timeout unit depends on the pinned Typhoeus
          # version (presumably milliseconds) -- confirm before changing.
          request = Typhoeus::Request.new(q, :timeout => 10000)

          request.on_complete do |response|
            yield response if block_given?
            @global_queue.concat(crawlable_links(response.body))
          end

          @hydra.queue request
        rescue URI::InvalidURIError
          # Unusable URL: it is already marked visited, so just move on.
        end
      end

      @hydra.run
    end
  end

  # Reduces a URL to "subdomain.domain.public_suffix" using Domainatrix.
  #
  # @param url [#to_s] URL to parse
  # @return [String, nil] the joined host parts, or nil when parsing fails
  def parse_domain(url)
    puts "Parsing URL: #{url}" if @debug

    begin
      parsed_domain = Domainatrix.parse(url)
      # Plain concatenation on purpose: a nil component raises NoMethodError,
      # which is rescued below and reported as "no domain".
      parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
    rescue NoMethodError, Addressable::URI::InvalidURIError => e
      puts "URL Parsing Exception (#{url}): #{e}" if @debug
      nil
    end
  end

  # @param url [#to_s] link to test
  # @return [Boolean] true when the link's parsed domain equals the start
  #   URL's. A link whose domain cannot be parsed is never internal, even if
  #   the start URL itself could not be parsed (both would otherwise be nil).
  def internal_link?(url)
    link_domain = parse_domain(url)
    !link_domain.nil? && link_domain == @domain
  end

  # @param url [#to_s] link to trim
  # @return [String] the link as a string; when :split_url_at_hash is enabled
  #   everything from the first "#" on is removed. Always a String -- "#" and
  #   "" give "" rather than nil.
  def split_url_at_hash(url)
    text = url.to_s
    return text unless @split_url_at_hash

    text.split('#', 2).first.to_s
  end

  # @param url [#to_s] link to test
  # @return [Boolean] false only when :exclude_urls_with_hash is enabled and
  #   the link contains "#".
  def no_hash_in_url?(url)
    return true unless @exclude_urls_with_hash

    !url.to_s.include?('#')
  end

  # @param url [#to_s] link to test
  # @return [Boolean] false only when :exclude_urls_with_images is enabled and
  #   the link ends in one of IMAGE_EXTENSIONS, compared case-insensitively.
  def no_image_in_url?(url)
    return true unless @exclude_urls_with_images

    !url.to_s.downcase.end_with?(*IMAGE_EXTENSIONS)
  end

  # Makes a link safe to request: surrounding whitespace is removed and every
  # remaining whitespace character becomes "%20" (one per character, so the
  # path is not silently shortened).
  #
  # @param url [#to_s] link to clean
  # @return [String]
  def sanitize_link(url)
    url.to_s.strip.gsub(/\s/, "%20")
  end

  private

  # Extracts the hrefs of all <a> elements in +html+ that should be crawled:
  # internal, allowed by the hash/image options, and not yet visited. The
  # visited check uses the same sanitized form that is later inserted into the
  # filter, so a URL containing whitespace cannot be queued over and over.
  def crawlable_links(html)
    targets = []

    Nokogiri::HTML.parse(html.to_s).xpath('.//a/@href').each do |attribute|
      href = attribute.to_s
      next unless internal_link?(href) && no_hash_in_url?(href) && no_image_in_url?(href)

      target = sanitize_link(split_url_at_hash(href))
      next if target.empty? || @global_visited.include?(target)

      targets << target
    end

    targets
  end

end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arachnid
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - dchuk
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-11-11 00:00:00 -08:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: typhoeus
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - "="
23
+ - !ruby/object:Gem::Version
24
+ version: 0.3.2
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: bloomfilter-rb
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - "="
34
+ - !ruby/object:Gem::Version
35
+ version: 2.1.1
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: nokogiri
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - "="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.5.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: domainatrix
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - "="
56
+ - !ruby/object:Gem::Version
57
+ version: 0.0.10
58
+ type: :runtime
59
+ version_requirements: *id004
60
+ description: Arachnid relies on Bloom Filters to efficiently store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling.
61
+ email: me@dchuk.com
62
+ executables: []
63
+
64
+ extensions: []
65
+
66
+ extra_rdoc_files: []
67
+
68
+ files:
69
+ - lib/arachnid.rb
70
+ has_rdoc: true
71
+ homepage: https://github.com/dchuk/Arachnid
72
+ licenses: []
73
+
74
+ post_install_message:
75
+ rdoc_options: []
76
+
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: "0"
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: "0"
91
+ requirements: []
92
+
93
+ rubyforge_project:
94
+ rubygems_version: 1.6.2
95
+ signing_key:
96
+ specification_version: 3
97
+ summary: Extremely fast and efficient domain spider
98
+ test_files: []
99
+