arachnidish 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/arachnidish.rb +174 -0
  3. metadata +102 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: cbba3f96eda4cf0065a0b7d65c373bbac54b55bf
+   data.tar.gz: ebc7dd1e5ae6432c83534f94f77057b0cb2716f3
+ SHA512:
+   metadata.gz: 3cf72eae18e2c7c9c43a8405f9ad0dc88f131df092d35fb550348cea3e3a84bae4f37bc666e7683c0a459ab19b09cbda380c9fe859b70edfa5870500fa838099
+   data.tar.gz: b7e0f862dd41359c9e994c8b6c4f993f0b95e0a0feeb3afe62d3aa9e08697c140d63948b21be57cbcd615aab5895a4a94e5fbaa0657b900f6ebcf674941e6675
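
The SHA512 lines above can be spot-checked against a downloaded copy of the release. A minimal sketch, assuming a local arachnidish-0.0.1.gem file (a .gem is a tar archive whose metadata.gz and data.tar.gz entries are exactly the files checksummed here):

require 'digest'
require 'rubygems/package'

# Print the SHA512 of each checksummed entry inside the .gem archive
# for comparison against checksums.yaml.
File.open('arachnidish-0.0.1.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
    end
  end
end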
data/lib/arachnidish.rb ADDED
@@ -0,0 +1,174 @@
+ # encoding: utf-8
+
+ require 'typhoeus'
+ require 'bloomfilter-rb'
+ require 'nokogiri'
+ require 'domainatrix'
+ require 'uri'
+
+ class Arachnidish
+
+   def initialize(url, options = {})
+     @start_url = url
+     @domain = parse_domain(url)
+
+     # All options default to false (disabled) unless supplied.
+     @split_url_at_hash = options.fetch(:split_url_at_hash, false)
+     @exclude_urls_with_hash = options.fetch(:exclude_urls_with_hash, false)
+     @exclude_urls_with_extensions = options.fetch(:exclude_urls_with_extensions, false)
+     @proxy_list = options.fetch(:proxy_list, false)
+
+     @debug = options.fetch(:debug, false)
+   end
+
+   def crawl(options = {})
+
+     # Defaults to 1 thread so callers don't hammer unsuspecting domains.
+     threads = options.fetch(:threads, 1)
+     # Defaults to nil so the crawl keeps going until the queue is empty.
+     max_urls = options.fetch(:max_urls, nil)
+
+     @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+     # A Bloom filter keeps the visited set memory-bound; :raise => false
+     # makes lookups return quietly instead of raising.
+     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+     @global_queue = []
+
+     @global_queue << @start_url
+
+     while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
+       # Iterate over a snapshot so @global_queue can be mutated safely
+       # inside the loop.
+       temp_queue = @global_queue.dup
+
+       temp_queue.each do |q|
+
+         begin
+           ip, port, user, pass = grab_proxy
+
+           request =
+             if ip.nil?
+               Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
+             elsif user.nil?
+               Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}")
+             else
+               Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass)
+             end
+
+           request.on_complete do |response|
+
+             yield response
+
+             links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+
+             links.each do |link|
+               if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+
+                 sanitized_link = sanitize_link(split_url_at_hash(link))
+                 if(sanitized_link)
+
+                   absolute_link = make_absolute(sanitized_link, response.effective_url)
+                   if(absolute_link)
+                     @global_queue << absolute_link
+                   end
+                 end
+               end
+             end
+
+           end
+
+           @hydra.queue request
+
+         rescue URI::InvalidURIError, NoMethodError => e
+           puts "Exception caught: #{e}" if @debug
+         end
+
+         @global_visited.insert(q)
+         @global_queue.delete(q)
+
+       end
+
+       @hydra.run
+
+     end
+
+   end
+
+   # Picks a random proxy from the list; entries are expected in
+   # "ip:port" or "ip:port:user:pass" form.
+   def grab_proxy
+
+     return nil unless @proxy_list
+
+     return @proxy_list.sample.split(':')
+
+   end
+
+   def parse_domain(url)
+     puts "Parsing URL: #{url}" if @debug
+
+     begin
+       parsed_domain = Domainatrix.parse(url)
+
+       if(parsed_domain.subdomain != "")
+         parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+       else
+         parsed_domain.domain + '.' + parsed_domain.public_suffix
+       end
+     rescue NoMethodError, Addressable::URI::InvalidURIError => e
+       puts "URL Parsing Exception (#{url}): #{e}"
+       return nil
+     end
+   end
+
+   def internal_link?(url, effective_url)
+
+     absolute_url = make_absolute(url, effective_url)
+
+     parsed_url = parse_domain(absolute_url)
+     @domain == parsed_url
+   end
+
+   def split_url_at_hash(url)
+     return url.to_s unless @split_url_at_hash
+
+     url.to_s.split('#')[0]
+   end
+
+   def no_hash_in_url?(url)
+     return true unless @exclude_urls_with_hash
+
+     !url.to_s.include?('#')
+   end
+
+   # Returns false when the URL ends in one of the configured extensions,
+   # true otherwise (i.e. true means "keep this URL").
+   def ignore_extensions(url)
+     return true if url.to_s.length == 0
+     return true unless @exclude_urls_with_extensions
+
+     not_found = true
+
+     @exclude_urls_with_extensions.each do |e|
+       if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+         not_found = false
+         puts "#{e} Found At URL: #{url}" if @debug
+       end
+     end
+
+     return not_found
+   end
+
+   # Percent-encodes whitespace; returns false if the value can't be handled.
+   def sanitize_link(url)
+     begin
+       return url.gsub(/\s+/, "%20")
+     rescue
+       return false
+     end
+   end
+
+   def make_absolute(href, root)
+
+     begin
+       URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
+     rescue URI::InvalidURIError, URI::InvalidComponentError => e
+       return false
+     end
+   end
+
+ end
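
For context, a minimal usage sketch of the class above (the URL, extension list, and option values are illustrative placeholders, not part of the gem):

require 'arachnidish'

spider = Arachnidish.new('http://example.com',
                         :exclude_urls_with_hash => true,
                         :exclude_urls_with_extensions => ['.pdf', '.jpg'],
                         :debug => true)

# crawl requires a block and yields each Typhoeus response as it completes.
spider.crawl(:threads => 4, :max_urls => 100) do |response|
  puts "#{response.code} #{response.effective_url}"
end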
metadata ADDED
@@ -0,0 +1,102 @@
+ --- !ruby/object:Gem::Specification
+ name: arachnidish
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - csphere
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-01-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: typhoeus
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 0.3.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 0.3.2
+ - !ruby/object:Gem::Dependency
+   name: bloomfilter-rb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.1.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '='
+       - !ruby/object:Gem::Version
+         version: 2.1.1
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: domainatrix
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Arachnidish is a web crawler that relies on Bloom Filters to efficiently
+   store visited urls and Typhoeus to avoid the overhead of Mechanize when crawling
+   every page on a domain.
+ email: cspheregreen@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/arachnidish.rb
+ homepage: https://github.com/csphere/Arachnid
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.1
+ signing_key:
+ specification_version: 4
+ summary: Extremely fast and efficient domain spider, that isn't quite so strict
+ test_files: []
+ has_rdoc:
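
For completeness, a Gemfile sketch that would install this release (illustrative, not shipped with the package); RubyGems resolves the runtime pins declared above: typhoeus = 0.3.2, bloomfilter-rb = 2.1.1, nokogiri ~> 1.6, domainatrix >= 0.

source 'https://rubygems.org'

gem 'arachnidish', '0.0.1'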