pantopoda 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/README.md +3 -1
- data/lib/pantopoda/version.rb +1 -1
- data/lib/pantopoda.rb +105 -103
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 95eb1fb6916786582ba766cc5b7afae0adab80d2
+  data.tar.gz: ad00abae14a4638d1cf2cb49aea278e176f15aaf
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d7894b766dfeb8171c91bd720610d81a8a7a6c50a874fa574bef9ad37ca7f184640502d6d0dfd87404cc00d3aadc3f67bdb5fcd2d8e94313973135fb24c739af
+  data.tar.gz: 3ae3b9be93335e67c387dab9287f5376284eac5d30633f57b041f205e5067917654f451b648a6eaf9c379c9e661ba7efdc2aae1accc43049be77c1ff775ae721
data/README.md
CHANGED
@@ -1,6 +1,8 @@
 # Pantopoda
 
-
+Pantopoda is a Ruby spidering library born out of sheer frustration at the lack of the good, modern web-crawling tools that Python enjoys. Pantopoda uses a Bloom filter to store the set of visited URLs, so membership checks stay efficient even across hundreds of thousands of URLs, and requests are handled by Typhoeus to allow for multi-threaded crawling.
+
+Pantopoda will crawl every page it can find on a particular domain.
 
 ## Installation
 
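Read alongside the data/lib/pantopoda.rb diff further down, the new README implies a simple workflow: construct the crawler with a start URL plus filtering options, then call crawl with a block that receives each Typhoeus response. A minimal usage sketch, assuming the Pantopoda::Pantopoda#initialize and #crawl signatures visible in the 0.0.3 source below; the domain, option values, and block body are placeholders rather than anything documented by the gem:

```ruby
require 'pantopoda'

# Start URL plus an optional extension blacklist (see initialize in the diff below).
spider = Pantopoda::Pantopoda.new("http://example.com",
                                  :exclude_urls_with_extensions => ['.pdf', '.jpg'])

# :threads caps Hydra concurrency; :max_urls stops the crawl once the visited-URL
# Bloom filter reaches that size. The block is yielded every fetched response.
spider.crawl(:threads => 4, :max_urls => 10000) do |response|
  puts "#{response.code} #{response.effective_url}"
end
```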
data/lib/pantopoda/version.rb
CHANGED
data/lib/pantopoda.rb
CHANGED
@@ -5,137 +5,139 @@ require 'nokogiri'
 require 'domainatrix'
 require 'uri'

+module Pantopoda
+  class Pantopoda
+    def initialize(url, options = {})
+      @start_url = url
+      @domain = parse_domain(url)
+
+      @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
+      @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
+      @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
+    end

+    def crawl(options = {})
+      # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
+      threads = options[:threads] ? options[:threads] : 1
+
+      # Defaults to nil, so it keeps running until it runs out of URLs
+      max_urls = options[:max_urls] ? options[:max_urls] : nil
+
+      @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+      @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+      @global_queue = []
+
+      @global_queue << @start_url
+
+      while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
+        temp_queue = @global_queue
+        temp_queue.each do |q|
+          begin
+            ip, port, user, pass = nil
+
+            request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
+            request.on_complete do |response|
+              yield response
+              links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+              links.each do |link|
+                if (internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+                  sanitized_link = sanitize_link(split_url_at_hash(link))
+                  if (sanitized_link)
+                    absolute_link = make_absolute(sanitized_link, response.effective_url)
+                    if absolute_link
+                      @global_queue << absolute_link
+                    end
                   end
                 end
               end
             end
-            end

+            @hydra.queue request

+          rescue URI::InvalidURIError, NoMethodError => e
+            puts "Exception caught: #{e}" if @debug == true
+          end
+
+          @global_visited.insert(q)
+          @global_queue.delete(q)
         end

-        @global_queue.delete(q)
+        @hydra.run
       end
-      @hydra.run
     end
-    end

+    def parse_domain(url)
+      puts "Parsing URL: #{url}"

+      begin
+        parsed_domain = Domainatrix.parse(url)
+        if (parsed_domain.subdomain != "")
+          parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+        else
+          parsed_domain.domain + '.' + parsed_domain.public_suffix
+        end

+      rescue NoMethodError, Addressable::URI::InvalidURIError => e
+        puts "URL Parsing Exception (#{url}) : #{e}"
+        return nil
+      end
     end
-    end

+    def internal_link?(url, effective_url)
+      absolute_url = make_absolute(url, effective_url)
+      parsed_url = parse_domain(absolute_url)
+      if (@domain == parsed_url)
+        return true
+      else
+        return false
+      end
     end
-    end

+    def split_url_at_hash(url)
+      return url.to_s unless @split_url_at_hash
+      return url.to_s.split('#')[0]
+    end

+    def no_hash_in_url?(url)
+      return true unless @exclude_urls_with_hash

+      if (url.to_s.scan(/#/).size > 0)
+        return false
+      else
+        return true
+      end
     end
-    end

+    def ignore_extensions(url)
+      return true if url.to_s.length == 0
+      return true unless @exclude_urls_with_extensions

+      not_found = true

+      @exclude_urls_with_extensions.each do |e|
+        if (url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+          not_found = false
+          puts "#{e} Found At URL: #{url}"
+        end
       end
-      end

+      return not_found
+    end

+    def sanitize_link(url)
+      begin
+        return url.gsub(/\s+/, "%20")
+      rescue
+        return false
+      end
     end
-    end

+    def make_absolute(href, root)
+      begin
+        URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
+      rescue URI::InvalidURIError, URI::InvalidComponentError => e
+        return false
+      end
+    end
   end
 end
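The crawl method above combines two libraries: Typhoeus' Hydra, which runs queued HTTP requests concurrently, and bloomfilter-rb's BloomFilter::Native, which keeps the visited-URL set in constant memory. A standalone sketch of that pattern, assuming the current Typhoeus option names (:followlocation, and :timeout in seconds) rather than the older :follow_location and millisecond timeout used in the gem's code; the URLs and filter sizes are illustrative:

```ruby
require 'typhoeus'
require 'bloomfilter-rb'

# Up to four requests in flight at once, mirroring crawl's :max_concurrency setting.
hydra = Typhoeus::Hydra.new(:max_concurrency => 4)

# Same Bloom filter parameters as @global_visited in the diff above:
# fixed memory, no false negatives, a small chance of false positives.
visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1,
                                  :bucket => 8, :raise => false)

urls = ['http://example.com/', 'http://example.com/about']

urls.each do |url|
  next if visited.include?(url)   # skip anything we have already queued
  request = Typhoeus::Request.new(url, :followlocation => true, :timeout => 10)
  request.on_complete do |response|
    puts "#{response.code} #{response.effective_url}"
  end
  hydra.queue(request)
  visited.insert(url)
end

hydra.run   # blocks until every queued request has completed
```

The trade-off is deliberate: a Bloom filter can occasionally report a URL as visited when it is not, which for a crawler only means skipping the odd page, in exchange for a visited set whose memory use does not grow with the size of the crawl.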