pantopoda 0.0.2 → 0.0.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 821e68617f3b8e7b7c746f27e5976dbbb8f002f6
- data.tar.gz: 159c0c2d53776430972374d543376fcab41eee6e
+ metadata.gz: 95eb1fb6916786582ba766cc5b7afae0adab80d2
+ data.tar.gz: ad00abae14a4638d1cf2cb49aea278e176f15aaf
  SHA512:
- metadata.gz: 11c83070a2c952e3364c4d80e698e3adcee8593fb5e1814ea56f67f87bd2a696ed52acf2a07afd4e01a9e4f614819c262f4a0718d7015c15b01f379ea4d6b032
- data.tar.gz: 48fb7e3b406d21d7d2aadb74b15b7b3e9487aee2b1045b0bb40f9e8676da20978ec7a4c0aae968e55b121fdf4d3ed254f9e8c89d6716c52f164bb90f09535715
+ metadata.gz: d7894b766dfeb8171c91bd720610d81a8a7a6c50a874fa574bef9ad37ca7f184640502d6d0dfd87404cc00d3aadc3f67bdb5fcd2d8e94313973135fb24c739af
+ data.tar.gz: 3ae3b9be93335e67c387dab9287f5376284eac5d30633f57b041f205e5067917654f451b648a6eaf9c379c9e661ba7efdc2aae1accc43049be77c1ff775ae721
data/README.md CHANGED
@@ -1,6 +1,8 @@
  # Pantopoda

- TODO: Write a gem description
+ Pantopoda is a Ruby spidering library built out of sheer frustration at the lack of good, modern web-crawling tools like those Python enjoys. It stores the set of visited URLs in a bloom filter, which keeps lookups efficient even across hundreds of thousands of URLs, and requests are handled by Typhoeus to allow multi-threaded crawling.
+
+ Pantopoda will crawl every single page it can find on a particular domain.

  ## Installation

data/lib/pantopoda/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Pantopoda
- VERSION = "0.0.2"
+ VERSION = "0.0.3"
  end
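A minimal usage sketch based on the README description above and the interface visible in the lib/pantopoda.rb diff below; the class name, option keys, and the yielded Typhoeus response are taken from that code, while the URL and option values here are purely illustrative:

    require 'pantopoda'

    # Options mirror those read in #initialize below.
    spider = Pantopoda::Pantopoda.new("http://example.com",
      :exclude_urls_with_hash       => true,
      :exclude_urls_with_extensions => [".pdf", ".jpg"])

    # crawl reads :threads and :max_urls and yields each Typhoeus response.
    spider.crawl(:threads => 4, :max_urls => 10_000) do |response|
      puts "#{response.code} #{response.effective_url}"
    end

As the crawl method below shows, the block runs once per fetched page, and only links that resolve to the same domain as the start URL are queued for further crawling.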
data/lib/pantopoda.rb CHANGED
@@ -5,137 +5,139 @@ require 'nokogiri'
  require 'domainatrix'
  require 'uri'

- class Pantopoda
- def initialize(url, options = {})
- @start_url = url
- @domain = parse_domain(url)
-
- @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
- @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
- @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
- end
+ module Pantopoda
+ class Pantopoda
+ def initialize(url, options = {})
+ @start_url = url
+ @domain = parse_domain(url)
+
+ @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
+ @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
+ @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
+ end

- def crawl(options = {})
- # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
- threads = options[:threads] ? options[:threads] : 1
-
- # Defaults to -1 so it will always keep running until it runs out of urls
- max_urls = options[:max_urls] ? options[:max_urls] : nil
-
- @hydra = Typheous::Hydra.new(:max_concurrency => threads)
- @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
- @global_queue = []
-
- @global_queue << @start_url
-
- while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
- temp_queue = @global_queue
- temp_queue.each do |q|
- begin
- ip,port,user,pass = nil
-
- request = Typheous::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
- request.on_complete do |response|
- yield response
- links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
- links.each do |link|
- if (internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
- sanitized_link = sanitized_link(split_url_at_hash(link))
- if (sanitized_link)
- absolute_link = make_absolute(sanitized_link, response.effective_url)
- if absolute_link
- @global_queue << absolute_link
+ def crawl(options = {})
+ # Defaults to 1 thread, so we won't do a crazy amount of crawling on domains
+ threads = options[:threads] ? options[:threads] : 1
+
+ # Defaults to nil so it will always keep running until it runs out of urls
+ max_urls = options[:max_urls] ? options[:max_urls] : nil
+
+ @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+ @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+ @global_queue = []
+
+ @global_queue << @start_url
+
+ while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
+ temp_queue = @global_queue
+ temp_queue.each do |q|
+ begin
+ ip,port,user,pass = nil
+
+ request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
+ request.on_complete do |response|
+ yield response
+ links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+ links.each do |link|
+ if (internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+ sanitized_link = sanitize_link(split_url_at_hash(link))
+ if (sanitized_link)
+ absolute_link = make_absolute(sanitized_link, response.effective_url)
+ if absolute_link
+ @global_queue << absolute_link
+ end
  end
  end
  end
  end
- end

- @hydra.queue request
+ @hydra.queue request

- rescue URI::InvalidURIError, NoMethodError => e
- puts "Exception caught: #{e}" if @debug == true
+ rescue URI::InvalidURIError, NoMethodError => e
+ puts "Exception caught: #{e}" if @debug == true
+ end
+
+ @global_visited.insert(q)
+ @global_queue.delete(q)
  end

- @global_visited.insert(q)
- @global_queue.delete(q)
+ @hydra.run
  end
-
- @hydra.run
  end
- end

- def parse_domain(url)
- puts "Parsing URL: #{url}"
+ def parse_domain(url)
+ puts "Parsing URL: #{url}"

- begin
- parsed_domain = Domainatrix.parse(url)
- if (parsed_domain.subdomain != "")
- parsed_domain.subdomain + '.' + parsed_domain + '.' + parse_domain.public_suffix
- else
- parse_domain.domain + '.' + parsed_domain.public_suffix
- end
+ begin
+ parsed_domain = Domainatrix.parse(url)
+ if (parsed_domain.subdomain != "")
+ parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+ else
+ parsed_domain.domain + '.' + parsed_domain.public_suffix
+ end

- rescue NoMethodError, Addressable::URI::InvalidURIError => e
- puts "URL Parsing Exception (#{url}) : #{e}"
- return nil
+ rescue NoMethodError, Addressable::URI::InvalidURIError => e
+ puts "URL Parsing Exception (#{url}) : #{e}"
+ return nil
+ end
  end
- end

- def internal_link?(url, effective_url)
- absolute_url = make_absolute(url, effective_url)
- parsed_url = parse_domain(absolute_url)
- if (@domain == parsed_url)
- return true
- else
- return false
+ def internal_link?(url, effective_url)
+ absolute_url = make_absolute(url, effective_url)
+ parsed_url = parse_domain(absolute_url)
+ if (@domain == parsed_url)
+ return true
+ else
+ return false
+ end
  end
- end

- def split_url_at_hash(url)
- return url.to_s unless @split_url_at_hash
- return url.to_s.split('#')[0]
- end
+ def split_url_at_hash(url)
+ return url.to_s unless @split_url_at_hash
+ return url.to_s.split('#')[0]
+ end

- def no_hash_in_url?(url)
- return true unless @exclude_urls_with_hash
+ def no_hash_in_url?(url)
+ return true unless @exclude_urls_with_hash

- if(url.to_s.scan(/#/).size > 0)
- return false
- else
- return true
+ if(url.to_s.scan(/#/).size > 0)
+ return false
+ else
+ return true
+ end
  end
- end

- def ignore_extensions(url)
- return true if url.to_s.length == 0
- return true unless @exclude_urls_with_extensions
+ def ignore_extensions(url)
+ return true if url.to_s.length == 0
+ return true unless @exclude_urls_with_extensions

- not_found = true
+ not_found = true

- @exclude_urls_with_extensions.each do |e|
- if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
- not_found = false
- puts "#{e} Found At URL: #{url}"
+ @exclude_urls_with_extensions.each do |e|
+ if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
+ not_found = false
+ puts "#{e} Found At URL: #{url}"
+ end
  end
- end

- return not_found
- end
+ return not_found
+ end

- def sanitize_link(url)
- begin
- return url.gsub(/\s+/, "%20")
- rescue
- return false
+ def sanitize_link(url)
+ begin
+ return url.gsub(/\s+/, "%20")
+ rescue
+ return false
+ end
  end
- end

- def make_absolute(href, root)
- begin
- URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
- rescue URI::InvalidURIError, URI::InvalidComponentError => e
- return false
- end
+ def make_absolute(href, root)
+ begin
+ URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
+ rescue URI::InvalidURIError, URI::InvalidComponentError => e
+ return false
+ end
+ end
  end
  end
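For reference, a sketch of the visited-URL bookkeeping the crawl method relies on, assuming the bloomfilter-rb gem that provides BloomFilter::Native; the constructor arguments are copied from the diff above, while the require name and the toy queue loop are illustrative rather than part of the gem:

    require 'bloomfilter-rb'

    # Same parameters as the crawler's @global_visited filter above:
    # ~1M bits, 5 hash functions, fixed seed, 8-bit counting buckets.
    visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)

    queue = ["http://example.com/"]

    until queue.empty?
      url = queue.shift
      next if visited.include?(url)   # probabilistic membership test
      visited.insert(url)             # mark as crawled
      # ... fetch the page and push newly discovered internal links onto queue ...
    end

A bloom filter can return false positives but never false negatives, so a page may occasionally be skipped as "already visited"; that is the trade-off accepted here for constant-memory deduplication across hundreds of thousands of URLs.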
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: pantopoda
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  platform: ruby
  authors:
  - Gabriel Lim