pantopoda 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2f6ca88cbcd2df8c738d31bea859f65165dd9253
4
+ data.tar.gz: 1b32bc531837a4343788fc8c5a5941a77fac4ce7
5
+ SHA512:
6
+ metadata.gz: 93f04b120ea1b9c7615aa3b275b6eae758a781743d0e7cc2ecb1c74fc1e5bb6f76d0dba40ebd9518474400efab5b2bcaa967a0023530e44a5c5d799f21eed000
7
+ data.tar.gz: ee84ff6889c09c993290a996fe6a173011873661076f76717a4b6b7d43b730edff30a32fe37ffd6252f686f8b39df8f298120d607e596dfd407b6784ecd71839
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Bundler manifest for developing the pantopoda gem.
source 'https://rubygems.org'

# All dependencies (including domainatrix) are declared once, in
# pantopoda.gemspec, and loaded from there. The former extra
# `gem 'domainatrix'` entry fetched over plain HTTP from the legacy
# gemcutter.org host and duplicated the gemspec declaration, so it was dropped.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Gabriel Lim
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # Pantopoda
2
+
3
+ Pantopoda is a fast, effective web crawler that visits all the links on a given domain.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'pantopoda'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install pantopoda
20
+
21
+ ## Usage
22
+
23
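+ Create a crawler with `Pantopoda.new(url)` and call `crawl` with a block; the block receives the response of every page that is fetched.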
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/pantopoda/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,3 @@
1
# Holds the gem version.
#
# Declared as a class (not a module) because lib/pantopoda.rb requires this
# file and then opens `class Pantopoda`; reopening a module with the `class`
# keyword raises TypeError at load time.
class Pantopoda
  # Version string read by pantopoda.gemspec.
  VERSION = "0.0.1"
end
data/lib/pantopoda.rb ADDED
@@ -0,0 +1,141 @@
1
+ require "pantopoda/version"
2
+ require 'typhoeus'
3
+ require 'bloomfilter-rb'
4
+ require 'nokogiri'
5
+ require 'domainatrix'
6
+ require 'uri'
7
+
8
# Crawls the internal links of a single domain, starting from one URL.
#
# Pages are requested through a Typhoeus::Hydra, already-requested URLs are
# remembered in a BloomFilter::Native, and every completed response is yielded
# to the block given to #crawl.
class Pantopoda
  # @param url [String] URL the crawl starts from; the domain parsed from it
  #   decides which links count as internal
  # @param options [Hash]
  # @option options [Boolean] :split_url_at_hash drop everything from the
  #   first '#' of a link (default false)
  # @option options [Boolean] :exclude_urls_with_hash skip links that contain
  #   a '#' (default false)
  # @option options [Array<String>, false] :exclude_urls_with_extensions
  #   suffixes (e.g. ".pdf") of links to skip (default false)
  # @option options [Boolean] :debug print exceptions rescued while requests
  #   are being queued (default false)
  def initialize(url, options = {})
    @split_url_at_hash = options[:split_url_at_hash] || false
    @exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
    @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] || false
    @debug = options[:debug] || false

    @start_url = url
    @domain = parse_domain(url)
  end

  # Runs the crawl, yielding each completed response to the caller's block.
  #
  # @param options [Hash]
  # @option options [Integer] :threads max concurrency of the hydra (default 1,
  #   so a domain is not hammered by accident)
  # @option options [Integer, nil] :max_urls stop starting new batches once
  #   this many URLs were recorded as visited; nil (default) crawls until the
  #   queue is empty
  # @yield [response] the completed response of every requested page
  # @return [nil]
  def crawl(options = {})
    threads = options[:threads] || 1
    max_urls = options[:max_urls]

    @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
    @global_queue = [@start_url]

    while @global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls)
      # Work on a de-duplicated snapshot: the live queue is modified below and
      # must not be mutated while it is being iterated.
      @global_queue.uniq.each do |q|
        begin
          # NOTE(review): Typhoeus documents :timeout in seconds, so 10000 looks
          # like it was meant as milliseconds -- confirm before changing.
          request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)
          request.on_complete do |response|
            yield response if block_given?
            enqueue_links(response)
          end

          @hydra.queue request
        rescue URI::InvalidURIError, NoMethodError => e
          puts "Exception caught: #{e}" if @debug
        end

        # Marked as visited when queued, so links found during this batch's
        # run do not schedule the same URL again.
        @global_visited.insert(q)
        @global_queue.delete(q)
      end

      # Completion callbacks fire here and refill @global_queue.
      @hydra.run
    end
  end

  # Builds "subdomain.domain.suffix" (or "domain.suffix" when there is no
  # subdomain) from the parts Domainatrix extracts.
  #
  # @param url [String]
  # @return [String, nil] nil when the URL cannot be parsed
  def parse_domain(url)
    puts "Parsing URL: #{url}"

    parsed_domain = Domainatrix.parse(url)
    registered = "#{parsed_domain.domain}.#{parsed_domain.public_suffix}"
    if parsed_domain.subdomain.to_s.empty?
      registered
    else
      "#{parsed_domain.subdomain}.#{registered}"
    end
  rescue NoMethodError, Addressable::URI::InvalidURIError => e
    puts "URL Parsing Exception (#{url}) : #{e}"
    nil
  end

  # @param url [#to_s] link as found on the page
  # @param effective_url [String] URL of the page the link was found on
  # @return [Boolean] true when the link resolves to the crawl's own domain
  def internal_link?(url, effective_url)
    absolute_url = make_absolute(url, effective_url)
    # make_absolute signals failure with false; never feed that to the parser.
    return false unless absolute_url

    link_domain = parse_domain(absolute_url)
    !link_domain.nil? && link_domain == @domain
  end

  # @param url [#to_s]
  # @return [String] the URL unchanged, or cut before the first '#' when the
  #   :split_url_at_hash option is on
  def split_url_at_hash(url)
    return url.to_s unless @split_url_at_hash

    # to_s keeps the result a String for "" and "#", where split yields [].
    url.to_s.split('#', 2).first.to_s
  end

  # @param url [#to_s]
  # @return [Boolean] false only when :exclude_urls_with_hash is on and the
  #   URL contains a '#'
  def no_hash_in_url?(url)
    return true unless @exclude_urls_with_hash

    !url.to_s.include?('#')
  end

  # @param url [#to_s]
  # @return [Boolean] false when the URL ends (case-insensitively) with one of
  #   the :exclude_urls_with_extensions suffixes, true otherwise
  def ignore_extensions(url)
    target = url.to_s
    return true if target.empty?
    return true unless @exclude_urls_with_extensions

    not_found = true
    @exclude_urls_with_extensions.each do |e|
      suffix = e.to_s
      next unless target.length > suffix.size && target.downcase.end_with?(suffix.downcase)

      not_found = false
      puts "#{e} Found At URL: #{url}"
    end
    not_found
  end

  # Replaces every run of whitespace with "%20".
  #
  # @param url [String]
  # @return [String, false] false when url does not support gsub
  def sanitize_link(url)
    url.gsub(/\s+/, "%20")
  rescue StandardError
    false
  end

  # Resolves href against root after whitespace escaping and optional hash
  # splitting.
  #
  # @param href [#to_s] link as found on the page
  # @param root [String] URL of the page the link was found on
  # @return [String, false] the absolute URL, or false when either part is not
  #   a valid URI
  def make_absolute(href, root)
    URI.parse(root).merge(URI.parse(split_url_at_hash(sanitize_link(href.to_s)))).to_s
  rescue URI::InvalidURIError, URI::InvalidComponentError
    false
  end

  private

  # Queues every not-yet-visited internal link of a fetched page that passes
  # the hash and extension filters.
  def enqueue_links(response)
    base = response.effective_url
    Nokogiri::HTML.parse(response.body).xpath('.//a/@href').each do |link|
      next unless internal_link?(link, base)

      absolute_link = make_absolute(link, base)
      next if !absolute_link || @global_visited.include?(absolute_link)
      next unless no_hash_in_url?(link) && ignore_extensions(link)

      @global_queue << absolute_link
    end
  end
end
data/pantopoda.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'pantopoda/version'
5
+
6
# Packaging manifest for the pantopoda gem: identity, shipped files and
# dependencies.
Gem::Specification.new do |s|
  # Identity and descriptive metadata.
  s.name          = "pantopoda"
  s.version       = Pantopoda::VERSION
  s.authors       = ["Gabriel Lim"]
  s.email         = ["gabriel@saleswhale.io"]
  s.summary       = "Efficient domain web spider."
  s.description   = "Pantopoda is a web crawler that visits all links on a given domain that's fast and effective."
  s.homepage      = ""
  s.license       = "MIT"

  # Ship every git-tracked file; executables and tests are picked out of that
  # list by path prefix.
  s.files         = `git ls-files -z`.split("\x0")
  s.executables   = s.files.grep(%r{^bin/}) { |path| File.basename(path) }
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  # Tooling needed only while developing the gem.
  s.add_development_dependency "bundler", "~> 1.7"
  s.add_development_dependency "rake", "~> 10.0"

  # Runtime dependencies; the first three are pinned to exact versions.
  s.add_dependency('typhoeus', '0.7.2')
  s.add_dependency('bloomfilter-rb', '2.1.1')
  s.add_dependency('nokogiri', '1.6.2')
  s.add_dependency('domainatrix')
end
metadata ADDED
@@ -0,0 +1,137 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pantopoda
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Gabriel Lim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: typhoeus
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 0.7.2
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 0.7.2
55
+ - !ruby/object:Gem::Dependency
56
+ name: bloomfilter-rb
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 2.1.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 2.1.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 1.6.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '='
81
+ - !ruby/object:Gem::Version
82
+ version: 1.6.2
83
+ - !ruby/object:Gem::Dependency
84
+ name: domainatrix
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Pantopoda is a web crawler that visits all links on a given domain that's
98
+ fast and effective.
99
+ email:
100
+ - gabriel@saleswhale.io
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - lib/pantopoda.rb
111
+ - lib/pantopoda/version.rb
112
+ - pantopoda.gemspec
113
+ homepage: ''
114
+ licenses:
115
+ - MIT
116
+ metadata: {}
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubyforge_project:
133
+ rubygems_version: 2.4.5
134
+ signing_key:
135
+ specification_version: 4
136
+ summary: Efficient domain web spider.
137
+ test_files: []