site_mapper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/site_mapper +6 -0
- data/lib/site_mapper/crawl_url.rb +74 -0
- data/lib/site_mapper/crawler.rb +56 -0
- data/lib/site_mapper/request.rb +29 -0
- data/lib/site_mapper/version.rb +3 -0
- data/lib/site_mapper.rb +12 -0
- metadata +150 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c4123dd260ed51448c09ecca619fd3bf9733cb3b
|
4
|
+
data.tar.gz: f2fbb4cfc682dec7da9188048e89856326d68b9e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 87cd845d828c07778a62bf428cad6b66e0cc9a74fa5b43a7d0676a020d6c282459b7f488d33a02f14a00125b59105cfb9b53e1db6587c5ea9d7dfc75a07e5034
|
7
|
+
data.tar.gz: c62359abc64089d6a8f6e8672657bdf2b56b6dfca747dd831790f9b0dd6ba8acc6b1094aa1e61effd9ffa86691367a9055b229809095809afcc89c323e1ab1c2
|
data/bin/site_mapper
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
module SiteMapper
  # Converts the raw +href+ values found on crawled pages into absolute URLs
  # and decides whether they belong to the site being crawled.
  class CrawlUrl
    # Link prefixes that never lead to a crawlable page.
    IGNORED_PREFIXES  = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #).freeze
    # Substrings that disqualify a link wherever they appear in it.
    IGNORED_FRAGMENTS = %w(/email-protection#).freeze
    # File extensions of downloads that should not be queued.
    IGNORED_SUFFIXES  = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat).freeze
    # Schemes that are accepted for queued links.
    CRAWLABLE_SCHEMES = %w(http https).freeze

    attr_reader :resolved_base_url, :base_hostname

    # @param base_url [String] URL the crawl starts from; when it carries no
    #   http(s) scheme, +http://+ is assumed.
    def initialize(base_url)
      resolved = Request.resolve_url(base_url).to_s
      # Build a new string rather than mutating the resolver's return value.
      resolved = "http://#{resolved}" unless resolved =~ %r{\Ahttps?://}i
      @resolved_base_url = resolved
      # Parse only once the scheme is present: a scheme-less URL has no host.
      @base_hostname = URI.parse(resolved).hostname
    end

    # Resolves a link found on a page to an absolute URL.
    #
    # @param raw_url [String, nil] the +href+ value as found in the page.
    # @param get_url [String] absolute URL of the page the link was found on;
    #   relative links are resolved against it.
    # @return [String, nil] the absolute URL, or nil when the link is not
    #   eligible, cannot be parsed, is not http(s), or points at another host.
    def absolute_url_from(raw_url, get_url)
      return nil unless eligible_url?(raw_url)

      absolute = URI.join(get_url, raw_url.strip)
      return nil unless CRAWLABLE_SCHEMES.include?(absolute.scheme.to_s.downcase)
      return nil unless same_host?(absolute)

      absolute.to_s
    rescue URI::Error
      # Malformed links are skipped instead of being queued as garbage URLs.
      nil
    end

    private

    # @param uri [URI::Generic] an absolute URI.
    # @return [Boolean] true when +uri+ has the same host as the base URL
    #   (compared case-insensitively).
    def same_host?(uri)
      host = uri.hostname
      return false if host.nil? || base_hostname.nil?

      host.downcase == base_hostname.downcase
    end

    # @param href [String, nil] raw link value.
    # @return [Boolean] false for blank links and for links matching one of
    #   the ignore lists (matched case-insensitively), true otherwise.
    def eligible_url?(href)
      return false if href.nil?

      candidate = href.strip.downcase
      return false if candidate.empty?
      return false if candidate.start_with?(*IGNORED_PREFIXES)
      return false if IGNORED_FRAGMENTS.any? { |fragment| candidate.include?(fragment) }

      !candidate.end_with?(*IGNORED_SUFFIXES)
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module SiteMapper
  # Crawls a site starting from a base URL, following the links that
  # CrawlUrl accepts, and collects every URL it has fetched.
  class Crawler
    # Matches Request::INFO_LINK; this constant previously pointed at the
    # wayback_archiver gem page.
    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    HEADERS_HASH = {
      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
    }.freeze

    # @param url [String] URL the crawl starts from.
    # @param resolve [Boolean] when true, every discovered link is passed
    #   through Request.resolve_url before it is queued.
    def initialize(url, resolve = false)
      @options     = { resolve: resolve }
      # CrawlUrl resolves the URL itself, so resolving it here as well would
      # only repeat the lookup.
      @crawl_url   = CrawlUrl.new(url)
      @fetch_queue = Set.new
      @processed   = Set.new
      @fetch_queue << @crawl_url.resolved_base_url
    end

    # Convenience wrapper: builds a crawler and runs it.
    #
    # @param base_url [String] URL the crawl starts from.
    # @param resolve [Boolean] forwarded to #initialize.
    # @return [Array<String>] see #collect_urls.
    def self.collect_urls(base_url, resolve = false)
      new(base_url, resolve).collect_urls
    end

    # Works through the fetch queue until it is empty.
    #
    # @return [Array<String>] the processed URLs; when the crawl is
    #   interrupted, the URLs still waiting in the queue are returned instead.
    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    rescue *interrupt_errors
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
    end

    private

    # Exception classes that signal a user interrupt. IRB::Abort is included
    # only when IRB is loaded; naming it unconditionally in a rescue clause
    # raises NameError (hiding the real error) whenever another exception
    # reaches that clause outside IRB.
    def interrupt_errors
      errors = [Interrupt]
      errors << IRB::Abort if defined?(IRB::Abort)
      errors
    end

    # Fetches one page, marks it as processed and queues every eligible,
    # not yet processed link found on it.
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
      anchors = link_elements(get_url)
      @processed << get_url
      anchors.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        next unless absolute_url

        resolved_url = resolve(absolute_url)
        @fetch_queue << resolved_url unless @processed.include?(resolved_url)
      end
    end

    # Returns the anchor elements of the page at +url+. A page that cannot be
    # fetched or parsed is reported on stderr and treated as having no links,
    # so one bad page does not abort the crawl.
    def link_elements(url)
      Request.get_page(url).css('a')
    rescue StandardError => e
      warn "Skipping #{url}: #{e.class}: #{e.message}"
      []
    end

    # Resolves +url+ through Request.resolve_url when the crawler was created
    # with resolve enabled; otherwise returns it unchanged.
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'url_resolver' # TODO: Allow users to use any resolver
|
2
|
+
|
3
|
+
module SiteMapper
  # HTTP helpers used by the crawler: fetch a page, fetch a raw response and
  # resolve a URL through UrlResolver.
  class Request
    INFO_LINK  = 'https://rubygems.org/gems/site_mapper'
    # Sent as the User-Agent header of every request.
    USER_AGENT = "SiteMapper/#{VERSION} (+#{INFO_LINK})"

    class << self
      # Fetches +url+ and parses the response body as HTML.
      #
      # @param url [String] absolute URL to fetch.
      # @param document_type [Symbol] currently ignored; the body is always
      #   parsed with Nokogiri::HTML.
      # @return [Nokogiri::HTML::Document]
      def get_page(url, document_type = :html)
        Nokogiri::HTML(get_response(url).body)
      end

      # Performs a GET request and returns the response.
      #
      # @param url [String] absolute URL to fetch.
      # @param resolve [Boolean] when true, +url+ is first passed through
      #   resolve_url.
      # @return [Net::HTTPResponse]
      def get_response(url, resolve = false)
        resolved_url = resolve ? resolve_url(url) : url
        uri          = URI.parse(resolved_url)
        http         = Net::HTTP.new(uri.host, uri.port)
        # Decide on the parsed scheme: a substring test would also turn SSL on
        # for an http URL that merely contains "https://" in its query string.
        http.use_ssl = true if uri.scheme == 'https'

        request = Net::HTTP::Get.new(uri.request_uri)
        request['User-Agent'] = USER_AGENT
        http.request(request)
      end

      # @param url [String] URL to resolve.
      # @return whatever UrlResolver.resolve returns for +url+.
      def resolve_url(url)
        UrlResolver.resolve(url)
      end
    end
  end
end
|
data/lib/site_mapper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: site_mapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jacob Burenstam
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: url_resolver
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.3'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.1'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.1'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ~>
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.8'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.8'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: coveralls
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.7'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.7'
|
111
|
+
description: Find all links on domain.
|
112
|
+
email:
|
113
|
+
- burenstam@gmail.com
|
114
|
+
executables:
|
115
|
+
- site_mapper
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- bin/site_mapper
|
120
|
+
- lib/site_mapper.rb
|
121
|
+
- lib/site_mapper/crawl_url.rb
|
122
|
+
- lib/site_mapper/crawler.rb
|
123
|
+
- lib/site_mapper/request.rb
|
124
|
+
- lib/site_mapper/version.rb
|
125
|
+
homepage: https://github.com/buren/site_mapper
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata: {}
|
129
|
+
post_install_message:
|
130
|
+
rdoc_options: []
|
131
|
+
require_paths:
|
132
|
+
- lib
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - '>='
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: 1.9.3
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
requirements: []
|
144
|
+
rubyforge_project:
|
145
|
+
rubygems_version: 2.2.2
|
146
|
+
signing_key:
|
147
|
+
specification_version: 4
|
148
|
+
summary: Find all links on domain
|
149
|
+
test_files: []
|
150
|
+
has_rdoc:
|