site_mapper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/site_mapper +6 -0
- data/lib/site_mapper/crawl_url.rb +74 -0
- data/lib/site_mapper/crawler.rb +56 -0
- data/lib/site_mapper/request.rb +29 -0
- data/lib/site_mapper/version.rb +3 -0
- data/lib/site_mapper.rb +12 -0
- metadata +150 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c4123dd260ed51448c09ecca619fd3bf9733cb3b
|
4
|
+
data.tar.gz: f2fbb4cfc682dec7da9188048e89856326d68b9e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 87cd845d828c07778a62bf428cad6b66e0cc9a74fa5b43a7d0676a020d6c282459b7f488d33a02f14a00125b59105cfb9b53e1db6587c5ea9d7dfc75a07e5034
|
7
|
+
data.tar.gz: c62359abc64089d6a8f6e8672657bdf2b56b6dfca747dd831790f9b0dd6ba8acc6b1094aa1e61effd9ffa86691367a9055b229809095809afcc89c323e1ab1c2
|
data/bin/site_mapper
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
module SiteMapper
  # Turns raw href values found on a crawled page into absolute URLs and
  # filters out links that should not be crawled (other hosts, mail/phone
  # links, downloadable archives, ...).
  class CrawlUrl
    attr_reader :resolved_base_url, :base_hostname

    # @param base_url [String] start URL of the crawl. It is passed through
    #   Request.resolve_url and gets an "http://" prefix when the result does
    #   not already start with "http".
    def initialize(base_url)
      resolved = Request.resolve_url(base_url)
      # Build a new string instead of prepending in place: the resolver's
      # return value may be frozen or shared with the caller.
      resolved = "http://#{resolved}" unless resolved.start_with?('http')
      @resolved_base_url = resolved
      # Parse only once the scheme is present; URI.parse('example.com') has
      # no host component, which used to leave base_hostname nil.
      @base_hostname = URI.parse(resolved).hostname
    end

    # Converts a link found on a page to an absolute URL.
    #
    # @param raw_url [String, nil] href value as found in the document
    # @param get_url [String] URL of the page the link was found on
    # @return [String, nil] the absolute URL, or nil when the link is not
    #   eligible or is an absolute URL that is not an http(s) URL on the
    #   same host as the base URL
    def absolute_url_from(raw_url, get_url)
      return nil unless eligible_url?(raw_url)

      parsed_url = parse_or_blank(raw_url)
      return url_from_relative(raw_url, get_url) if parsed_url.relative?

      crawlable = http_scheme?(parsed_url) && same_domain?(raw_url, @resolved_base_url)
      crawlable ? raw_url : nil
    end

    private

    # Parses +url+; an unparsable value yields an empty (hence relative) URI
    # so the caller keeps its best-effort handling of malformed hrefs.
    def parse_or_blank(url)
      URI.parse(url)
    rescue URI::Error
      URI.parse('')
    end

    # True when the parsed URI uses the http or https scheme.
    def http_scheme?(parsed_url)
      %w(http https).include?(parsed_url.scheme.to_s.downcase)
    end

    # Builds an absolute URL for a relative href.
    #
    # NOTE(review): root-relative and plain relative links are joined onto the
    # resolved base URL, not onto the current page; only "../" links take the
    # current page into account. Confirm this is intended.
    def url_from_relative(url, current_page_url)
      if url.start_with?('/')
        "#{without_path_suffix(resolved_base_url)}#{url}"
      elsif url.start_with?('../')
        url_from_dotted_url(url, current_page_url).to_s
      else
        "#{with_path_suffix(resolved_base_url)}#{url}"
      end
    end

    # Resolves a "../"-prefixed href against the current page URL by cutting
    # the page URL at its last "/" repeatedly.
    #
    # The loop runs once more than the number of "../" found, capped at
    # max_levels passes: the first pass only drops the trailing slash added
    # below, each later pass drops one path segment.
    def url_from_dotted_url(url, current_page_url)
      absolute_url = with_path_suffix(current_page_url.dup)
      found_dots = without_path_suffix(url).scan('../').length
      removed_dots = 0
      max_levels = 4
      while found_dots >= removed_dots && max_levels > removed_dots
        index = absolute_url.rindex('/')
        break unless index

        absolute_url = absolute_url[0..(index - 1)]
        removed_dots += 1
      end
      "#{with_path_suffix(absolute_url)}#{url.gsub('../', '')}"
    end

    # Returns a copy of +passed_url+ that ends with exactly the original text
    # plus a trailing "/" when it did not already end with one.
    def with_path_suffix(passed_url)
      passed_url.end_with?('/') ? passed_url.dup : "#{passed_url}/"
    end

    # Returns a copy of +passed_url+ with one trailing "/" removed, if present.
    def without_path_suffix(passed_url)
      passed_url.chomp('/')
    end

    # Decides whether an href is worth resolving at all.
    #
    # @return [Boolean] false for nil/empty values, for hrefs starting with a
    #   non-page scheme or "#", for e-mail protection links and for hrefs
    #   ending in a known download extension; true otherwise
    def eligible_url?(href)
      return false if href.nil? || href.empty?

      dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
      dont_include = %w(/email-protection#)
      dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)

      return false if href.start_with?(*dont_start)
      return false if dont_include.any? { |pattern| href.include?(pattern) }

      !href.end_with?(*dont_end)
    end

    # True when both URLs parse and have the same host (case-insensitive).
    # A substring test is not enough here: "http://example.com.evil.org/" and
    # "http://evil.org/?next=http://example.com" both contain the base URL.
    def same_domain?(first, second)
      first_host = host_of(first)
      !first_host.nil? && first_host == host_of(second)
    end

    # Lower-cased host component of +url+, or nil when it has none or cannot
    # be parsed.
    def host_of(url)
      host = URI.parse(url).host
      host && host.downcase
    rescue URI::Error
      nil
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'set'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module SiteMapper
  # Collects the links of a site by fetching pages from a queue, starting at
  # the base URL, and queueing every eligible link found on each page.
  class Crawler
    # Points at this gem, matching Request::INFO_LINK (it previously named a
    # different gem).
    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    HEADERS_HASH = {
      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
    }

    # @param url [String] URL to start crawling from; it is resolved through
    #   Request.resolve_url before use
    # @param resolve [Boolean] when true, every link found is also passed
    #   through Request.resolve_url before it is queued
    def initialize(url, resolve = false)
      base_url = Request.resolve_url(url)
      @options = { resolve: resolve }
      @crawl_url = CrawlUrl.new(base_url)
      @fetch_queue = Set.new
      @processed = Set.new
      @fetch_queue << @crawl_url.resolved_base_url
    end

    # Convenience wrapper: crawls +base_url+ with default options.
    #
    # @param base_url [String] URL to start crawling from
    # @return [Array<String>] see #collect_urls
    def self.collect_urls(base_url)
      new(base_url).collect_urls
    end

    # Fetches queued URLs, in the order they were queued, until the queue is
    # empty. Progress is printed to standard output.
    #
    # @return [Array<String>] the processed URLs; when the crawl is
    #   interrupted, the URLs still waiting in the queue instead
    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    rescue *interrupt_errors
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
    end

    private

    # Exception classes that mean "the user stopped the crawl".
    # IRB::Abort is included only when IRB is loaded: naming the constant
    # directly in the rescue clause raised NameError outside of IRB whenever
    # an exception other than Interrupt reached that clause.
    def interrupt_errors
      errors = [Interrupt]
      errors << IRB::Abort if defined?(IRB::Abort)
      errors
    end

    # Fetches +get_url+, marks it as processed and queues every link on the
    # page that resolves to a crawlable URL and has not been processed yet.
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
      link_elements = fetch_link_elements(get_url)
      @processed << get_url
      link_elements.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        next unless absolute_url

        resolved_url = resolve(absolute_url)
        @fetch_queue << resolved_url unless @processed.include?(resolved_url)
      end
    end

    # Returns the anchor elements of the page at +get_url+. A page that
    # cannot be fetched or parsed contributes no links; the crawl goes on and
    # the failure is reported on standard error instead of being swallowed.
    def fetch_link_elements(get_url)
      Request.get_page(get_url).css('a')
    rescue StandardError => e
      warn "Skipping #{get_url}: #{e.class}: #{e.message}"
      []
    end

    # Resolves +url+ through Request.resolve_url when the crawler was created
    # with resolve enabled; otherwise returns it unchanged.
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'url_resolver' # TODO: Allow users to use any resolver
|
2
|
+
|
3
|
+
module SiteMapper
  # HTTP helpers used by the crawler: fetching a page, parsing it and
  # resolving a URL.
  class Request
    INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    USER_AGENT = "SiteMapper/#{VERSION} (+#{INFO_LINK})"

    class << self
      # Fetches +url+ and parses the response body as HTML.
      #
      # @param url [String] URL of the page to fetch
      # @param document_type [Symbol] currently ignored; the body is always
      #   parsed with Nokogiri::HTML
      # @return the document returned by Nokogiri::HTML
      def get_page(url, document_type = :html)
        Nokogiri::HTML(get_response(url).body)
      end

      # Performs a GET request, sending the gem's User-Agent header.
      #
      # @param url [String] URL to request
      # @param resolve [Boolean] when true, +url+ is first passed through
      #   resolve_url
      # @return the response object returned by Net::HTTP#request
      def get_response(url, resolve = false)
        resolved_url = resolve ? resolve_url(url) : url
        uri = URI.parse(resolved_url)
        http = Net::HTTP.new(uri.host, uri.port)
        # Decide on TLS from the parsed scheme. A substring test would also
        # switch TLS on for an http URL that merely contains "https://"
        # somewhere, e.g. in its query string.
        http.use_ssl = uri.scheme.to_s.downcase == 'https'

        request = Net::HTTP::Get.new(uri.request_uri)
        request['User-Agent'] = USER_AGENT
        http.request(request)
      end

      # Delegates to UrlResolver.resolve.
      #
      # @param url [String] URL to resolve
      # @return whatever UrlResolver.resolve returns for +url+
      #   (presumably the final URL after redirects — TODO confirm)
      def resolve_url(url)
        UrlResolver.resolve(url)
      end
    end
  end
end
|
data/lib/site_mapper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: site_mapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jacob Burenstam
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: url_resolver
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.3'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.3'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.1'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.1'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: yard
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ~>
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.8'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.8'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: coveralls
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0.7'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0.7'
|
111
|
+
description: Find all links on domain.
|
112
|
+
email:
|
113
|
+
- burenstam@gmail.com
|
114
|
+
executables:
|
115
|
+
- site_mapper
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- bin/site_mapper
|
120
|
+
- lib/site_mapper.rb
|
121
|
+
- lib/site_mapper/crawl_url.rb
|
122
|
+
- lib/site_mapper/crawler.rb
|
123
|
+
- lib/site_mapper/request.rb
|
124
|
+
- lib/site_mapper/version.rb
|
125
|
+
homepage: https://github.com/buren/site_mapper
|
126
|
+
licenses:
|
127
|
+
- MIT
|
128
|
+
metadata: {}
|
129
|
+
post_install_message:
|
130
|
+
rdoc_options: []
|
131
|
+
require_paths:
|
132
|
+
- lib
|
133
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - '>='
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: 1.9.3
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
requirements:
|
140
|
+
- - '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
requirements: []
|
144
|
+
rubyforge_project:
|
145
|
+
rubygems_version: 2.2.2
|
146
|
+
signing_key:
|
147
|
+
specification_version: 4
|
148
|
+
summary: Find all links on domain
|
149
|
+
test_files: []
|
150
|
+
has_rdoc:
|