site_mapper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4123dd260ed51448c09ecca619fd3bf9733cb3b
4
+ data.tar.gz: f2fbb4cfc682dec7da9188048e89856326d68b9e
5
+ SHA512:
6
+ metadata.gz: 87cd845d828c07778a62bf428cad6b66e0cc9a74fa5b43a7d0676a020d6c282459b7f488d33a02f14a00125b59105cfb9b53e1db6587c5ea9d7dfc75a07e5034
7
+ data.tar.gz: c62359abc64089d6a8f6e8672657bdf2b56b6dfca747dd831790f9b0dd6ba8acc6b1094aa1e61effd9ffa86691367a9055b229809095809afcc89c323e1ab1c2
data/bin/site_mapper ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby

require 'site_mapper'

# Command-line entry point: crawls the site whose URL is given as the first
# argument. Exits with a usage message instead of handing nil to the crawler
# when no URL is supplied.
url = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} URL" if url.nil? || url.strip.empty?

SiteMapper.map(url)
@@ -0,0 +1,74 @@
1
module SiteMapper
  # Converts raw +href+ values found on a crawled page into absolute URLs and
  # filters out everything that does not belong to the site being mapped.
  class CrawlUrl
    # Prefixes of links that never point at a crawlable page.
    IGNORED_PREFIXES = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #).freeze
    # Substrings that mark a link as non-crawlable wherever they occur.
    IGNORED_FRAGMENTS = %w(/email-protection#).freeze
    # File extensions of downloads that are skipped.
    IGNORED_SUFFIXES = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat).freeze
    # Only these schemes are followed.
    CRAWLABLE_SCHEMES = %w(http https).freeze

    attr_reader :resolved_base_url, :base_hostname

    # @param base_url [String] site root; a missing scheme defaults to http
    def initialize(base_url)
      resolved = Request.resolve_url(base_url).to_s
      # Build a new string rather than mutating the resolver's return value,
      # and add the scheme BEFORE parsing so the hostname can be extracted.
      resolved = "http://#{resolved}" unless resolved =~ %r{\Ahttps?://}i
      @resolved_base_url = resolved
      @base_hostname = URI.parse(resolved).hostname
    end

    # Resolves a link found on +get_url+ to an absolute URL.
    #
    # @param raw_url [String, nil] the href value as found in the page
    # @param get_url [String] absolute URL of the page the link was found on
    # @return [String, nil] absolute URL on the base host, or nil when the
    #   link is ineligible, malformed or points to another host
    def absolute_url_from(raw_url, get_url)
      href = raw_url.to_s.strip
      return nil unless eligible_url?(href)

      # URI.join implements standard reference resolution: it handles
      # "/path", "../path", "file.html" and "//host/path" relative to the
      # current page, and returns absolute links as they are.
      absolute = URI.join(get_url, href)
      same_site?(absolute) ? absolute.to_s : nil
    rescue URI::Error, ArgumentError
      # Unparsable href (or page URL): nothing sensible to crawl.
      nil
    end

    private

    # True when +uri+ uses http(s) and its host equals the base host
    # (compared case-insensitively).
    def same_site?(uri)
      return false unless CRAWLABLE_SCHEMES.include?(uri.scheme.to_s.downcase)

      host = uri.hostname
      return false if host.nil? || base_hostname.nil?

      host.downcase == base_hostname.downcase
    end

    # False for empty links, non-page schemes, obfuscated e-mail links and
    # binary downloads.
    def eligible_url?(href)
      return false if href.nil? || href.empty?
      return false if href.start_with?(*IGNORED_PREFIXES)
      return false if IGNORED_FRAGMENTS.any? { |fragment| href.include?(fragment) }

      !href.end_with?(*IGNORED_SUFFIXES)
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+
4
module SiteMapper
  # Crawls a site starting at a base URL, following every eligible link that
  # CrawlUrl resolves, and collects the URLs of the pages it visited.
  class Crawler
    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    HEADERS_HASH = {
      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
    }.freeze

    # @param url [String] base URL of the site to crawl
    # @param resolve [Boolean] when true, every discovered URL is passed
    #   through Request.resolve_url before it is queued
    def initialize(url, resolve = false)
      @options = { resolve: resolve }
      # CrawlUrl resolves the base URL itself, so it is not resolved here too.
      @crawl_url = CrawlUrl.new(url)
      @fetch_queue = Set.new
      @processed = Set.new
      @fetch_queue << @crawl_url.resolved_base_url
    end

    # Convenience wrapper: builds a crawler for +base_url+ and runs it.
    #
    # @param base_url [String]
    # @return [Array<String>] see #collect_urls
    def self.collect_urls(base_url)
      new(base_url).collect_urls
    end

    # Processes the queue until it is empty.
    #
    # @return [Array<String>] the processed URLs; if the crawl is
    #   interrupted, the URLs still waiting in the queue are returned instead
    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    rescue *interrupt_errors
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
    end

    private

    # Exception classes that mean "the user aborted the crawl". IRB::Abort is
    # only referenced when IRB is loaded; naming it unconditionally in a
    # rescue clause raises NameError as soon as any other exception has to be
    # matched against it.
    def interrupt_errors
      errors = [Interrupt]
      errors << IRB::Abort if defined?(IRB::Abort)
      errors
    end

    # Fetches +get_url+, marks it as processed and queues every link on it
    # that has not been processed yet.
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
      link_elements = fetch_link_elements(get_url)
      @processed << get_url
      link_elements.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        next unless absolute_url

        resolved_url = resolve(absolute_url)
        @fetch_queue << resolved_url unless @processed.include?(resolved_url)
      end
    end

    # Returns the <a> elements of the page at +url+. A page that cannot be
    # fetched or parsed yields no links; the failure is reported on stderr
    # instead of being dropped silently.
    def fetch_link_elements(url)
      Request.get_page(url).css('a')
    rescue StandardError => e
      warn "Failed to fetch #{url}: #{e.class}: #{e.message}"
      []
    end

    # Optionally resolves +url+, depending on the :resolve option.
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'url_resolver' # TODO: Allow users to use any resolver
2
+
3
module SiteMapper
  # Thin HTTP helper used by the crawler: fetches pages with the gem's
  # User-Agent and delegates URL resolution to UrlResolver.
  class Request
    INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    USER_AGENT = "SiteMapper/#{VERSION} (+#{INFO_LINK})"

    class << self
      # Fetches +url+ and parses the response body as HTML.
      #
      # @param url [String]
      # @param document_type [Symbol] currently unused; the body is always
      #   parsed with Nokogiri::HTML
      # @return [Nokogiri::HTML::Document]
      def get_page(url, document_type = :html)
        Nokogiri::HTML(get_response(url).body)
      end

      # Performs a GET request against +url+.
      #
      # @param url [String]
      # @param resolve [Boolean] resolve the URL before requesting it
      # @return [Net::HTTPResponse]
      def get_response(url, resolve = false)
        resolved_url = resolve ? resolve_url(url) : url
        uri = URI.parse(resolved_url)
        http = Net::HTTP.new(uri.host, uri.port)
        # Decide on TLS from the parsed scheme; a substring search would also
        # match "https://" inside a query string of a plain http URL.
        http.use_ssl = uri.is_a?(URI::HTTPS)

        request = Net::HTTP::Get.new(uri.request_uri)
        request['User-Agent'] = USER_AGENT
        http.request(request)
      end

      # Resolves +url+ through UrlResolver.
      #
      # @param url [String]
      # @return the value returned by UrlResolver.resolve
      def resolve_url(url)
        UrlResolver.resolve(url)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module SiteMapper
  # Gem version; interpolated into the User-Agent strings. Frozen so the
  # shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,12 @@
1
require 'uri'
require 'net/http'

# version must be loaded first: Request and Crawler read VERSION while their
# class bodies are evaluated.
require 'site_mapper/version'
require 'site_mapper/request'
require 'site_mapper/crawler'
require 'site_mapper/crawl_url'

# Public entry point of the gem.
module SiteMapper
  # Crawls the site at +source+.
  #
  # @param source [String] base URL of the site
  # @return [Array<String>] whatever Crawler.collect_urls returns
  def self.map(source)
    Crawler.collect_urls(source)
  end
end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: site_mapper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jacob Burenstam
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: url_resolver
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '10.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '10.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: yard
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: coveralls
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '0.7'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '0.7'
111
+ description: Find all links on domain.
112
+ email:
113
+ - burenstam@gmail.com
114
+ executables:
115
+ - site_mapper
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - bin/site_mapper
120
+ - lib/site_mapper.rb
121
+ - lib/site_mapper/crawl_url.rb
122
+ - lib/site_mapper/crawler.rb
123
+ - lib/site_mapper/request.rb
124
+ - lib/site_mapper/version.rb
125
+ homepage: https://github.com/buren/site_mapper
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - '>='
136
+ - !ruby/object:Gem::Version
137
+ version: 1.9.3
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 2.2.2
146
+ signing_key:
147
+ specification_version: 4
148
+ summary: Find all links on domain
149
+ test_files: []
150
+ has_rdoc: