site_mapper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4123dd260ed51448c09ecca619fd3bf9733cb3b
4
+ data.tar.gz: f2fbb4cfc682dec7da9188048e89856326d68b9e
5
+ SHA512:
6
+ metadata.gz: 87cd845d828c07778a62bf428cad6b66e0cc9a74fa5b43a7d0676a020d6c282459b7f488d33a02f14a00125b59105cfb9b53e1db6587c5ea9d7dfc75a07e5034
7
+ data.tar.gz: c62359abc64089d6a8f6e8672657bdf2b56b6dfca747dd831790f9b0dd6ba8acc6b1094aa1e61effd9ffa86691367a9055b229809095809afcc89c323e1ab1c2
data/bin/site_mapper ADDED
@@ -0,0 +1,6 @@
1
#!/usr/bin/env ruby

# Command-line entry point: crawls the site whose URL is given as the first
# argument by handing it to SiteMapper.map.
require 'site_mapper'

url = ARGV[0]

# Without a URL there is nothing to crawl; fail fast with a usage hint instead
# of passing nil down into the URL resolver.
abort "Usage: #{File.basename($PROGRAM_NAME)} <url>" if url.nil? || url.strip.empty?

SiteMapper.map(url)
@@ -0,0 +1,74 @@
1
module SiteMapper
  # Turns the raw href values found on crawled pages into absolute URLs that
  # belong to the site being crawled, or rejects them.
  class CrawlUrl
    # href prefixes that never point at a crawlable page.
    SKIPPED_PREFIXES = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #).freeze
    # Substrings that mark an href as not crawlable.
    SKIPPED_FRAGMENTS = %w(/email-protection#).freeze
    # File extensions that are skipped instead of being fetched.
    SKIPPED_SUFFIXES = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat).freeze
    # Only links using these schemes are returned.
    HTTP_SCHEMES = %w(http https).freeze

    attr_reader :resolved_base_url, :base_hostname

    # @param base_url [String] start URL of the crawl; it is passed through
    #   Request.resolve_url and gets an "http://" scheme when it has none.
    def initialize(base_url)
      resolved = Request.resolve_url(base_url)
      # Build a new string rather than mutating the resolver's return value,
      # and add the scheme *before* parsing: a scheme-less "example.com"
      # parses as a bare path and would leave the hostname nil.
      resolved = "http://#{resolved}" unless resolved =~ %r{\Ahttps?://}i
      @resolved_base_url = resolved
      @base_hostname = URI.parse(resolved).hostname
    end

    # Converts an href found on a page into an absolute URL on the crawled host.
    #
    # @param raw_url [String, nil] href attribute value as found in the page
    # @param get_url [String] URL of the page the href was found on; relative
    #   hrefs are resolved against it (the base URL is used when it is nil)
    # @return [String, nil] the absolute URL, or nil when the href is not
    #   eligible, cannot be parsed as a URI, or points at another host/scheme
    def absolute_url_from(raw_url, get_url)
      href = raw_url.to_s.strip
      return nil unless eligible_url?(href)
      # URI.join implements standard reference resolution ("/x", "x", "../x",
      # "?q=1", "//host/x") relative to the page the link was found on.
      absolute = URI.join(get_url || resolved_base_url, href)
      same_domain?(absolute) ? absolute.to_s : nil
    rescue URI::Error
      # An href that is not a valid URI cannot be fetched; skip it.
      nil
    end

    private

    # @return [Boolean] false for blank hrefs and for hrefs matching one of
    #   the skipped prefixes, fragments or file suffixes
    def eligible_url?(href)
      return false if href.nil? || href.empty?
      return false if href.start_with?(*SKIPPED_PREFIXES)
      return false if SKIPPED_FRAGMENTS.any? { |fragment| href.include?(fragment) }
      !href.end_with?(*SKIPPED_SUFFIXES)
    end

    # Compares hostnames instead of searching for the base URL as a substring,
    # so links that merely mention the base URL (e.g. in a query string) or
    # that use a look-alike host are not mistaken for on-site links.
    #
    # @param uri [URI::Generic] absolute URI to check
    # @return [Boolean] true when the URI is http(s) and on the base hostname
    def same_domain?(uri)
      return false unless HTTP_SCHEMES.include?(uri.scheme.to_s.downcase)
      host = uri.hostname
      return false if host.nil? || base_hostname.nil?
      host.casecmp(base_hostname) == 0
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+
4
module SiteMapper
  # Crawls a site starting from a base URL: pages are fetched one at a time
  # from a queue, and every on-site link found on a page is queued in turn.
  class Crawler
    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    HEADERS_HASH = {
      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
    }.freeze

    # @param url [String] start URL; CrawlUrl resolves it, so it is not
    #   resolved a second time here
    # @param resolve [Boolean] when true, every discovered link is passed
    #   through Request.resolve_url before it is queued
    def initialize(url, resolve = false)
      @options = { resolve: resolve }
      @crawl_url = CrawlUrl.new(url)
      @fetch_queue = Set.new
      @processed = Set.new
      @fetch_queue << @crawl_url.resolved_base_url
    end

    # Convenience wrapper: builds a crawler for +base_url+ and runs it.
    #
    # @param base_url [String]
    # @return [Array<String>] see #collect_urls
    def self.collect_urls(base_url)
      new(base_url).collect_urls
    end

    # Processes the fetch queue until it is empty.
    #
    # @return [Array<String>] the processed URLs; when the crawl is
    #   interrupted, the URLs still waiting in the queue are returned instead
    #   (NOTE(review): returning the pending queue rather than the processed
    #   set on interrupt is kept as-is - confirm it is intended)
    def collect_urls
      until @fetch_queue.empty?
        url = @fetch_queue.first
        @fetch_queue.delete(url)
        page_links(url)
      end
      puts "Crawling finished, #{@processed.length} links found"
      @processed.to_a
    rescue *interrupt_errors
      puts 'Crawl interrupted.'
      @fetch_queue.to_a
    end

    private

    # Exception classes that count as a user interrupt. IRB::Abort only exists
    # when IRB is loaded; naming it unconditionally in a rescue clause raises
    # NameError whenever some other exception passes through that clause.
    def interrupt_errors
      errors = [Interrupt]
      errors << IRB::Abort if defined?(IRB::Abort)
      errors
    end

    # Fetches +get_url+, marks it as processed and queues each eligible link
    # found on the page that has not been processed yet.
    def page_links(get_url)
      puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
      link_elements = fetch_link_elements(get_url)
      @processed << get_url
      link_elements.each do |page_link|
        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
        next unless absolute_url
        resolved_url = resolve(absolute_url)
        @fetch_queue << resolved_url unless @processed.include?(resolved_url)
      end
    end

    # Returns the <a> elements of the page at +get_url+. A page that cannot be
    # fetched or parsed yields no links, so one bad page does not stop the
    # crawl; the failure is reported on stderr instead of being dropped.
    def fetch_link_elements(get_url)
      Request.get_page(get_url).css('a')
    rescue StandardError => e
      warn "Could not fetch #{get_url}: #{e.message}"
      []
    end

    # Resolves +url+ through Request.resolve_url when the :resolve option is
    # set; otherwise returns it unchanged.
    def resolve(url)
      @options[:resolve] ? Request.resolve_url(url) : url
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'url_resolver' # TODO: Allow users to use any resolver
2
+
3
module SiteMapper
  # HTTP helpers used by the crawler: fetching pages and resolving URLs.
  class Request
    INFO_LINK = 'https://rubygems.org/gems/site_mapper'
    USER_AGENT = "SiteMapper/#{VERSION} (+#{INFO_LINK})"

    class << self
      # Fetches +url+ and parses the response body as HTML.
      #
      # @param url [String]
      # @param document_type [Symbol] currently ignored; the body is always
      #   parsed with Nokogiri::HTML
      # @return [Nokogiri::HTML::Document]
      def get_page(url, document_type = :html)
        Nokogiri::HTML(get_response(url).body)
      end

      # Performs a GET request for +url+ with the gem's User-Agent header.
      #
      # @param url [String]
      # @param resolve [Boolean] when true, +url+ is first passed through
      #   resolve_url
      # @return [Net::HTTPResponse]
      def get_response(url, resolve = false)
        resolved_url = resolve ? resolve_url(url) : url
        uri = URI.parse(resolved_url)
        http = Net::HTTP.new(uri.host, uri.port)
        # Decide on TLS from the parsed scheme. Searching the whole URL for
        # "https://" also matches http URLs that carry an https URL in their
        # query string, and misses upper-case schemes.
        http.use_ssl = uri.is_a?(URI::HTTPS)

        request = Net::HTTP::Get.new(uri.request_uri)
        request['User-Agent'] = USER_AGENT
        http.request(request)
      end

      # Delegates to UrlResolver.resolve.
      #
      # @param url [String]
      # @return whatever UrlResolver.resolve returns for +url+
      def resolve_url(url)
        UrlResolver.resolve(url)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module SiteMapper
  # Gem version; interpolated into the User-Agent strings built by Request
  # and Crawler. Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,12 @@
1
require 'uri'
require 'net/http'

# The version file must be loaded before the classes below: Request and
# Crawler interpolate VERSION into constants while their class bodies are
# evaluated, so without it requiring this file raises NameError.
require 'site_mapper/version'
require 'site_mapper/request'
require 'site_mapper/crawler'
require 'site_mapper/crawl_url'

# Top-level namespace and public entry point of the gem.
module SiteMapper
  # Crawls the site at +source+ and returns the URLs collected by the crawler.
  #
  # @param source [String] URL to start crawling from
  # @return [Array<String>] result of Crawler.collect_urls
  def self.map(source)
    Crawler.collect_urls(source)
  end
end
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: site_mapper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jacob Burenstam
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: url_resolver
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '10.3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '10.3'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '3.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '3.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: yard
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: coveralls
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '0.7'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '0.7'
111
+ description: Find all links on domain.
112
+ email:
113
+ - burenstam@gmail.com
114
+ executables:
115
+ - site_mapper
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - bin/site_mapper
120
+ - lib/site_mapper.rb
121
+ - lib/site_mapper/crawl_url.rb
122
+ - lib/site_mapper/crawler.rb
123
+ - lib/site_mapper/request.rb
124
+ - lib/site_mapper/version.rb
125
+ homepage: https://github.com/buren/site_mapper
126
+ licenses:
127
+ - MIT
128
+ metadata: {}
129
+ post_install_message:
130
+ rdoc_options: []
131
+ require_paths:
132
+ - lib
133
+ required_ruby_version: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - '>='
136
+ - !ruby/object:Gem::Version
137
+ version: 1.9.3
138
+ required_rubygems_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ requirements: []
144
+ rubyforge_project:
145
+ rubygems_version: 2.2.2
146
+ signing_key:
147
+ specification_version: 4
148
+ summary: Find all links on domain
149
+ test_files: []
150
+ has_rdoc: