site_mapper 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
- data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
+ metadata.gz: 472f6b099e7d4c2fe67862faf59b703c65e39a8c
+ data.tar.gz: ce4b315b256fdded26a12665f50d60dccf127ea3
  SHA512:
- metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
- data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
+ metadata.gz: 971616d2f3dd773e63be7d01259582099fd285538b52e04cbcc8055958d98fa57644d84e5546408401c390c6fc56ff70b9ecfc13b2c97bd15660a0a81dd98107
+ data.tar.gz: e27f9ae323e7a536696625071b30b9d046bbaced373e9ae798cd2083556c27bf228ff197a154a2291900b89fc4510264a4565ad1592a86a330f83989f2914a8f
data/lib/site_mapper/crawl_url.rb CHANGED
@@ -1,5 +1,5 @@
  module SiteMapper
- # Crawl URL formatter
+ # Crawl URL formatter.
  class CrawlUrl
  attr_reader :resolved_base_url, :base_hostname
 
@@ -11,7 +11,7 @@ module SiteMapper
 
  # Given a link it constructs the absolute path,
  # if valid URL & URL has same domain as @resolved_base_url.
- # @return [String] with absolute path to resource.
+ # @return [String] with absolute path to resource
  # @param [String, String] raw_url from link element and current page URL
  # @example Construct absolute URL for '/path', example.com
  #   cu = CrawlUrl.new('example.com')
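
The CrawlUrl hunks only touch comment punctuation, and the @example in the diff window is cut off after the constructor call. A minimal usage sketch based on the doc comments above and the caller in crawler.rb; the page URL passed as the second argument and the falsy return for off-domain links are assumptions drawn from the @param description and the `if url &&` guard, not from this diff alone:

    require 'site_mapper'

    cu = SiteMapper::CrawlUrl.new('example.com')
    # Second argument is the current page URL (per the @param description);
    # invalid or off-domain links are expected to come back falsy.
    cu.absolute_url_from('/path', 'http://example.com/some/page')
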
data/lib/site_mapper/crawler.rb CHANGED
@@ -8,12 +8,13 @@ module SiteMapper
  'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
  }
 
- def initialize(url, resolve = false)
+ def initialize(url, resolve: false)
  base_url = Request.resolve_url(url)
  @options = { resolve: resolve }
  @crawl_url = CrawlUrl.new(base_url)
  @fetch_queue = CrawlQueue.new
  @processed = Set.new
+ @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
  end
 
  # @see #collect_urls
@@ -21,7 +22,7 @@ module SiteMapper
  new(base_url).collect_urls { |url| yield(url) }
  end
 
- # Collects all links on domain for domain
+ # Collects all links on domain for domain.
  # @return [Array] with links.
  # @example URLs for example.com
  #   crawler = Crawler.new('example.com')
@@ -52,14 +53,15 @@ module SiteMapper
  link_elements = Request.get_page(get_url).css('a') rescue []
  @processed << get_url
  link_elements.each do |page_link|
- absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
- if absolute_url
- url = resolve(absolute_url)
- @fetch_queue << url unless @processed.include?(url)
- end
+ url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+ @fetch_queue << url if url && eligible_for_queue?(resolve(url))
  end
  end
 
+ def eligible_for_queue?(url)
+ @robots.allowed?(url) && !@processed.include?(url)
+ end
+
  def resolve(url)
  @options[:resolve] ? Request.resolve_url(url) : url
  end
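
Taken together, the Crawler changes turn the resolve flag into a keyword argument, build a Robots instance at construction time, and gate every candidate link through the new eligible_for_queue? check (robots.txt permission plus the already-processed set). A minimal sketch of how a 0.0.7 caller would use the updated signature, assuming the constant is reached as SiteMapper::Crawler per the module nesting shown in the hunk headers:

    require 'site_mapper'

    # 0.0.6 took a positional flag, e.g. Crawler.new('example.com', true);
    # 0.0.7 expects the keyword form instead.
    crawler = SiteMapper::Crawler.new('example.com', resolve: true)

    # collect_urls yields each discovered URL; links are queued only when
    # eligible_for_queue? passes, i.e. robots.txt allows them and they have
    # not already been processed.
    crawler.collect_urls { |url| puts url }
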
data/lib/site_mapper/request.rb CHANGED
@@ -1,13 +1,16 @@
  require 'url_resolver' # TODO: Allow users to use any resolver
 
  module SiteMapper
+ # Get webpage wrapper.
  class Request
  INFO_LINK = 'https://rubygems.org/gems/site_mapper'
  USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
  class << self
- def get_page(url, document_type = :html)
- Nokogiri::HTML(Request.get_response(url).body)
+ # Given an URL get it then parse it with Nokogiri::HTML.
+ # @return [Nokogiri::HTML] a nokogiri HTML object
+ def get_page(url)
+ Nokogiri::HTML(Request.get_response_body(url))
  end
 
  def get_response(url, resolve = false)
@@ -21,12 +24,29 @@ module SiteMapper
  http.request(request)
  end
 
+ # Get response body, rescues with nil if an exception is raised
+ # @see #get_response
+ def get_response_body(*args)
+ get_response(*args).body rescue nil
+ end
+
+ # Resolve an URL string and follows redirects
+ # if the URL can't be resolved the original URL is returned.
+ # @return [String] a URL string that potentially is a redirected URL
+ # @example Resolve google.com
+ #   resolve_url('google.com')
+ #   # => 'https://www.google.com'
  def resolve_url(url, with_query: true)
  resolved = UrlResolver.resolve(url)
  resolved = remove_query(resolved) unless with_query
  resolved
  end
 
+ # Removes query string from URL string.
+ # @return [String] an URL string without query
+ # @example Removes query string
+ #   remove_query('example.com/path?q=keyword')
+ #   # => 'example.com/path'
  def remove_query(url)
  index = url.index('?')
  index.nil? ? url : url[0...index]
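
Request now exposes get_response_body, which rescues to nil instead of raising, and get_page parses that body with Nokogiri; resolve_url and remove_query gain documentation. A hedged usage sketch of the class methods shown in these hunks; the return values in the comments are illustrative, taken from the @example tags, not guaranteed output:

    require 'site_mapper'

    # Follows redirects; if the URL can't be resolved the original is returned.
    SiteMapper::Request.resolve_url('google.com')   # e.g. 'https://www.google.com'

    # remove_query strips the query string; with_query: false does the same via resolve_url.
    SiteMapper::Request.remove_query('example.com/path?q=keyword')  # => 'example.com/path'

    # get_response_body swallows exceptions and returns nil on failure;
    # get_page wraps the body in Nokogiri::HTML.
    body = SiteMapper::Request.get_response_body('http://example.com')
    page = SiteMapper::Request.get_page('http://example.com')
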
data/lib/site_mapper/robots.rb ADDED
@@ -0,0 +1,144 @@
+ # Based on: https://rubygems.org/gems/robots, v0.10.1
+ module SiteMapper
+ # Provided a base URL it checks whether a given URL is
+ # allowed to be crawled according to /robots.txt
+ # @see https://rubygems.org/gems/robots
+ class Robots
+ # Parses robots.txt
+ class ParsedRobots
+ def initialize(body, user_agent)
+ @other = {}
+ @disallows = {}
+ @allows = {}
+ @delays = {}
+ parse(body)
+ end
+
+ # Parse robots.txt body.
+ def parse(body)
+ agent = /.*/
+ body = body || "User-agent: *\nAllow: /\n"
+ body = body.downcase
+ body.each_line.each do |line|
+ next if line =~ /^\s*(#.*|$)/
+ arr = line.split(':')
+ key = arr.shift
+ value = arr.join(':').strip
+ value.strip!
+ case key
+ when 'user-agent'
+ agent = to_regex(value)
+ when 'allow'
+ @allows[agent] ||= []
+ @allows[agent] << to_regex(value)
+ when 'disallow'
+ @disallows[agent] ||= []
+ @disallows[agent] << to_regex(value)
+ when 'crawl-delay'
+ @delays[agent] = value.to_i
+ else
+ @other[key] ||= []
+ @other[key] << value
+ end
+ end
+ @parsed = true
+ end
+
+ # @return [Boolean] true if uri is allowed to be crawled
+ # @example Check if http://www.google.com/googlesites is allowed to be crawled
+ #   uri = URI.parse('http://www.google.com/googlesites')
+ #   robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+ def allowed?(uri, user_agent)
+ return true unless @parsed
+ allowed = true
+ path = uri.request_uri
+
+ @disallows.each do |key, value|
+ if user_agent =~ key
+ value.each do |rule|
+ if path =~ rule
+ allowed = false
+ end
+ end
+ end
+ end
+
+ @allows.each do |key, value|
+ unless allowed
+ if user_agent =~ key
+ value.each do |rule|
+ if path =~ rule
+ allowed = true
+ end
+ end
+ end
+ end
+ end
+ allowed
+ end
+
+ # @return [Hash] key/value pairs from robots.txt
+ def other_values
+ @other
+ end
+
+ protected
+
+ def to_regex(pattern)
+ return /should-not-match-anything-123456789/ if pattern.strip.empty?
+ pattern = Regexp.escape(pattern)
+ pattern.gsub!(Regexp.escape('*'), '.*')
+ Regexp.compile("^#{pattern}")
+ end
+ end
+
+ def initialize(url, user_agent)
+ @user_agent = user_agent
+ @parsed = {}
+ @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
+ end
+
+ # @return [Boolean] true if uri is allowed to be crawled
+ # @example Check if http://www.google.com/googlesites is allowed to be crawled
+ #   robots = Robots.new('google.com', 'SiteMapper')
+ #   robots.allowed?('http://www.google.com/googlesites') # => false (as of 2014-10-22)
+ def allowed?(uri)
+ uri = to_uri(uri)
+ host = uri.host
+ @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+ @parsed[host].allowed?(uri, @user_agent)
+ rescue
+ true
+ end
+
+ # @return [Array] array of sitemaps defined in robots.txt
+ # @example Get sitemap for google.com
+ #   robots = Robots.new('google.com', 'SiteMapper')
+ #   robots.sitemaps
+ def sitemaps
+ uri = to_uri(uri)
+ values = other_values(uri.host)
+ values['sitemap'] or []
+ rescue
+ []
+ end
+
+ # @return [Hash] key/value pairs from robots.txt
+ # @example Get other values for google.com
+ #   robots = Robots.new('google.com', 'SiteMapper')
+ #   robots.other_values
+ def other_values(uri)
+ uri = to_uri(uri)
+ host = uri.host
+ @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+ @parsed[host].other_values
+ end
+
+ private
+
+ def to_uri(uri)
+ uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+ uri
+ end
+ end
+ end
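
The new Robots class is a trimmed-down port of the robots gem (v0.10.1): ParsedRobots turns robots.txt into allow/disallow/crawl-delay rules keyed by user-agent regexes, while the outer class fetches /robots.txt once via Request.get_response_body and answers allowed? per host, falling back to true when fetching or parsing fails. A short usage sketch based on the @example comments above; the host and path are illustrative:

    require 'site_mapper'

    # robots.txt is fetched from "#{url}/robots.txt" at construction time.
    robots = SiteMapper::Robots.new('http://example.com', 'SiteMapper')

    # Accepts a URI or a URL string; rescues to true if anything goes wrong.
    robots.allowed?('http://example.com/some/path')

    # Sitemap entries and the remaining robots.txt directives:
    robots.sitemaps                            # => Array (empty when unavailable)
    robots.other_values('http://example.com')  # => Hash of other key/value pairs
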
data/lib/site_mapper/version.rb CHANGED
@@ -1,3 +1,3 @@
  module SiteMapper
- VERSION = '0.0.6'
+ VERSION = '0.0.7'
  end
data/lib/site_mapper.rb CHANGED
@@ -3,6 +3,7 @@ require 'net/http'
 
  require 'site_mapper/version'
  require 'site_mapper/request'
+ require 'site_mapper/robots'
  require 'site_mapper/crawler'
  require 'site_mapper/crawl_url'
 
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: site_mapper
  version: !ruby/object:Gem::Version
- version: 0.0.6
+ version: 0.0.7
  platform: ruby
  authors:
  - Jacob Burenstam
@@ -108,7 +108,7 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: '0.7'
- description: Find all links on domain to domain.
+ description: Map all links on a given site.
  email:
  - burenstam@gmail.com
  executables:
@@ -117,11 +117,12 @@ extensions: []
  extra_rdoc_files: []
  files:
  - bin/site_mapper
- - lib/site_mapper.rb
  - lib/site_mapper/crawl_url.rb
  - lib/site_mapper/crawler.rb
  - lib/site_mapper/request.rb
+ - lib/site_mapper/robots.rb
  - lib/site_mapper/version.rb
+ - lib/site_mapper.rb
  homepage: https://github.com/buren/site_mapper
  licenses:
  - MIT
@@ -142,9 +143,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.0.0
  signing_key:
  specification_version: 4
- summary: Find all links on domain to domain
+ summary: Map all links on a given site.
  test_files: []
  has_rdoc: