site_mapper 0.0.6 → 0.0.7

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6870976b0c732915cd3077c0be8bbfd6f92d7f45
-  data.tar.gz: 1f2949d91774c87dd2f156b562ffb9e532f924fe
+  metadata.gz: 472f6b099e7d4c2fe67862faf59b703c65e39a8c
+  data.tar.gz: ce4b315b256fdded26a12665f50d60dccf127ea3
 SHA512:
-  metadata.gz: d736d6c6d62c55e9f38417bb9061dfb19de907ef00d2e41cb31bd1d190d92fd16b319d02483673c333cfcc6cf6feed720906b2ba914957f13f602fc595ef2223
-  data.tar.gz: 69661165ac90ede2611f2d24ba43c58c1eed7b17c5405b23d00df0de023e4896210f75ab152e8d2311832ee4aa828d17c59887e0e943e00b8a0daa34d2d3b1ad
+  metadata.gz: 971616d2f3dd773e63be7d01259582099fd285538b52e04cbcc8055958d98fa57644d84e5546408401c390c6fc56ff70b9ecfc13b2c97bd15660a0a81dd98107
+  data.tar.gz: e27f9ae323e7a536696625071b30b9d046bbaced373e9ae798cd2083556c27bf228ff197a154a2291900b89fc4510264a4565ad1592a86a330f83989f2914a8f
data/lib/site_mapper/crawl_url.rb CHANGED
@@ -1,5 +1,5 @@
 module SiteMapper
-  # Crawl URL formatter
+  # Crawl URL formatter.
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
@@ -11,7 +11,7 @@ module SiteMapper
 
     # Given a link it constructs the absolute path,
     # if valid URL & URL has same domain as @resolved_base_url.
-    # @return [String] with absolute path to resource.
+    # @return [String] with absolute path to resource
    # @param [String, String] raw_url from link element and current page URL
     # @example Construct absolute URL for '/path', example.com
     #   cu = CrawlUrl.new('example.com')
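For reference, a minimal usage sketch of CrawlUrl based on the docstring above; the exact return value is an assumption and is not shown in this diff:

  require 'site_mapper'

  # Resolve a relative href against the crawled domain (hypothetical values).
  cu = SiteMapper::CrawlUrl.new('example.com')
  cu.absolute_url_from('/path', 'example.com') # assumed to return an absolute URL such as 'http://example.com/path'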
data/lib/site_mapper/crawler.rb CHANGED
@@ -8,12 +8,13 @@ module SiteMapper
       'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
     }
 
-    def initialize(url, resolve = false)
+    def initialize(url, resolve: false)
       base_url = Request.resolve_url(url)
       @options = { resolve: resolve }
       @crawl_url = CrawlUrl.new(base_url)
       @fetch_queue = CrawlQueue.new
       @processed = Set.new
+      @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
     end
 
     # @see #collect_urls
@@ -21,7 +22,7 @@ module SiteMapper
       new(base_url).collect_urls { |url| yield(url) }
     end
 
-    # Collects all links on domain for domain
+    # Collects all links on domain for domain.
     # @return [Array] with links.
     # @example URLs for example.com
     #   crawler = Crawler.new('example.com')
@@ -52,14 +53,15 @@ module SiteMapper
       link_elements = Request.get_page(get_url).css('a') rescue []
       @processed << get_url
       link_elements.each do |page_link|
-        absolute_url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
-        if absolute_url
-          url = resolve(absolute_url)
-          @fetch_queue << url unless @processed.include?(url)
-        end
+        url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
+        @fetch_queue << url if url && eligible_for_queue?(resolve(url))
       end
     end
 
+    def eligible_for_queue?(url)
+      @robots.allowed?(url) && !@processed.include?(url)
+    end
+
     def resolve(url)
       @options[:resolve] ? Request.resolve_url(url) : url
     end
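The crawler's resolve option is now a keyword argument, and robots.txt rules are consulted before a URL is queued. A hedged usage sketch, assuming collect_urls yields each discovered URL the way the class-method wrapper above does:

  require 'site_mapper'

  # Crawl a domain; resolve: true re-resolves every queued URL via Request.resolve_url.
  crawler = SiteMapper::Crawler.new('example.com', resolve: false)
  crawler.collect_urls { |url| puts url }

  # Equivalent one-shot form shown in the diff:
  SiteMapper::Crawler.collect_urls('example.com') { |url| puts url }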
data/lib/site_mapper/request.rb CHANGED
@@ -1,13 +1,16 @@
 require 'url_resolver' # TODO: Allow users to use any resolver
 
 module SiteMapper
+  # Get webpage wrapper.
   class Request
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
     class << self
-      def get_page(url, document_type = :html)
-        Nokogiri::HTML(Request.get_response(url).body)
+      # Given an URL get it then parse it with Nokogiri::HTML.
+      # @return [Nokogiri::HTML] a nokogiri HTML object
+      def get_page(url)
+        Nokogiri::HTML(Request.get_response_body(url))
       end
 
       def get_response(url, resolve = false)
@@ -21,12 +24,29 @@ module SiteMapper
         http.request(request)
       end
 
+      # Get response body, rescues with nil if an exception is raised
+      # @see #get_response
+      def get_response_body(*args)
+        get_response(*args).body rescue nil
+      end
+
+      # Resolve an URL string and follows redirects
+      # if the URL can't be resolved the original URL is returned.
+      # @return [String] a URL string that potentially is a redirected URL
+      # @example Resolve google.com
+      #   resolve_url('google.com')
+      #   # => 'https://www.google.com'
       def resolve_url(url, with_query: true)
         resolved = UrlResolver.resolve(url)
         resolved = remove_query(resolved) unless with_query
         resolved
       end
 
+      # Removes query string from URL string.
+      # @return [String] an URL string without query
+      # @example Removes query string
+      #   remove_query('example.com/path?q=keyword')
+      #   # => 'example.com/path'
       def remove_query(url)
         index = url.index('?')
         index.nil? ? url : url[0...index]
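Taken together, a short sketch of the Request helpers added or changed here, mirroring the docstring examples above (the return values are the ones the docstrings claim):

  require 'site_mapper'

  SiteMapper::Request.resolve_url('google.com')                  # => 'https://www.google.com'
  SiteMapper::Request.remove_query('example.com/path?q=keyword') # => 'example.com/path'
  SiteMapper::Request.get_response_body('http://example.com')    # response body, or nil if the request raises
  SiteMapper::Request.get_page('http://example.com')             # Nokogiri::HTML document built from that body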
data/lib/site_mapper/robots.rb ADDED
@@ -0,0 +1,144 @@
+# Based on: https://rubygems.org/gems/robots, v0.10.1
+module SiteMapper
+  # Provided a base URL it checks whether a given URL is
+  # allowed to be crawled according to /robots.txt
+  # @see https://rubygems.org/gems/robots
+  class Robots
+    # Parses robots.txt
+    class ParsedRobots
+      def initialize(body, user_agent)
+        @other = {}
+        @disallows = {}
+        @allows = {}
+        @delays = {}
+        parse(body)
+      end
+
+      # Parse robots.txt body.
+      def parse(body)
+        agent = /.*/
+        body = body || "User-agent: *\nAllow: /\n"
+        body = body.downcase
+        body.each_line.each do |line|
+          next if line =~ /^\s*(#.*|$)/
+          arr = line.split(':')
+          key = arr.shift
+          value = arr.join(':').strip
+          value.strip!
+          case key
+          when 'user-agent'
+            agent = to_regex(value)
+          when 'allow'
+            @allows[agent] ||= []
+            @allows[agent] << to_regex(value)
+          when 'disallow'
+            @disallows[agent] ||= []
+            @disallows[agent] << to_regex(value)
+          when 'crawl-delay'
+            @delays[agent] = value.to_i
+          else
+            @other[key] ||= []
+            @other[key] << value
+          end
+        end
+        @parsed = true
+      end
+
+      # @return [Boolean] true if uri is allowed to be crawled
+      # @example Check if http://www.google.com/googlesites is allowed to be crawled
+      #   uri = URI.parse('http://www.google.com/googlesites')
+      #   robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+      def allowed?(uri, user_agent)
+        return true unless @parsed
+        allowed = true
+        path = uri.request_uri
+
+        @disallows.each do |key, value|
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = false
+              end
+            end
+          end
+        end
+
+        @allows.each do |key, value|
+          unless allowed
+            if user_agent =~ key
+              value.each do |rule|
+                if path =~ rule
+                  allowed = true
+                end
+              end
+            end
+          end
+        end
+        allowed
+      end
+
+      # @return [Hash] key/value pairs from robots.txt
+      def other_values
+        @other
+      end
+
+      protected
+
+      def to_regex(pattern)
+        return /should-not-match-anything-123456789/ if pattern.strip.empty?
+        pattern = Regexp.escape(pattern)
+        pattern.gsub!(Regexp.escape('*'), '.*')
+        Regexp.compile("^#{pattern}")
+      end
+    end
+
+    def initialize(url, user_agent)
+      @user_agent = user_agent
+      @parsed = {}
+      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
+    end
+
+    # @return [Boolean] true if uri is allowed to be crawled
+    # @example Check if http://www.google.com/googlesites is allowed to be crawled
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.allowed?('http://www.google.com/googlesites') # => false (as of 2014-10-22)
+    def allowed?(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].allowed?(uri, @user_agent)
+    rescue
+      true
+    end
+
+    # @return [Array] array of sitemaps defined in robots.txt
+    # @example Get sitemap for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.sitemaps
+    def sitemaps
+      uri = to_uri(uri)
+      values = other_values(uri.host)
+      values['sitemap'] or []
+    rescue
+      []
+    end
+
+    # @return [Hash] key/value pairs from robots.txt
+    # @example Get other values for google.com
+    #   robots = Robots.new('google.com', 'SiteMapper')
+    #   robots.other_values
+    def other_values(uri)
+      uri = to_uri(uri)
+      host = uri.host
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].other_values
+    end
+
+    private
+
+    def to_uri(uri)
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      uri
+    end
+  end
+end
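A standalone usage sketch of the new Robots class, following its docstring examples (Crawler wires it up automatically with the gem's User-Agent; the result values below are the docstrings' claims):

  require 'site_mapper'

  robots = SiteMapper::Robots.new('google.com', 'SiteMapper')
  robots.allowed?('http://www.google.com/googlesites') # => false (as of 2014-10-22, per the docstring)
  robots.other_values('http://www.google.com')         # => Hash of other robots.txt key/value pairs
  robots.sitemaps                                      # => sitemap entries from robots.txt, [] on error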
data/lib/site_mapper/version.rb CHANGED
@@ -1,3 +1,3 @@
 module SiteMapper
-  VERSION = '0.0.6'
+  VERSION = '0.0.7'
 end
data/lib/site_mapper.rb CHANGED
@@ -3,6 +3,7 @@ require 'net/http'
 
 require 'site_mapper/version'
 require 'site_mapper/request'
+require 'site_mapper/robots'
 require 'site_mapper/crawler'
 require 'site_mapper/crawl_url'
 
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Jacob Burenstam
@@ -108,7 +108,7 @@ dependencies:
     - - ~>
       - !ruby/object:Gem::Version
         version: '0.7'
-description: Find all links on domain to domain.
+description: Map all links on a given site.
 email:
 - burenstam@gmail.com
 executables:
@@ -117,11 +117,12 @@ extensions: []
 extra_rdoc_files: []
 files:
 - bin/site_mapper
-- lib/site_mapper.rb
 - lib/site_mapper/crawl_url.rb
 - lib/site_mapper/crawler.rb
 - lib/site_mapper/request.rb
+- lib/site_mapper/robots.rb
 - lib/site_mapper/version.rb
+- lib/site_mapper.rb
 homepage: https://github.com/buren/site_mapper
 licenses:
 - MIT
@@ -142,9 +143,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.0.0
 signing_key:
 specification_version: 4
-summary: Find all links on domain to domain
+summary: Map all links on a given site.
 test_files: []
 has_rdoc: