site_mapper 0.0.8 → 0.0.9

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: b52f224f6a1dbfdc6207741e9efe92fd6e4adbc5
- data.tar.gz: dc70e4ba139a385164aa122e17e61e06bf469614
+ metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
+ data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
  SHA512:
- metadata.gz: 6546905269282bb362050fb41c679a42aa7ea91f849425c31b0493f1f6c1727e043e7e147ab51042cafe18afd9e092b91293247c5f3de6f57d8fb13d0a58ca06
- data.tar.gz: f1821f346544348ac708ed2582538d7c56a7151058a0880a8cb6cf14a93b23481c458e9a512a630e7bf25b0bd768739511245f3eb40b8d8a90504e8cb4f378d1
+ metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
+ data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
@@ -9,12 +9,17 @@ require 'site_mapper/crawl_url'
 
  # Find all links on domain to domain
  module SiteMapper
- # Returns all links found on domain to domain.
+ # SiteMapper info link
+ INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+ # SiteMapper User-Agent
+ USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
+
+ # Map all links on a given site.
  # @return [Array] with links.
  # @param [String] link to domain
  # @example Collect all URLs from example.com
  # SiteMapper.map('example.com')
- def self.map(source)
- Crawler.collect_urls(source) { |url| yield(url) if block_given? }
+ def self.map(link)
+ Crawler.collect_urls(link) { |url| yield(url) if block_given? }
  end
  end
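The public entry point keeps its block-yielding behaviour; only the parameter name changes from source to link, and the info link and User-Agent constants now live at the module level. A minimal usage sketch based on the @example above (illustrative, not part of the diff):

    require 'site_mapper'

    # Per the documented @return, map returns an Array of links and
    # yields each URL to the block as it is found.
    urls = SiteMapper.map('example.com') do |url|
      puts url
    end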
@@ -3,6 +3,7 @@ module SiteMapper
  class CrawlUrl
  attr_reader :resolved_base_url, :base_hostname
 
+ # @param [String] base_url
  def initialize(base_url)
  @resolved_base_url = Request.resolve_url(base_url, with_query: false)
  @base_hostname = URI.parse(@resolved_base_url).hostname
@@ -11,8 +12,9 @@ module SiteMapper
 
  # Given a link it constructs the absolute path,
  # if valid URL & URL has same domain as @resolved_base_url.
+ # @param [String] raw_url url found on page
+ # @param [String] get_url current page url
  # @return [String] with absolute path to resource
- # @param [String, String] raw_url from link element and current page URL
  # @example Construct absolute URL for '/path', example.com
  # cu = CrawlUrl.new('example.com')
  # cu.absolute_url_from('/path', 'example.com/some/path')
@@ -2,19 +2,17 @@ require 'set'
  require 'nokogiri'
 
  module SiteMapper
+ # Crawls a given site.
  class Crawler
- CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
- HEADERS_HASH = {
- 'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
- }
-
+ # @param [String] url base url for crawler
+ # @param [Hash] resolve (optional false by default)
  def initialize(url, resolve: false)
- base_url = Request.resolve_url(url)
+ @base_url = Request.resolve_url(url)
  @options = { resolve: resolve }
- @crawl_url = CrawlUrl.new(base_url)
+ @crawl_url = CrawlUrl.new(@base_url)
  @fetch_queue = CrawlQueue.new
  @processed = Set.new
- @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
+ @robots = nil
  end
 
  # @see #collect_urls
@@ -59,7 +57,14 @@ module SiteMapper
  end
 
  def eligible_for_queue?(url)
- @robots.allowed?(url) && !@processed.include?(url)
+ robots.allowed?(url) && !@processed.include?(url)
+ end
+
+ def robots
+ return @robots unless @robots.nil?
+ robots_body = Request.get_response_body("#{@base_url}/robots.txt")
+ @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
+ @robots
  end
 
  def resolve(url)
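Instead of building the Robots object eagerly in the constructor, the crawler now stores @robots = nil and the new private robots reader fetches and parses /robots.txt the first time a URL is checked. A short sketch of the same memoization pattern in isolation (the LazyRobots class name is illustrative, not part of the gem):

    require 'site_mapper'
    require 'uri'

    class LazyRobots
      def initialize(base_url)
        @base_url = base_url
        @robots = nil # nothing fetched yet
      end

      # Fetch and parse robots.txt on the first call, reuse the cached object afterwards.
      def robots
        @robots ||= begin
          body = SiteMapper::Request.get_response_body("#{@base_url}/robots.txt")
          SiteMapper::Robots.new(body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
        end
      end
    end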
@@ -67,12 +72,18 @@ module SiteMapper
  end
  end
 
+ # Queue of urls to be crawled.
  class CrawlQueue
+ # @return [Set] that exends EnumerablePop module
  def self.new
  Set.new.extend(EnumerablePop)
  end
 
+ # Add pop method when added to class.
+ # The class that extends this module need to implement #first and #delete.
  module EnumerablePop
+ # Pop first element from list.
+ # @return [Object] the first object in the list or nil
  def pop
  first_element = first
  delete(first_element)
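CrawlQueue.new returns a plain Set extended with EnumerablePop, whose pop removes and returns the first element (or nil once the set is empty, per the @return tag). A small behavioural sketch, assuming CrawlQueue is reachable from the current namespace:

    queue = CrawlQueue.new           # a Set extended with EnumerablePop
    queue << 'http://example.com/'
    queue << 'http://example.com/about'

    queue.pop  # removes and returns 'http://example.com/'
    queue.pop  # removes and returns 'http://example.com/about'
    queue.pop  # nil once the queue is empty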
@@ -3,16 +3,28 @@ require 'url_resolver' # TODO: Allow users to use any resolver
  module SiteMapper
  # Get webpage wrapper.
  class Request
+ # Request info link
  INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+ # Request User-Agent
  USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
  class << self
  # Given an URL get it then parse it with Nokogiri::HTML.
+ # @param [String] url
  # @return [Nokogiri::HTML] a nokogiri HTML object
  def get_page(url)
  Nokogiri::HTML(Request.get_response_body(url))
  end
 
+ # Given an URL get the response.
+ # @param [String] url
+ # @param [Boolean] resolve (optional and false by default)
+ # @return [Net::HTTPOK] if response is successfull, raises error otherwise
+ # @example get example.com and resolve the URL
+ # Request.get_response('example.com', true)
+ # @example get example.com and do *not* resolve the URL
+ # Request.get_response('http://example.com')
+ # Request.get_response('http://example.com', false)
  def get_response(url, resolve = false)
  resolved_url = resolve ? resolve_url(url) : url
  uri = URI.parse(resolved_url)
@@ -20,18 +32,20 @@ module SiteMapper
  http.use_ssl = true if resolved_url.include?('https://')
 
  request = Net::HTTP::Get.new(uri.request_uri)
- request['User-Agent'] = USER_AGENT
+ request['User-Agent'] = SiteMapper::USER_AGENT
  http.request(request)
  end
 
- # Get response body, rescues with nil if an exception is raised
+ # Get response body, rescues with nil if an exception is raised.
  # @see #get_response
  def get_response_body(*args)
  get_response(*args).body rescue nil
  end
 
- # Resolve an URL string and follows redirects
+ # Resolve an URL string and follows redirects.
  # if the URL can't be resolved the original URL is returned.
+ # @param [String] url
+ # @param [Boolean] with_query (optional and true by default)
  # @return [String] a URL string that potentially is a redirected URL
  # @example Resolve google.com
  # resolve_url('google.com')
@@ -43,6 +57,7 @@ module SiteMapper
  end
 
  # Removes query string from URL string.
+ # @param [String] url
  # @return [String] an URL string without query
  # @example Removes query string
  # remove_query('example.com/path?q=keyword')
@@ -1,7 +1,7 @@
  # Based on: https://rubygems.org/gems/robots, v0.10.1
  module SiteMapper
  # Provided a base URL it checks whether a given URL is
- # allowed to be crawled according to /robots.txt
+ # allowed to be crawled according to /robots.txt.
  # @see https://rubygems.org/gems/robots
  class Robots
  # Parses robots.txt
@@ -11,10 +11,12 @@ module SiteMapper
  @disallows = {}
  @allows = {}
  @delays = {}
+ @sitemaps = []
  parse(body)
  end
 
  # Parse robots.txt body.
+ # @param [String] body the webpage body HTML
  def parse(body)
  agent = /.*/
  body = body || "User-agent: *\nAllow: /\n"
@@ -36,6 +38,8 @@ module SiteMapper
  @disallows[agent] << to_regex(value)
  when 'crawl-delay'
  @delays[agent] = value.to_i
+ when 'sitemap'
+ @sitemaps << value
  else
  @other[key] ||= []
  @other[key] << value
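The new when 'sitemap' branch collects Sitemap: directives while the robots.txt body is parsed; the readers added further down (ParsedRobots#sitemaps, plus the Robots#sitemaps forwarding in a later hunk) expose them to callers. A hedged sketch of that flow; the sample robots.txt is invented and the exact return values are not verified against the released gem:

    require 'site_mapper'

    body = "User-agent: *\nDisallow: /private\nSitemap: http://example.com/sitemap.xml\n"

    # New constructor signature: pre-fetched robots.txt body, hostname, User-Agent string.
    robots = SiteMapper::Robots.new(body, 'example.com', SiteMapper::USER_AGENT)

    robots.sitemaps      # Sitemap: entries collected during parsing
    robots.other_values  # any unrecognised key/value pairs from robots.txt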
@@ -43,15 +47,20 @@ module SiteMapper
  end
  @parsed = true
  end
-
+
+ # @param [URI] uri to be checked
+ # @param [String] user_agent to be checked
  # @return [Boolean] true if uri is allowed to be crawled
  # @example Check if http://www.google.com/googlesites is allowed to be crawled
  # uri = URI.parse('http://www.google.com/googlesites')
- # robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+ # robots.allowed?(uri, 'SiteMapper')
+ # # => false (as of 2014-10-22)
  def allowed?(uri, user_agent)
  return true unless @parsed
  allowed = true
  path = uri.request_uri
+
+ user_agent.downcase!
 
  @disallows.each do |key, value|
  if user_agent =~ key
@@ -76,11 +85,24 @@ module SiteMapper
  end
  allowed
  end
+
+ # @param [String] user_agent
+ # @return [Integer] crawl delay for user_agent
+ def crawl_delay(user_agent)
+ agent = user_agent.dup
+ agent = to_regex(agent.downcase) if user_agent.is_a?(String)
+ @delays[agent]
+ end
 
  # @return [Hash] key/value pairs from robots.txt
  def other_values
  @other
  end
+
+ # @return [Array] returns sitemaps defined in robots.txt
+ def sitemaps
+ @sitemaps
+ end
 
  protected
 
@@ -92,12 +114,15 @@ module SiteMapper
  end
  end
 
- def initialize(url, user_agent)
+ # @param [String] url to fetch /robots.txt from
+ def initialize(robots_txt, hostname, user_agent)
+ @robots_txt = robots_txt
+ @hostname = hostname
  @user_agent = user_agent
  @parsed = {}
- @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
  end
 
+ # @param [String, URI] uri String or URI to check
  # @return [Boolean] true if uri is allowed to be crawled
  # @example Check if http://www.google.com/googlesites is allowed to be crawled
  # robots = Robots.new('google.com', 'SiteMapper')
@@ -116,22 +141,24 @@ module SiteMapper
  # robots = Robots.new('google.com', 'SiteMapper')
  # robots.sitemaps
  def sitemaps
- uri = to_uri(uri)
- values = other_values(uri.host)
- values['sitemap'] or []
+ host = @hostname
+ @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+ @parsed[host].sitemaps
  rescue
  []
  end
 
+ # @param [String, URI] uri String or URI get other_values from
  # @return [Hash] key/value pairs from robots.txt
  # @example Get other values for google.com
  # robots = Robots.new('google.com', 'SiteMapper')
  # robots.other_values
- def other_values(uri)
- uri = to_uri(uri)
- host = uri.host
+ def other_values
+ host = @hostname
  @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
  @parsed[host].other_values
+ rescue
+ {}
  end
 
  private
@@ -1,3 +1,4 @@
  module SiteMapper
- VERSION = '0.0.8'
+ # Gem version
+ VERSION = '0.0.9'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: site_mapper
  version: !ruby/object:Gem::Version
- version: 0.0.8
+ version: 0.0.9
  platform: ruby
  authors:
  - Jacob Burenstam
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-10-22 00:00:00.000000000 Z
+ date: 2014-10-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri