site_mapper 0.0.8 → 0.0.9

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b52f224f6a1dbfdc6207741e9efe92fd6e4adbc5
-  data.tar.gz: dc70e4ba139a385164aa122e17e61e06bf469614
+  metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
+  data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
 SHA512:
-  metadata.gz: 6546905269282bb362050fb41c679a42aa7ea91f849425c31b0493f1f6c1727e043e7e147ab51042cafe18afd9e092b91293247c5f3de6f57d8fb13d0a58ca06
-  data.tar.gz: f1821f346544348ac708ed2582538d7c56a7151058a0880a8cb6cf14a93b23481c458e9a512a630e7bf25b0bd768739511245f3eb40b8d8a90504e8cb4f378d1
+  metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
+  data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
@@ -9,12 +9,17 @@ require 'site_mapper/crawl_url'
 
 # Find all links on domain to domain
 module SiteMapper
-  # Returns all links found on domain to domain.
+  # SiteMapper info link
+  INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+  # SiteMapper User-Agent
+  USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
+
+  # Map all links on a given site.
   # @return [Array] with links.
   # @param [String] link to domain
   # @example Collect all URLs from example.com
   #    SiteMapper.map('example.com')
-  def self.map(source)
-    Crawler.collect_urls(source) { |url| yield(url) if block_given? }
+  def self.map(link)
+    Crawler.collect_urls(link) { |url| yield(url) if block_given? }
   end
 end
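
For reference, the @example above corresponds to usage like the following sketch (assumed call pattern based on the signature and block handling shown in this hunk; example.com is a placeholder):

    require 'site_mapper'

    # Stream URLs as they are found; without a block the collected
    # links are returned as an Array.
    SiteMapper.map('example.com') do |url|
      puts url
    end
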
@@ -3,6 +3,7 @@ module SiteMapper
   class CrawlUrl
     attr_reader :resolved_base_url, :base_hostname
 
+    # @param [String] base_url
     def initialize(base_url)
      @resolved_base_url = Request.resolve_url(base_url, with_query: false)
      @base_hostname = URI.parse(@resolved_base_url).hostname
@@ -11,8 +12,9 @@ module SiteMapper
 
    # Given a link it constructs the absolute path,
    # if valid URL & URL has same domain as @resolved_base_url.
+    # @param [String] raw_url url found on page
+    # @param [String] get_url current page url
    # @return [String] with absolute path to resource
-    # @param [String, String] raw_url from link element and current page URL
    # @example Construct absolute URL for '/path', example.com
    #    cu = CrawlUrl.new('example.com')
    #    cu.absolute_url_from('/path', 'example.com/some/path')
@@ -2,19 +2,17 @@ require 'set'
 require 'nokogiri'
 
 module SiteMapper
+  # Crawls a given site.
   class Crawler
-    CRAWLER_INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'
-    HEADERS_HASH = {
-      'User-Agent' => "SiteMapper/#{SiteMapper::VERSION} (+#{CRAWLER_INFO_LINK})"
-    }
-
+    # @param [String] url base url for crawler
+    # @param [Hash] resolve (optional false by default)
     def initialize(url, resolve: false)
-      base_url = Request.resolve_url(url)
+      @base_url = Request.resolve_url(url)
      @options = { resolve: resolve }
-      @crawl_url = CrawlUrl.new(base_url)
+      @crawl_url = CrawlUrl.new(@base_url)
      @fetch_queue = CrawlQueue.new
      @processed = Set.new
-      @robots = Robots.new(base_url, HEADERS_HASH['User-Agent'])
+      @robots = nil
    end
 
    # @see #collect_urls
@@ -59,7 +57,14 @@ module SiteMapper
     end
 
     def eligible_for_queue?(url)
-      @robots.allowed?(url) && !@processed.include?(url)
+      robots.allowed?(url) && !@processed.include?(url)
+    end
+
+    def robots
+      return @robots unless @robots.nil?
+      robots_body = Request.get_response_body("#{@base_url}/robots.txt")
+      @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
+      @robots
     end
 
     def resolve(url)
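
The new private robots reader replaces the eager Robots construction that initialize used to do: robots.txt is only fetched and parsed the first time a URL is checked. A minimal standalone sketch of the same memoization pattern (assumed illustration, not gem code):

    # Build an expensive collaborator on first use and cache it for later calls.
    class LazyExample
      def initialize(source)
        @source = source
        @collaborator = nil
      end

      def collaborator
        # Memoized: the right-hand side runs only on the first call.
        @collaborator ||= "built from #{@source}" # stands in for fetching and parsing /robots.txt
      end
    end

    LazyExample.new('example.com').collaborator # => "built from example.com"

The upside is that no request for robots.txt is made until the crawler actually needs to check a URL.
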
@@ -67,12 +72,18 @@ module SiteMapper
     end
   end
 
+  # Queue of URLs to be crawled.
   class CrawlQueue
+    # @return [Set] that extends the EnumerablePop module
     def self.new
       Set.new.extend(EnumerablePop)
     end
 
+    # Adds a #pop method to the class that extends this module.
+    # The extending class needs to implement #first and #delete.
    module EnumerablePop
+      # Pop the first element from the list.
+      # @return [Object] the first object in the list or nil
      def pop
        first_element = first
        delete(first_element)
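
In practice the queue returned by CrawlQueue.new behaves like this standalone sketch (assumed example mirroring the module above; the pop shown here returns the removed element, as the @return tag describes):

    require 'set'

    # A Set extended with a #pop that removes and returns its first element.
    module EnumerablePop
      def pop
        first_element = first
        delete(first_element)
        first_element # nil when the set is empty
      end
    end

    queue = Set.new.extend(EnumerablePop)
    queue << 'http://example.com/' << 'http://example.com/about'
    queue.pop # => "http://example.com/"
    queue.pop # => "http://example.com/about"
    queue.pop # => nil
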
@@ -3,16 +3,28 @@ require 'url_resolver' # TODO: Allow users to use any resolver
 module SiteMapper
   # Get webpage wrapper.
   class Request
+    # Request info link
     INFO_LINK = 'https://rubygems.org/gems/site_mapper'
+    # Request User-Agent
     USER_AGENT = "SiteMapper/#{SiteMapper::VERSION} (+#{INFO_LINK})"
 
     class << self
       # Given an URL get it then parse it with Nokogiri::HTML.
+      # @param [String] url
       # @return [Nokogiri::HTML] a nokogiri HTML object
       def get_page(url)
         Nokogiri::HTML(Request.get_response_body(url))
       end
 
+      # Given an URL get the response.
+      # @param [String] url
+      # @param [Boolean] resolve (optional and false by default)
+      # @return [Net::HTTPOK] if the response is successful, raises an error otherwise
+      # @example get example.com and resolve the URL
+      #    Request.get_response('example.com', true)
+      # @example get example.com and do *not* resolve the URL
+      #    Request.get_response('http://example.com')
+      #    Request.get_response('http://example.com', false)
       def get_response(url, resolve = false)
         resolved_url = resolve ? resolve_url(url) : url
         uri = URI.parse(resolved_url)
@@ -20,18 +32,20 @@ module SiteMapper
         http.use_ssl = true if resolved_url.include?('https://')
 
         request = Net::HTTP::Get.new(uri.request_uri)
-        request['User-Agent'] = USER_AGENT
+        request['User-Agent'] = SiteMapper::USER_AGENT
         http.request(request)
       end
 
-      # Get response body, rescues with nil if an exception is raised
+      # Get response body, rescues with nil if an exception is raised.
       # @see #get_response
       def get_response_body(*args)
         get_response(*args).body rescue nil
       end
 
-      # Resolve an URL string and follows redirects
+      # Resolve an URL string and follows redirects.
       # if the URL can't be resolved the original URL is returned.
+      # @param [String] url
+      # @param [Boolean] with_query (optional and true by default)
       # @return [String] a URL string that potentially is a redirected URL
       # @example Resolve google.com
       #    resolve_url('google.com')
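
Outside the gem, the request pattern in get_response boils down to the following sketch (assumed standalone example; the URL and the User-Agent string are placeholders, not gem defaults):

    require 'net/http'
    require 'uri'

    def fetch(url, user_agent: 'SiteMapper/0.0.9 (+https://rubygems.org/gems/site_mapper)')
      uri  = URI.parse(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true if uri.scheme == 'https' # enable TLS for https URLs

      request = Net::HTTP::Get.new(uri.request_uri)
      request['User-Agent'] = user_agent # identify the crawler to the server
      http.request(request)
    end

    fetch('http://example.com').code # => "200"
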
@@ -43,6 +57,7 @@ module SiteMapper
       end
 
       # Removes query string from URL string.
+      # @param [String] url
       # @return [String] an URL string without query
       # @example Removes query string
       #    remove_query('example.com/path?q=keyword')
@@ -1,7 +1,7 @@
 # Based on: https://rubygems.org/gems/robots, v0.10.1
 module SiteMapper
   # Provided a base URL it checks whether a given URL is
-  # allowed to be crawled according to /robots.txt
+  # allowed to be crawled according to /robots.txt.
   # @see https://rubygems.org/gems/robots
   class Robots
     # Parses robots.txt
@@ -11,10 +11,12 @@ module SiteMapper
        @disallows = {}
        @allows = {}
        @delays = {}
+        @sitemaps = []
        parse(body)
      end
 
      # Parse robots.txt body.
+      # @param [String] body the webpage body HTML
      def parse(body)
        agent = /.*/
        body = body || "User-agent: *\nAllow: /\n"
@@ -36,6 +38,8 @@ module SiteMapper
          @disallows[agent] << to_regex(value)
        when 'crawl-delay'
          @delays[agent] = value.to_i
+        when 'sitemap'
+          @sitemaps << value
        else
          @other[key] ||= []
          @other[key] << value
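
The new 'sitemap' branch collects Sitemap entries while the body is parsed. A rough standalone equivalent for a plain robots.txt string (assumed illustration, not gem code):

    # Collect the values of Sitemap directives from a robots.txt body.
    def sitemaps_from(body)
      body.to_s.each_line.each_with_object([]) do |line, sitemaps|
        key, value = line.split(':', 2).map(&:strip)
        next unless key && value
        sitemaps << value if key.downcase == 'sitemap'
      end
    end

    robots_txt = "User-agent: *\nDisallow: /private\nSitemap: http://example.com/sitemap.xml\n"
    sitemaps_from(robots_txt) # => ["http://example.com/sitemap.xml"]
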
@@ -43,15 +47,20 @@ module SiteMapper
        end
        @parsed = true
      end
-
+
+      # @param [URI] uri to be checked
+      # @param [String] user_agent to be checked
      # @return [Boolean] true if uri is allowed to be crawled
      # @example Check if http://www.google.com/googlesites is allowed to be crawled
      #    uri = URI.parse('http://www.google.com/googlesites')
-      #    robots.allowed?(uri, 'SiteMapper') # => false (as of 2014-10-22)
+      #    robots.allowed?(uri, 'SiteMapper')
+      #    # => false (as of 2014-10-22)
      def allowed?(uri, user_agent)
        return true unless @parsed
        allowed = true
        path = uri.request_uri
+
+        user_agent.downcase!
 
        @disallows.each do |key, value|
          if user_agent =~ key
@@ -76,11 +85,24 @@ module SiteMapper
          end
        allowed
      end
+
+      # @param [String] user_agent
+      # @return [Integer] crawl delay for user_agent
+      def crawl_delay(user_agent)
+        agent = user_agent.dup
+        agent = to_regex(agent.downcase) if user_agent.is_a?(String)
+        @delays[agent]
+      end
 
      # @return [Hash] key/value pairs from robots.txt
      def other_values
        @other
      end
+
+      # @return [Array] returns sitemaps defined in robots.txt
+      def sitemaps
+        @sitemaps
+      end
 
      protected
 
@@ -92,12 +114,15 @@ module SiteMapper
      end
    end
 
-    def initialize(url, user_agent)
+    # @param [String] url to fetch /robots.txt from
+    def initialize(robots_txt, hostname, user_agent)
+      @robots_txt = robots_txt
+      @hostname = hostname
      @user_agent = user_agent
      @parsed = {}
-      @robots_txt = Request.get_response_body("#{url}/robots.txt", true)
    end
 
+    # @param [String, URI] uri String or URI to check
    # @return [Boolean] true if uri is allowed to be crawled
    # @example Check if http://www.google.com/googlesites is allowed to be crawled
    #    robots = Robots.new('google.com', 'SiteMapper')
@@ -116,22 +141,24 @@ module SiteMapper
    #    robots = Robots.new('google.com', 'SiteMapper')
    #    robots.sitemaps
    def sitemaps
-      uri = to_uri(uri)
-      values = other_values(uri.host)
-      values['sitemap'] or []
+      host = @hostname
+      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
+      @parsed[host].sitemaps
    rescue
      []
    end
 
+    # @param [String, URI] uri String or URI get other_values from
    # @return [Hash] key/value pairs from robots.txt
    # @example Get other values for google.com
    #    robots = Robots.new('google.com', 'SiteMapper')
    #    robots.other_values
-    def other_values(uri)
-      uri = to_uri(uri)
-      host = uri.host
+    def other_values
+      host = @hostname
      @parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
      @parsed[host].other_values
+    rescue
+      {}
    end
 
    private
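
Taken together with the Crawler change above, the reworked Robots class is now driven like this (assumed usage sketch; the example.com URLs are placeholders):

    require 'site_mapper'

    # The caller fetches robots.txt itself and hands over body, hostname and User-Agent.
    body   = SiteMapper::Request.get_response_body('http://example.com/robots.txt')
    robots = SiteMapper::Robots.new(body, 'example.com', SiteMapper::USER_AGENT)

    robots.allowed?('http://example.com/some/page') # => true or false per the robots.txt rules
    robots.sitemaps                                 # => Sitemap URLs listed in robots.txt, or []
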
@@ -1,3 +1,4 @@
 module SiteMapper
-  VERSION = '0.0.8'
+  # Gem version
+  VERSION = '0.0.9'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: site_mapper
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Jacob Burenstam
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-10-22 00:00:00.000000000 Z
+date: 2014-10-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri