content_urls 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTUxOTdmMDIxMmQzZDQ4ZGE0YWVlNTk5YjBlOTAxM2NkOWZmOTk4Mg==
4
+ MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
5
5
  data.tar.gz: !binary |-
6
- MGM3ODQ4MzQ2NTdiYzk1NWYxZDgyMzA1ZjNlOTMzYzZlMTM1Yjc5Yg==
6
+ NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZTIxYjJhN2FkYzc5NTA1NTdjN2IxOTJmNGNmNzM4OWYwYWE5NjAzYzI4NDU1
10
- ZWRhMDcyYzA0ZjY0OTI5NTY3MjZjMjZhNWRjOTYyNzJhYThjZmQ4MjcxNmI0
11
- MjAzMGJhY2M4YTRkNDllMWI0YmE5NTkwMzljZGFlYWYzMTYyNmU=
9
+ NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
10
+ NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
11
+ ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
12
12
  data.tar.gz: !binary |-
13
- NTQ0MDg3YTJlYmRiZmVmNDU5NzUzYWQyMDFiNmI4YTU1YWE2YWNjODQ3MzQ0
14
- NzAzNWY1ZTFiNjAyYWFkNTY1YjNmZjc4OTc2MTE1NTJkNTQwYjA3MzIzMDE4
15
- OTg2NmQ4NTM2YTU3NTcyN2Q1ODRiN2I3MmU4N2NlNDBjODAzZTE=
13
+ MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
14
+ NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
15
+ NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=
data/README.rdoc CHANGED
@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
27
27
  * url() notation
28
28
  * JavaScript content
29
29
  * URI module's REGEXP
30
+ * Can convert relative URLs to absolute URLs by providing resource URL
31
+ * Can convert relative URLs to absolute URLs when a base URL is found in the HTML content
30
32
 
31
33
  == Examples
32
34
  === Find URLs in an HTML document
data/content_urls.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "content_urls"
8
- s.version = "0.1.6"
8
+ s.version = "0.1.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Dennis Sutch"]
12
- s.date = "2013-07-16"
12
+ s.date = "2013-07-18"
13
13
  s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
14
14
  s.email = "dennis@sutch.com"
15
15
  s.extra_rdoc_files = [
@@ -62,8 +62,6 @@ class ContentUrls
62
62
  # - should href URL be changed?
63
63
  # - should relative URLs be modified using base?
64
64
  # - how should rewritten relative URLs be handled?
65
- base = doc.search('//head/base/@href') # base URI for resolving relative URIs
66
- base = nil if base && base.to_s.strip.empty?
67
65
 
68
66
  @@parser_definition.each do |type, definition|
69
67
  doc.search(definition[:xpath]).each do |obj|
@@ -2,7 +2,7 @@ class ContentUrls
2
2
  module Version
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- PATCH = 6
5
+ PATCH = 7
6
6
  BUILD = nil
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
data/lib/content_urls.rb CHANGED
@@ -9,6 +9,9 @@ class ContentUrls
9
9
  #
10
10
  # @param [String] content the content.
11
11
  # @param [String] type the media type of the content.
12
+ # @param [Hash] opts the options for manipulating returned URLs
13
+ # @option opts [Boolean] :use_base_url (false) if a base URL is found in the content, this option indicates whether the base URL will be used to change each relative URL to an absolute URL (note: the base URL is ignored if determined to be relative)
14
+ # @option opts [String] :content_url the URL from which content was retrieved; will be used to change each relative URL to an absolute URL (note: :use_base_url option takes precedence over :content_url option; content URL will be ignored if determined to be relative)
12
15
  # @return [Array] the unique URLs found in the content.
13
16
  #
14
17
  # @example Parse HTML code for URLs
@@ -18,7 +21,14 @@ class ContentUrls
18
21
  # end
19
22
  # # => "Found URL: index.html"
20
23
  #
21
- # @example Parse content obtained from a robot
24
+ # @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
25
+ # content = '<html><a href="index.html">Home</a></html>'
26
+ # ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
27
+ # puts "Found URL: #{url}"
28
+ # end
29
+ # # => "Found URL: http://www.example.com/index.html"
30
+ #
31
+ # @example Parse content obtained from a robot
22
32
  # response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
23
33
  # puts "URLs found at http://example.com/sample-1:"
24
34
  # ContentUrls.urls(response.body, response.content_type).each do |url|
@@ -26,14 +36,50 @@ class ContentUrls
26
36
  # end
27
37
  # # => [a list of URLs found in the content located at http://example.com/sample-1]
28
38
  #
29
- def self.urls(content, type)
39
+ def self.urls(content, type, options = {})
40
+ options = {
41
+ :use_base_url => false,
42
+ :content_url => nil,
43
+ }.merge(options)
30
44
  urls = []
31
45
  if (parser = get_parser(type))
32
- parser.urls(content).each { |url| urls << url }
46
+ base = base_url(content, type) if options[:use_base_url]
47
+ base = '' if URI(base || '').relative?
48
+ if options[:content_url]
49
+ content_url = URI(options[:content_url]) rescue ''
50
+ content_url = '' if URI(content_url).relative?
51
+ base = URI.join(content_url, base)
52
+ end
53
+ if URI(base).relative?
54
+ parser.urls(content).each { |url| urls << url }
55
+ else
56
+ parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
57
+ end
33
58
  end
34
59
  urls
35
60
  end
36
61
 
62
+ # Returns base URL found in the content, if available.
63
+ #
64
+ # @param [String] content the content.
65
+ # @param [String] type the media type of the content.
66
+ # @return [String] the base URL found in the content.
67
+ #
68
+ # @example Parse HTML code for base URL
69
+ # content = '<html><head><base href="/home/">'
70
+ # puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
71
+ # # => "Found base URL: /home/"
72
+ #
73
+ def self.base_url(content, type)
74
+ base = nil
75
+ if (parser = get_parser(type))
76
+ if (parser.respond_to?(:base))
77
+ base = parser.base(content)
78
+ end
79
+ end
80
+ base
81
+ end
82
+
37
83
  # Rewrites each URL in the content by calling the supplied block with each URL.
38
84
  #
39
85
  # @param [String] content the HTML content.
@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
27
27
  ContentUrls.get_parser('content/test').should eq 'some_parser_class'
28
28
  end
29
29
  end
30
+
31
+ describe ContentUrls do
32
+ it "should return relative URLs as absolute when requested" do
33
+
34
+ html_base_sample =<<BASE_SAMPLE
35
+ <html>
36
+ <head>
37
+ <base href='http://www.example.com/sample/'>
38
+ <title>HTML base Sample</title>
39
+ </head>
40
+ <body>
41
+ <h1>HTML base Sample</h1>
42
+ <a href='about.html'>about</a>
43
+ </body>
44
+ </html>
45
+ BASE_SAMPLE
46
+
47
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
48
+ urls[0].should eq 'http://www.example.com/sample/about.html'
49
+
50
+ urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
51
+ urls[0].should eq 'https://www2.example.com/test/about.html'
52
+
53
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
54
+ urls[0].should eq 'http://www.example.com/sample/about.html'
55
+ end
56
+ end
57
+
58
+ describe ContentUrls do
59
+ it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
60
+
61
+ html_base_sample =<<BASE_SAMPLE
62
+ <html>
63
+ <head>
64
+ <base href='http://www2.example.com/sample/'>
65
+ <title>HTML base Sample</title>
66
+ </head>
67
+ <body>
68
+ <h1>HTML base Sample</h1>
69
+ <a href='http://www.example.com/about.html'>about</a>
70
+ </body>
71
+ </html>
72
+ BASE_SAMPLE
73
+
74
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
75
+ urls[0].should eq 'http://www.example.com/about.html'
76
+
77
+ urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
78
+ urls[0].should eq 'http://www.example.com/about.html'
79
+
80
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
81
+ urls[0].should eq 'http://www.example.com/about.html'
82
+ end
83
+ end
@@ -334,13 +334,32 @@ SAMPLE_13
334
334
  end
335
335
  end
336
336
 
337
+ describe ContentUrls::HtmlParser do
338
+ it "should parse the HTML and return nil when no 'base' URL" do
339
+
340
+ html_missing_base_sample =<<MISSING_BASE_SAMPLE
341
+ <html>
342
+ <head>
343
+ <title>HTML no base Sample</title>
344
+ </head>
345
+ <body>
346
+ <h1>HTML no base Sample</h1>
347
+ </body>
348
+ </html>
349
+ MISSING_BASE_SAMPLE
350
+
351
+ url = ContentUrls::HtmlParser.base(html_missing_base_sample)
352
+ url.should eq nil
353
+ end
354
+ end
355
+
337
356
  describe ContentUrls::HtmlParser do
338
357
  it "should parse the HTML and return the 'base' URL and no other URLs" do
339
358
 
340
359
  html_base_sample =<<BASE_SAMPLE
341
360
  <html>
342
361
  <head>
343
- <base href='/en/'
362
+ <base href='/en/'>
344
363
  <title>HTML base Sample</title>
345
364
  </head>
346
365
  <body>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_urls
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dennis Sutch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-16 00:00:00.000000000 Z
11
+ date: 2013-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri