content_urls 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTUxOTdmMDIxMmQzZDQ4ZGE0YWVlNTk5YjBlOTAxM2NkOWZmOTk4Mg==
4
+ MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
5
5
  data.tar.gz: !binary |-
6
- MGM3ODQ4MzQ2NTdiYzk1NWYxZDgyMzA1ZjNlOTMzYzZlMTM1Yjc5Yg==
6
+ NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZTIxYjJhN2FkYzc5NTA1NTdjN2IxOTJmNGNmNzM4OWYwYWE5NjAzYzI4NDU1
10
- ZWRhMDcyYzA0ZjY0OTI5NTY3MjZjMjZhNWRjOTYyNzJhYThjZmQ4MjcxNmI0
11
- MjAzMGJhY2M4YTRkNDllMWI0YmE5NTkwMzljZGFlYWYzMTYyNmU=
9
+ NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
10
+ NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
11
+ ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
12
12
  data.tar.gz: !binary |-
13
- NTQ0MDg3YTJlYmRiZmVmNDU5NzUzYWQyMDFiNmI4YTU1YWE2YWNjODQ3MzQ0
14
- NzAzNWY1ZTFiNjAyYWFkNTY1YjNmZjc4OTc2MTE1NTJkNTQwYjA3MzIzMDE4
15
- OTg2NmQ4NTM2YTU3NTcyN2Q1ODRiN2I3MmU4N2NlNDBjODAzZTE=
13
+ MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
14
+ NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
15
+ NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=
data/README.rdoc CHANGED
@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
27
27
  * url() notation
28
28
  * JavaScript content
29
29
  * URI module's REGEXP
30
+ * Can convert relative URLs to absolute URLs by providing resource URL
31
+ * Can convert relative URLs to absolute URLs when base URL found in HTML content
30
32
 
31
33
  == Examples
32
34
  === Find URLs in an HTML document
data/content_urls.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "content_urls"
8
- s.version = "0.1.6"
8
+ s.version = "0.1.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Dennis Sutch"]
12
- s.date = "2013-07-16"
12
+ s.date = "2013-07-18"
13
13
  s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
14
14
  s.email = "dennis@sutch.com"
15
15
  s.extra_rdoc_files = [
@@ -62,8 +62,6 @@ class ContentUrls
62
62
  # - should href URL be changed?
63
63
  # - should relative URLs be modified using base?
64
64
  # - how should rewritten relative URLs be handled?
65
- base = doc.search('//head/base/@href') # base URI for resolving relative URIs
66
- base = nil if base && base.to_s.strip.empty?
67
65
 
68
66
  @@parser_definition.each do |type, definition|
69
67
  doc.search(definition[:xpath]).each do |obj|
@@ -2,7 +2,7 @@ class ContentUrls
2
2
  module Version
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- PATCH = 6
5
+ PATCH = 7
6
6
  BUILD = nil
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')
data/lib/content_urls.rb CHANGED
@@ -9,6 +9,9 @@ class ContentUrls
9
9
  #
10
10
  # @param [String] content the content.
11
11
  # @param [String] type the media type of the content.
12
+ # @param [Hash] opts the options for manipulating returned URLs
13
+ # @option opts [Boolean] :use_base_url (false) if a base URL is found in the content, this option indicates whether the base URL will be used to change each relative URL to an absolute URL (note: the base URL is ignored if determined to be relative)
14
+ # @option opts [String] :content_url the URL from which content was retrieved; will be used to change each relative URL to an absolute URL (note: :use_base_url option takes precedence over :content_url option; content URL will be ignored if determined to be relative)
12
15
  # @return [Array] the unique URLs found in the content.
13
16
  #
14
17
  # @example Parse HTML code for URLs
@@ -18,7 +21,14 @@ class ContentUrls
18
21
  # end
19
22
  # # => "Found URL: index.html"
20
23
  #
21
- # @example Parse content obtained from a robot
24
+ # @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
25
+ # content = '<html><a href="index.html">Home</a></html>'
26
+ # ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
27
+ # puts "Found URL: #{url}"
28
+ # end
29
+ # # => "Found URL: http://www.example.com/index.html"
30
+ #
31
+ # @example Parse content obtained from a robot
22
32
  # response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
23
33
  # puts "URLs found at http://example.com/sample-1:"
24
34
  # ContentUrls.urls(response.body, response.content_type).each do |url|
@@ -26,14 +36,50 @@ class ContentUrls
26
36
  # end
27
37
  # # => [a list of URLs found in the content located at http://example.com/sample-1]
28
38
  #
29
- def self.urls(content, type)
39
+ def self.urls(content, type, options = {})
40
+ options = {
41
+ :use_base_url => false,
42
+ :content_url => nil,
43
+ }.merge(options)
30
44
  urls = []
31
45
  if (parser = get_parser(type))
32
- parser.urls(content).each { |url| urls << url }
46
+ base = base_url(content, type) if options[:use_base_url]
47
+ base = '' if URI(base || '').relative?
48
+ if options[:content_url]
49
+ content_url = URI(options[:content_url]) rescue ''
50
+ content_url = '' if URI(content_url).relative?
51
+ base = URI.join(content_url, base)
52
+ end
53
+ if URI(base).relative?
54
+ parser.urls(content).each { |url| urls << url }
55
+ else
56
+ parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
57
+ end
33
58
  end
34
59
  urls
35
60
  end
36
61
 
62
+ # Returns base URL found in the content, if available.
63
+ #
64
+ # @param [String] content the content.
65
+ # @param [String] type the media type of the content.
66
+ # @return [String] the base URL found in the content.
67
+ #
68
+ # @example Parse HTML code for base URL
69
+ # content = '<html><head><base href="/home/">'
70
+ # puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
71
+ # # => "Found base URL: /home/"
72
+ #
73
+ def self.base_url(content, type)
74
+ base = nil
75
+ if (parser = get_parser(type))
76
+ if (parser.respond_to?(:base))
77
+ base = parser.base(content)
78
+ end
79
+ end
80
+ base
81
+ end
82
+
37
83
  # Rewrites each URL in the content by calling the supplied block with each URL.
38
84
  #
39
85
  # @param [String] content the HTML content.
@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
27
27
  ContentUrls.get_parser('content/test').should eq 'some_parser_class'
28
28
  end
29
29
  end
30
+
31
+ describe ContentUrls do
32
+ it "should return relative URLs as absolute when requested" do
33
+
34
+ html_base_sample =<<BASE_SAMPLE
35
+ <html>
36
+ <head>
37
+ <base href='http://www.example.com/sample/'>
38
+ <title>HTML base Sample</title>
39
+ </head>
40
+ <body>
41
+ <h1>HTML base Sample</h1>
42
+ <a href='about.html'>about</a>
43
+ </body>
44
+ </html>
45
+ BASE_SAMPLE
46
+
47
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
48
+ urls[0].should eq 'http://www.example.com/sample/about.html'
49
+
50
+ urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
51
+ urls[0].should eq 'https://www2.example.com/test/about.html'
52
+
53
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
54
+ urls[0].should eq 'http://www.example.com/sample/about.html'
55
+ end
56
+ end
57
+
58
+ describe ContentUrls do
59
+ it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
60
+
61
+ html_base_sample =<<BASE_SAMPLE
62
+ <html>
63
+ <head>
64
+ <base href='http://www2.example.com/sample/'>
65
+ <title>HTML base Sample</title>
66
+ </head>
67
+ <body>
68
+ <h1>HTML base Sample</h1>
69
+ <a href='http://www.example.com/about.html'>about</a>
70
+ </body>
71
+ </html>
72
+ BASE_SAMPLE
73
+
74
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
75
+ urls[0].should eq 'http://www.example.com/about.html'
76
+
77
+ urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
78
+ urls[0].should eq 'http://www.example.com/about.html'
79
+
80
+ urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
81
+ urls[0].should eq 'http://www.example.com/about.html'
82
+ end
83
+ end
@@ -334,13 +334,32 @@ SAMPLE_13
334
334
  end
335
335
  end
336
336
 
337
+ describe ContentUrls::HtmlParser do
338
+ it "should parse the HTML and return nil when no 'base' URL" do
339
+
340
+ html_missing_base_sample =<<MISSING_BASE_SAMPLE
341
+ <html>
342
+ <head>
343
+ <title>HTML no base Sample</title>
344
+ </head>
345
+ <body>
346
+ <h1>HTML no base Sample</h1>
347
+ </body>
348
+ </html>
349
+ MISSING_BASE_SAMPLE
350
+
351
+ url = ContentUrls::HtmlParser.base(html_missing_base_sample)
352
+ url.should eq nil
353
+ end
354
+ end
355
+
337
356
  describe ContentUrls::HtmlParser do
338
357
  it "should parse the HTML and return the 'base' URL and no other URLs" do
339
358
 
340
359
  html_base_sample =<<BASE_SAMPLE
341
360
  <html>
342
361
  <head>
343
- <base href='/en/'
362
+ <base href='/en/'>
344
363
  <title>HTML base Sample</title>
345
364
  </head>
346
365
  <body>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: content_urls
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dennis Sutch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-16 00:00:00.000000000 Z
11
+ date: 2013-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri