RubyGems - content_urls - Versions diffs - 0.1.6 → 0.1.7 - Mend

content_urls 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +8 -8
data/README.rdoc +2 -0
data/content_urls.gemspec +2 -2
data/lib/content_urls/parsers/html_parser.rb +0 -2
data/lib/content_urls/version.rb +1 -1
data/lib/content_urls.rb +49 -3
data/spec/content_urls_spec.rb +54 -0
data/spec/html_parser_spec.rb +20 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MTUxOTdmMDIxMmQzZDQ4ZGE0YWVlNTk5YjBlOTAxM2NkOWZmOTk4Mg==
+    MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
   data.tar.gz: !binary |-
-    MGM3ODQ4MzQ2NTdiYzk1NWYxZDgyMzA1ZjNlOTMzYzZlMTM1Yjc5Yg==
+    NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-    ZTIxYjJhN2FkYzc5NTA1NTdjN2IxOTJmNGNmNzM4OWYwYWE5NjAzYzI4NDU1
-    ZWRhMDcyYzA0ZjY0OTI5NTY3MjZjMjZhNWRjOTYyNzJhYThjZmQ4MjcxNmI0
-    MjAzMGJhY2M4YTRkNDllMWI0YmE5NTkwMzljZGFlYWYzMTYyNmU=
+    NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
+    NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
+    ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
   data.tar.gz: !binary |-
-    NTQ0MDg3YTJlYmRiZmVmNDU5NzUzYWQyMDFiNmI4YTU1YWE2YWNjODQ3MzQ0
-    NzAzNWY1ZTFiNjAyYWFkNTY1YjNmZjc4OTc2MTE1NTJkNTQwYjA3MzIzMDE4
-    OTg2NmQ4NTM2YTU3NTcyN2Q1ODRiN2I3MmU4N2NlNDBjODAzZTE=
+    MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
+    NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
+    NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=

data/README.rdoc CHANGED Viewed

@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
   * url() notation
 * JavaScript content
   * URI module's REGEXP
+* Can convert relative URLs to absolute URLs when the resource URL is provided
+* Can convert relative URLs to absolute URLs when a base URL is found in the HTML content
 == Examples
 === Find URLs in an HTML document

data/content_urls.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "content_urls"
-  s.version = "0.1.6"
+  s.version = "0.1.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Dennis Sutch"]
-  s.date = "2013-07-16"
+  s.date = "2013-07-18"
   s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
   s.email = "dennis@sutch.com"
   s.extra_rdoc_files = [

data/lib/content_urls/parsers/html_parser.rb CHANGED Viewed

@@ -62,8 +62,6 @@ class ContentUrls
       #  - should href URL be changed?
       #  - should relative URLs be modified using base?
       #  - how should rewritten relative URLs be handled?
-      base = doc.search('//head/base/@href')  # base URI for resolving relative URIs
-      base = nil if base && base.to_s.strip.empty?
       @@parser_definition.each do |type, definition|
         doc.search(definition[:xpath]).each do |obj|

data/lib/content_urls/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ class ContentUrls
   module Version
     MAJOR = 0
     MINOR = 1
-    PATCH = 6
+    PATCH = 7
     BUILD = nil
     STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')

data/lib/content_urls.rb CHANGED Viewed

@@ -9,6 +9,9 @@ class ContentUrls
   #
   # @param [String] content the content.
   # @param [String] type the media type of the content.
+  # @param [Hash] opts the options for manipulating returned URLs
+  # @option opts [Boolean] :use_base_url (false) if base URL is found in content, this option indicates whether base URL will be used to change each relative URL to an absolute URL (note: base URL is ignored if determined to be relative)
+  # @option opts [String] :content_url the URL from which content was retrieved; will be used to change each relative URL to an absolute URL (note: :use_base_url option takes precedence over :content_url option; content URL will be ignored if determined to be relative)
   # @return [Array] the unique URLs found in the content.
   #
   # @example Parse HTML code for URLs
@@ -18,7 +21,14 @@ class ContentUrls
   #   end
   #   # => "Found URL: index.html"
   #
-  # @example Parse content obtained from a robot
+  # @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
+  #   content = '<html><a href="index.html">Home</a></html>'
+  #   ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
+  #     puts "Found URL: #{url}"
+  #   end
+  #   # => "Found URL: http://www.example.com/index.html"
+  #
+  # @example Parse content obtained from a robot
   #   response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
   #   puts "URLs found at http://example.com/sample-1:"
   #   ContentUrls.urls(response.body, response.content_type).each do |url|
@@ -26,14 +36,50 @@ class ContentUrls
   #   end
   #   # => [a list of URLs found in the content located at http://example.com/sample-1]
   #
-  def self.urls(content, type)
+  def self.urls(content, type, options = {})
+    options = {
+        :use_base_url => false,
+        :content_url => nil,
+    }.merge(options)
     urls = []
     if (parser = get_parser(type))
-      parser.urls(content).each { |url| urls << url }
+      base = base_url(content, type) if options[:use_base_url]
+      base = '' if URI(base || '').relative?
+      if options[:content_url]
+        content_url = URI(options[:content_url]) rescue ''
+        content_url = '' if URI(content_url).relative?
+        base = URI.join(content_url, base)
+      end
+      if URI(base).relative?
+        parser.urls(content).each { |url| urls << url }
+      else
+        parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
+      end
     end
     urls
   end
+  # Returns base URL found in the content, if available.
+  #
+  # @param [String] content the content.
+  # @param [String] type the media type of the content.
+  # @return [String] the base URL found in the content.
+  #
+  # @example Parse HTML code for base URL
+  #   content = '<html><head><base href="/home/">'
+  #   puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
+  #   # => "Found base URL: /home/"
+  #
+  def self.base_url(content, type)
+    base = nil
+    if (parser = get_parser(type))
+      if (parser.respond_to?(:base))
+        base = parser.base(content)
+      end
+    end
+    base
+  end
   # Rewrites each URL in the content by calling the supplied block with each URL.
   #
   # @param [String] content the HTML content.

data/spec/content_urls_spec.rb CHANGED Viewed

@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
     ContentUrls.get_parser('content/test').should eq 'some_parser_class'
   end
 end
+describe ContentUrls do
+  it "should return relative URLs as absolute when requested" do
+    html_base_sample =<<BASE_SAMPLE
+<html>
+<head>
+  <base href='http://www.example.com/sample/'>
+  <title>HTML base Sample</title>
+</head>
+<body>
+  <h1>HTML base Sample</h1>
+  <a href='about.html'>about</a>
+</body>
+</html>
+BASE_SAMPLE
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
+    urls[0].should eq 'http://www.example.com/sample/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'https://www2.example.com/test/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/sample/about.html'
+  end
+end
+describe ContentUrls do
+  it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
+    html_base_sample =<<BASE_SAMPLE
+<html>
+<head>
+  <base href='http://www2.example.com/sample/'>
+  <title>HTML base Sample</title>
+</head>
+<body>
+  <h1>HTML base Sample</h1>
+  <a href='http://www.example.com/about.html'>about</a>
+</body>
+</html>
+BASE_SAMPLE
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
+    urls[0].should eq 'http://www.example.com/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/about.html'
+  end
+end

data/spec/html_parser_spec.rb CHANGED Viewed

@@ -334,13 +334,32 @@ SAMPLE_13
   end
 end
+describe ContentUrls::HtmlParser do
+  it "should parse the HTML and return nil when no 'base' URL" do
+    html_missing_base_sample =<<MISSING_BASE_SAMPLE
+<html>
+<head>
+  <title>HTML no base Sample</title>
+</head>
+<body>
+  <h1>HTML no base Sample</h1>
+</body>
+</html>
+MISSING_BASE_SAMPLE
+    url = ContentUrls::HtmlParser.base(html_missing_base_sample)
+    url.should eq nil
+  end
+end
 describe ContentUrls::HtmlParser do
   it "should parse the HTML and return the 'base' URL and no other URLs" do
   html_base_sample =<<BASE_SAMPLE
 <html>
 <head>
-  <base href='/en/'
+  <base href='/en/'>
   <title>HTML base Sample</title>
 </head>
 <body>

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: content_urls
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Dennis Sutch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-16 00:00:00.000000000 Z
+date: 2013-07-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri