RubyGems - content_urls - Versions diffs - 0.1.6 → 0.1.7 - Mend

content_urls 0.1.6 → 0.1.7

Files changed (9) hide show

checksums.yaml +8 -8
data/README.rdoc +2 -0
data/content_urls.gemspec +2 -2
data/lib/content_urls/parsers/html_parser.rb +0 -2
data/lib/content_urls/version.rb +1 -1
data/lib/content_urls.rb +49 -3
data/spec/content_urls_spec.rb +54 -0
data/spec/html_parser_spec.rb +20 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MTUxOTdmMDIxMmQzZDQ4ZGE0YWVlNTk5YjBlOTAxM2NkOWZmOTk4Mg==
+    MTEwNGE1ZjIwZTdjNDA1ZGJjODM0N2Q4ZWEyNmRlMGFiMDVjOTc4Yg==
   data.tar.gz: !binary |-
-    MGM3ODQ4MzQ2NTdiYzk1NWYxZDgyMzA1ZjNlOTMzYzZlMTM1Yjc5Yg==
+    NzVlMDU0ZDRlMDBiMTNkZDBmYzg5YTJiYjEwNjc1M2U4ZmQ1MDQ2Yg==
 !binary "U0hBNTEy":
   metadata.gz: !binary |-
-    ZTIxYjJhN2FkYzc5NTA1NTdjN2IxOTJmNGNmNzM4OWYwYWE5NjAzYzI4NDU1
-    ZWRhMDcyYzA0ZjY0OTI5NTY3MjZjMjZhNWRjOTYyNzJhYThjZmQ4MjcxNmI0
-    MjAzMGJhY2M4YTRkNDllMWI0YmE5NTkwMzljZGFlYWYzMTYyNmU=
+    NWEzNGVmZjFlOWVhZDdjOTNlN2IyNjgzNjA0OThhNWMwNzRjNDNhNjc4NTZi
+    NjI0N2NjYzUwYTRkNjYzNmEzM2RiMmI3ZDZkM2NiYWUxZGNmNzc1NzU0ZjBh
+    ZDY5ZTAxMjUzMWM3YTZiOTg3ZWVkMTE4MDRhYzY3MjI5ZTk3ZDA=
   data.tar.gz: !binary |-
-    NTQ0MDg3YTJlYmRiZmVmNDU5NzUzYWQyMDFiNmI4YTU1YWE2YWNjODQ3MzQ0
-    NzAzNWY1ZTFiNjAyYWFkNTY1YjNmZjc4OTc2MTE1NTJkNTQwYjA3MzIzMDE4
-    OTg2NmQ4NTM2YTU3NTcyN2Q1ODRiN2I3MmU4N2NlNDBjODAzZTE=
+    MTczNGZhNDE1MDZhZWJkZTEwNjg1NDdlZDFlMzVjODRiMzg4NjE3ZTc0ZWI5
+    NDdkNWQ2OWFhNmU3ODI0ZmM4NmEzNGM2MzIxZmVkODRmOGZiYWJlZTJkNDhl
+    NjgzNDZjMzVhY2QwMDQ3MjIxYzk3OGZlOWVhNmFlZGM4ZGE5YmI=

data/README.rdoc CHANGED Viewed

@@ -27,6 +27,8 @@ ContentUrls was developed to address two use cases:
   * url() notation
 * JavaScript content
   * URI module's REGEXP
+* Can convert relative URLs to absolute URLs by providing the resource URL
+* Can convert relative URLs to absolute URLs when a base URL is found in the HTML content
 == Examples
 === Find URLs in an HTML document

data/content_urls.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "content_urls"
-  s.version = "0.1.6"
+  s.version = "0.1.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Dennis Sutch"]
-  s.date = "2013-07-16"
+  s.date = "2013-07-18"
   s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
   s.email = "dennis@sutch.com"
   s.extra_rdoc_files = [

data/lib/content_urls/parsers/html_parser.rb CHANGED Viewed

@@ -62,8 +62,6 @@ class ContentUrls
       #  - should href URL be changed?
       #  - should relative URLs be modified using base?
       #  - how should rewritten relative URLs be handled?
-      base = doc.search('//head/base/@href')  # base URI for resolving relative URIs
-      base = nil if base && base.to_s.strip.empty?
       @@parser_definition.each do |type, definition|
         doc.search(definition[:xpath]).each do |obj|

data/lib/content_urls/version.rb CHANGED Viewed

@@ -2,7 +2,7 @@ class ContentUrls
   module Version
     MAJOR = 0
     MINOR = 1
-    PATCH = 6
+    PATCH = 7
     BUILD = nil
     STRING = [MAJOR, MINOR, PATCH, BUILD].compact.join('.')

data/lib/content_urls.rb CHANGED Viewed

@@ -9,6 +9,9 @@ class ContentUrls
   #
   # @param [String] content the content.
   # @param [String] type the media type of the content.
+  # @param [Hash] opts the options for manipulating returned URLs
+  # @option opts [Boolean] :use_base_url (false) if a base URL is found in the content, this option indicates whether the base URL will be used to change each relative URL to an absolute URL (note: the base URL is ignored if it is determined to be relative)
+  # @option opts [String] :content_url the URL from which the content was retrieved; will be used to change each relative URL to an absolute URL (note: the :use_base_url option takes precedence over the :content_url option; the content URL will be ignored if it is determined to be relative)
   # @return [Array] the unique URLs found in the content.
   #
   # @example Parse HTML code for URLs
@@ -18,7 +21,14 @@ class ContentUrls
   #   end
   #   # => "Found URL: index.html"
   #
-  # @example Parse content obtained from a robot
+  # @example Parse HTML code for URLs, changing each to an absolute URL based on the address of the original resource
+  #   content = '<html><a href="index.html">Home</a></html>'
+  #   ContentUrls.urls(content, 'text/html', content_url: 'http://www.example.com/sample.html').each do |url|
+  #     puts "Found URL: #{url}"
+  #   end
+  #   # => "Found URL: http://www.example.com/index.html"
+  #
+  # @example Parse content obtained from a robot
   #   response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
   #   puts "URLs found at http://example.com/sample-1:"
   #   ContentUrls.urls(response.body, response.content_type).each do |url|
@@ -26,14 +36,50 @@ class ContentUrls
   #   end
   #   # => [a list of URLs found in the content located at http://example.com/sample-1]
   #
-  def self.urls(content, type)
+  def self.urls(content, type, options = {})
+    options = {
+        :use_base_url => false,
+        :content_url => nil,
+    }.merge(options)
     urls = []
     if (parser = get_parser(type))
-      parser.urls(content).each { |url| urls << url }
+      base = base_url(content, type) if options[:use_base_url]
+      base = '' if URI(base || '').relative?
+      if options[:content_url]
+        content_url = URI(options[:content_url]) rescue ''
+        content_url = '' if URI(content_url).relative?
+        base = URI.join(content_url, base)
+      end
+      if URI(base).relative?
+        parser.urls(content).each { |url| urls << url }
+      else
+        parser.urls(content).each { |url| urls << URI.join( base, url).to_s }
+      end
     end
     urls
   end
+  # Returns base URL found in the content, if available.
+  #
+  # @param [String] content the content.
+  # @param [String] type the media type of the content.
+  # @return [String] the base URL found in the content.
+  #
+  # @example Parse HTML code for base URL
+  #   content = '<html><head><base href="/home/">'
+  #   puts "Found base URL: #{ContentUrls.base_url(content, 'text/html')}"
+  #   # => "Found base URL: /home/"
+  #
+  def self.base_url(content, type)
+    base = nil
+    if (parser = get_parser(type))
+      if (parser.respond_to?(:base))
+        base = parser.base(content)
+      end
+    end
+    base
+  end
   # Rewrites each URL in the content by calling the supplied block with each URL.
   #
   # @param [String] content the HTML content.

data/spec/content_urls_spec.rb CHANGED Viewed

@@ -27,3 +27,57 @@ describe ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
     ContentUrls.get_parser('content/test').should eq 'some_parser_class'
   end
 end
+describe ContentUrls do
+  it "should return relative URLs as absolute when requested" do
+    html_base_sample =<<BASE_SAMPLE
+<html>
+<head>
+  <base href='http://www.example.com/sample/'>
+  <title>HTML base Sample</title>
+</head>
+<body>
+  <h1>HTML base Sample</h1>
+  <a href='about.html'>about</a>
+</body>
+</html>
+BASE_SAMPLE
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
+    urls[0].should eq 'http://www.example.com/sample/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'https://www2.example.com/test/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/sample/about.html'
+  end
+end
+describe ContentUrls do
+  it "should not change absolute URLs when requested to make absolute URLs from relative URLs" do
+    html_base_sample =<<BASE_SAMPLE
+<html>
+<head>
+  <base href='http://www2.example.com/sample/'>
+  <title>HTML base Sample</title>
+</head>
+<body>
+  <h1>HTML base Sample</h1>
+  <a href='http://www.example.com/about.html'>about</a>
+</body>
+</html>
+BASE_SAMPLE
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true)
+    urls[0].should eq 'http://www.example.com/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/about.html'
+    urls = ContentUrls.urls(html_base_sample, 'text/html', use_base_url: true, content_url: 'https://www2.example.com/test/index.html')
+    urls[0].should eq 'http://www.example.com/about.html'
+  end
+end

data/spec/html_parser_spec.rb CHANGED Viewed

@@ -334,13 +334,32 @@ SAMPLE_13
   end
 end
+describe ContentUrls::HtmlParser do
+  it "should parse the HTML and return nil when no 'base' URL" do
+    html_missing_base_sample =<<MISSING_BASE_SAMPLE
+<html>
+<head>
+  <title>HTML no base Sample</title>
+</head>
+<body>
+  <h1>HTML no base Sample</h1>
+</body>
+</html>
+MISSING_BASE_SAMPLE
+    url = ContentUrls::HtmlParser.base(html_missing_base_sample)
+    url.should eq nil
+  end
+end
 describe ContentUrls::HtmlParser do
   it "should parse the HTML and return the 'base' URL and no other URLs" do
   html_base_sample =<<BASE_SAMPLE
 <html>
 <head>
-  <base href='/en/'
+  <base href='/en/'>
   <title>HTML base Sample</title>
 </head>
 <body>

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: content_urls
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.7
 platform: ruby
 authors:
 - Dennis Sutch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-16 00:00:00.000000000 Z
+date: 2013-07-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri