RubyGems - content_urls - Versions diffs - 0.1.0 - Mend

content_urls 0.1.0

Files changed (19) hide show

data/.document +5 -0
data/.rspec +1 -0
data/Gemfile +13 -0
data/LICENSE.txt +20 -0
data/README.rdoc +63 -0
data/Rakefile +42 -0
data/VERSION +1 -0
data/content_urls.gemspec +78 -0
data/lib/content_urls.rb +107 -0
data/lib/content_urls/parsers/css_parser.rb +126 -0
data/lib/content_urls/parsers/html_parser.rb +150 -0
data/lib/content_urls/parsers/java_script_parser.rb +64 -0
data/lib/content_urls/version.rb +3 -0
data/spec/content_urls_spec.rb +29 -0
data/spec/css_parser_spec.rb +34 -0
data/spec/html_parser_spec.rb +318 -0
data/spec/java_script_parser_spec.rb +31 -0
data/spec/spec_helper.rb +12 -0
metadata +195 -0

data/.document ADDED

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/.rspec ADDED

	@@ -0,0 +1 @@
1	+ #--color

data/Gemfile ADDED

@@ -0,0 +1,13 @@
+# Gems are fetched over TLS so packages cannot be tampered with in transit.
+source "https://rubygems.org"
+# Runtime dependency.
+gem "nokogiri"
+# Tooling needed only to test, document and release the gem.
+group :development do
+  gem "rspec", "~> 2.8.0"
+  gem "yard", "~> 0.7"
+  gem "rdoc", "~> 3.12"
+  gem "bundler"
+  gem "jeweler", "~> 1.8.4"
+  gem "rcov", "0.9.9"
+  gem "rake", "~> 0.9.2.2"
+end

data/LICENSE.txt ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2012 Dennis Sutch
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,63 @@
+= content_urls
+Find and rewrite URLs in different types of content.
+ContentUrls was developed to address two use cases:
+* Find each URL in content retrieved from a website in order to spider and find all content on the website.
+* Rewrite each URL in content retrieved from a website in order to make a working local copy of the website.
+== Features
+* Three types of content: HTML, CSS and JavaScript
+  * HTML content
+    * <a> tag href attribute
+    * <area> tag href attribute
+    * <body> tag background attribute
+    * <embed> tag src attribute
+    * <img> tag src attribute
+    * <link> tag href attribute
+    * <meta> tag content attribute containing URL
+    * <object> tag data attribute
+    * <script> tag src attribute
+    * style attribute of any tag (parsed as CSS content)
+    * body of <style> tag (parsed as CSS content)
+    * body of <script> tag when type or language attribute identifies JavaScript (parsed as JavaScript content)
+  * CSS content
+    * url() notation
+  * JavaScript content
+    * URI module's REGEXP
+== Examples
+=== Find URLs in an HTML document
+Provide the HTML content and the content type and obtain an array of unique URLs.
+ ContentUrls.urls(html, 'text/html').each do |url|
+   puts "Found URL: #{url}"
+ end
+=== Rewrite URLs in an HTML document
+Provide the HTML content, the content type, and a block to rewrite each URL's extension.
+ rewritten_html = ContentUrls.rewrite_each_url(html, 'text/html') {|url| url.sub(/\.htm\z/, '.html')}
+== Requirements
+* nokogiri
+== Development
+To test and develop this gem, additional requirements are:
+* bundler
+* jeweler
+* rake
+* rcov
+* rdoc
+* rspec
+* yard
+== Contributing to content_urls
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
+* Fork the project.
+* Start a feature/bugfix branch.
+* Commit and push until you are happy with your contribution.
+* Make sure to add tests for it. This is important so I don't unintentionally break it in a future version.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
+== Copyright
+Copyright (c) 2012 Dennis Sutch. See LICENSE.txt for further details.

data/Rakefile ADDED

@@ -0,0 +1,42 @@
+# encoding: utf-8
+# Rake tasks for the content_urls gem: packaging (Jeweler), specs (RSpec) and API docs (YARD).
+require 'rubygems'
+require 'bundler'
+# Activate the :default and :development gem groups before anything else is required.
+# If Bundler cannot resolve them, print its message plus a hint and exit with Bundler's status code.
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+# Gem metadata consumed by Jeweler's tasks.
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "content_urls"
+  gem.homepage = "http://github.com/sutch/content_urls"
+  gem.license = "MIT"
+  gem.summary = %Q{Find and rewrite URLs in different types of content.}
+  gem.description = %Q{Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.}
+  gem.email = "dennis@sutch.com"
+  gem.authors = ["Dennis Sutch"]
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rspec/core'
+require 'rspec/core/rake_task'
+# `rake spec`: run every *_spec.rb file under spec/.
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = FileList['spec/**/*_spec.rb']
+end
+# `rake rcov`: same spec files, with the task's rcov option switched on.
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+# Running `rake` with no arguments runs the specs.
+task :default => :spec
+require 'yard'
+YARD::Rake::YardocTask.new

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.1.0

data/content_urls.gemspec ADDED

@@ -0,0 +1,78 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+# Gem specification for content_urls 0.1.0: metadata, packaged files and dependencies.
+Gem::Specification.new do |s|
+  s.name = "content_urls"
+  s.version = "0.1.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Dennis Sutch"]
+  s.date = "2012-10-03"
+  s.description = "Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs."
+  s.email = "dennis@sutch.com"
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.rdoc"
+  ]
+  # Every file shipped inside the packaged gem.
+  s.files = [
+    ".document",
+    ".rspec",
+    "Gemfile",
+    "LICENSE.txt",
+    "README.rdoc",
+    "Rakefile",
+    "VERSION",
+    "content_urls.gemspec",
+    "lib/content_urls.rb",
+    "lib/content_urls/parsers/css_parser.rb",
+    "lib/content_urls/parsers/html_parser.rb",
+    "lib/content_urls/parsers/java_script_parser.rb",
+    "lib/content_urls/version.rb",
+    "spec/content_urls_spec.rb",
+    "spec/css_parser_spec.rb",
+    "spec/html_parser_spec.rb",
+    "spec/java_script_parser_spec.rb",
+    "spec/spec_helper.rb"
+  ]
+  s.homepage = "http://github.com/sutch/content_urls"
+  s.licenses = ["MIT"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = "1.8.23"
+  s.summary = "Find and rewrite URLs in different types of content."
+  # Dependency declarations are repeated three times, one per capability level of the
+  # RubyGems running this file. Only the first branch distinguishes runtime from
+  # development dependencies; the two fallbacks register everything via add_dependency.
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
+      s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_development_dependency(%q<yard>, ["~> 0.7"])
+      s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_development_dependency(%q<bundler>, [">= 0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
+      s.add_development_dependency(%q<rcov>, ["= 0.9.9"])
+      s.add_development_dependency(%q<rake>, ["~> 0.9.2.2"])
+    else
+      s.add_dependency(%q<nokogiri>, [">= 0"])
+      s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+      s.add_dependency(%q<yard>, ["~> 0.7"])
+      s.add_dependency(%q<rdoc>, ["~> 3.12"])
+      s.add_dependency(%q<bundler>, [">= 0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
+      s.add_dependency(%q<rcov>, ["= 0.9.9"])
+      s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
+    end
+  else
+    s.add_dependency(%q<nokogiri>, [">= 0"])
+    s.add_dependency(%q<rspec>, ["~> 2.8.0"])
+    s.add_dependency(%q<yard>, ["~> 0.7"])
+    s.add_dependency(%q<rdoc>, ["~> 3.12"])
+    s.add_dependency(%q<bundler>, [">= 0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
+    s.add_dependency(%q<rcov>, ["= 0.9.9"])
+    s.add_dependency(%q<rake>, ["~> 0.9.2.2"])
+  end
+end

data/lib/content_urls.rb ADDED

@@ -0,0 +1,107 @@
+require 'content_urls/version'
+require 'uri'
+# +ContentUrls+ parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides methods for iterating through URLs and changing URLs.
+#
+class ContentUrls
+  # Returns the URLs found in the content.
+  #
+  # @param [String] content the content.
+  # @param [String] type the media type of the content.
+  # @return [Array] the unique URLs found in the content; empty when no parser is registered for +type+.
+  #
+  # @example Parse HTML code for URLs
+  #   content = '<html><a href="index.html">Home</a></html>'
+  #   ContentUrls.urls(content, 'text/html').each do |url|
+  #     puts "Found URL: #{url}"
+  #   end
+  #   # => "Found URL: index.html"
+  #
+  # @example Parse content obtained from a robot
+  #   response = Net::HTTP.get_response(URI('http://example.com/sample-1'))
+  #   puts "URLs found at http://example.com/sample-1:"
+  #   ContentUrls.urls(response.body, response.content_type).each do |url|
+  #     puts "  #{url}"
+  #   end
+  #   # => [a list of URLs found in the content located at http://example.com/sample-1]
+  #
+  def self.urls(content, type)
+    parser = get_parser(type)
+    return [] unless parser
+    # Parsers expose +urls+ as a class-level method taking the content; they are not instantiated.
+    parser.urls(content).uniq
+  end
+  # Rewrites each URL in the content by calling the supplied block with each URL.
+  # A block result of nil leaves that URL unchanged.
+  #
+  # @param [String] content the content.
+  # @param [String] type the media type of the content.
+  # @yield [url] each URL found in the content.
+  # @return [String] the rewritten content, or +content+ itself when no parser is
+  #   registered for +type+ or the parser could not produce a result.
+  #
+  # @example Rewrite URLs in HTML code
+  #   content = '<html><a href="index.htm">Home</a></html>'
+  #   content = ContentUrls.rewrite_each_url(content, 'text/html') {|url| 'gone.html'}
+  #   puts "Rewritten: #{content}"
+  #   # => "Rewritten: <html><a href="gone.html">Home</a></html>"
+  #
+  def self.rewrite_each_url(content, type, &block)
+    parser = get_parser(type)
+    return content unless parser
+    rewritten = parser.rewrite_each_url(content) do |url|
+      replacement = yield url
+      replacement.nil? ? url : replacement
+    end
+    # The parser's return value IS the rewritten content; only fall back to the input when it is nil.
+    rewritten.nil? ? content : rewritten
+  end
+  # Convert a relative URL to an absolute URL using base_url (for example, the content's original location or an HTML document's href attribute of the base tag).
+  # A trailing fragment ("#anchor") is removed from +url+ before merging.
+  #
+  # @param [String] url the (possibly relative) URL.
+  # @param [String] base_url the absolute URL that +url+ is relative to.
+  # @return [String, nil] the absolute URL, or nil when +url+ is nil.
+  #
+  # @example Obtain absolute URL of "../index.html" of page obtained from "http://example.com/folder/sample.html"
+  #   puts ContentUrls.to_absolute("../index.html", "http://example.com/folder/sample.html")
+  #   # => "http://example.com/index.html"
+  #
+  def self.to_absolute(url, base_url)
+    return nil if url.nil?
+    without_anchor = url.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')  # remove anchor
+    # URI.encode/URI.decode were removed in Ruby 3.0; the parser object offers the same escaping.
+    uri_parser = URI::DEFAULT_PARSER
+    normalized = uri_parser.escape(uri_parser.unescape(without_anchor))
+    absolute = URI(base_url).merge(normalized)
+    absolute.path = '/' if absolute.path.empty?
+    absolute.to_s
+  end
+  # NOTE: the methods below are public class methods (a bare +protected+ does not apply to
+  # +def self.+ definitions) and the specs call them directly.
+  @@type_parser = Hash.new { |hash, key| hash[key] = [] }  # mapping of type regex to parser class
+  # Register a parser implementation class for one or more content type regular expressions
+  def self.register_parser(parser_class, *type_regexes)
+    type_regexes.each do |regex|
+      @@type_parser[regex].push parser_class
+    end
+  end
+  # Return parser for a file type or nil if content type not recognized.
+  # When several parsers are registered for the matching regex, the first one registered wins.
+  def self.get_parser(type)
+    @@type_parser.each_pair do |regex, parsers|
+      return parsers.first if type =~ regex
+    end
+    nil
+  end
+  # Parser implementations
+  # - each implementation's urls method should return unique URLs
+  require 'content_urls/parsers/html_parser'
+  # The "+" in the XHTML media type must be escaped, otherwise it acts as a regex quantifier.
+  register_parser ContentUrls::HtmlParser, %r{^(text/html)\b}, %r{^(application/xhtml\+xml)\b}
+  require 'content_urls/parsers/css_parser'
+  register_parser ContentUrls::CssParser, %r{^(text/css)\b}
+  require 'content_urls/parsers/java_script_parser'
+  register_parser ContentUrls::JavaScriptParser, %r{^(application/x-javascript)\b}, %r{^(application/javascript)\b}, %r{^(text/javascript)\b}
+end

data/lib/content_urls/parsers/css_parser.rb ADDED

@@ -0,0 +1,126 @@
+class ContentUrls
+  # +CssParser+ finds and rewrites URLs in CSS content.
+  #
+  # === Implementation note:
+  # The methods in this class identify URLs by using regular expressions based on the W3C CSS 2.1 Specification (http://www.w3.org/TR/CSS21/syndata.html).
+  class CssParser
+    # Capture groups of the +uri+ regex that hold the URL text, in order:
+    # double-quoted string body, single-quoted string body, unquoted URL.
+    URL_GROUPS = [7, 14, 23].freeze
+    # Returns the URLs found in the CSS content.
+    #
+    # @param [String] content the CSS content.
+    # @return [Array] the unique URLs found in the content.
+    #
+    # @example Parse CSS code for URLs
+    #   css = 'body { background: url(/images/rainbows.jpg) }'
+    #   ContentUrls::CssParser.urls(css).each do |url|
+    #     puts "Found URL: #{url}"
+    #   end
+    #   # => "Found URL: /images/rainbows.jpg"
+    def self.urls(content)
+      # String#scan yields the capture groups as a zero-based array, hence the "- 1".
+      found = content.scan(@@regex_uri).map do |groups|
+        URL_GROUPS.map { |index| groups[index - 1] }.compact.first
+      end
+      found.uniq
+    end
+    # Rewrites each URL in the CSS content by calling the supplied block with each URL.
+    # A block result of nil leaves that url() token untouched.
+    #
+    # @param [String] content the CSS content.
+    # @yield [url] each URL found, without the surrounding url( ) and quotes.
+    # @return [String] the rewritten CSS content.
+    #
+    # @example Rewrite URLs in CSS code
+    #   css = 'body { background: url(/images/rainbows.jpg) }'
+    #   css = ContentUrls::CssParser.rewrite_each_url(css) {|url| url.sub(/rainbows.jpg/, 'unicorns.jpg')}
+    #   puts "Rewritten: #{css}"
+    #   # => "Rewritten: body { background: url(/images/unicorns.jpg) }"
+    #
+    def self.rewrite_each_url(content, &block)
+      content.gsub(@@regex_uri) do
+        match = Regexp.last_match
+        token = match[0]
+        group = URL_GROUPS.find { |index| match[index] }
+        replacement = yield match[group]
+        next token if replacement.nil?
+        # Splice by capture offsets rather than String#sub, so the "url(" prefix is never
+        # mistaken for the URL and backslashes in the replacement are kept literally.
+        from = match.begin(group) - match.begin(0)
+        to = match.end(group) - match.begin(0)
+        token[0...from] + replacement.to_s + token[to..-1]
+      end
+    end
+    # Regular expressions based on http://www.w3.org/TR/CSS21/syndata.html
+    # {w}:  [ \t\r\n\f]*
+    @@w = '([ \t\r\n\f]*)'
+    # {nl}:  \n|\r\n|\r|\f
+    @@nl = '(\n|\r\n|\r|\f)'
+    # {unicode}:    \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
+    @@unicode = '(\\\\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?)'
+    # {escape}:       {unicode}|\\[^\n\r\f0-9a-f]
+    @@escape = '(' + @@unicode + '|\\\\[^\n\r\f0-9a-f])'
+    # {string1}:  \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
+    @@string1 = '(\"(([^\n\r\f\\\\"]|\\\\' + @@nl + '|' + @@escape + ')*)\")'
+    # {string2}:    \'([^\n\r\f\\']|\\{nl}|{escape})*\'
+    @@string2 = '(\\\'(([^\n\r\f\\\\\']|\\\\' + @@nl + '|' + @@escape + ')*)\\\')'
+    # {string}:       {string1}|{string2}
+    @@string = '(' + @@string1 + '|' + @@string2 + ')'
+    # {nonascii}:  [^\0-\237]
+    # The spec writes the upper bound in octal. Everything above 7-bit ASCII is accepted here so
+    # the pattern stays free of non-ASCII escapes and therefore usable on strings of any encoding.
+    # Only hex escapes below 0x80 are used; the earlier '\x237' was read as '#' followed by '7'.
+    @@nonascii = '([^\x00-\x7F])'
+    # {uri}:    url\({w}{string}{w}\)|url\({w}([!#$%&*-\[\]-~]|{nonascii}|{escape})*{w}\)
+    @@uri = '(((url\(' + @@w + @@string + @@w + '\))|(url\(' + @@w + '(([!#$%&*-\[\]-~]|' + @@nonascii + '|' + @@escape + ')*)' + @@w + '\))))'
+    # {badstring1}:  \"([^\n\r\f\\"]|\\{nl}|{escape})*\\?
+    @@badstring1 = '(\"([^\n\r\f\\\\"]|\\\\' + @@nl + '|' + @@escape + ')*\\\\?)'
+    # {badstring2}:    \'([^\n\r\f\\']|\\{nl}|{escape})*\\?
+    @@badstring2 = '(\\\'([^\n\r\f\\\\\']|\\\\' + @@nl + '|' + @@escape + ')*\\\\?)'
+    # {badstring}:      {badstring1}|{badstring2}
+    @@badstring = '(' + @@badstring1 + '|' + @@badstring2 + ')'
+    # {baduri1}:  url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}
+    @@baduri1 = '(url\(' + @@w + '([!#$%&*-~]|' + @@nonascii + '|' + @@escape + ')*' + @@w + ')'
+    # {baduri2}:  url\({w}{string}{w}
+    @@baduri2 = '(url\(' + @@w + @@string + @@w + ')'
+    # {baduri3}:  url\({w}{badstring}
+    @@baduri3 = '(url\(' + @@w + @@badstring + ')'
+    # {baduri}:       {baduri1}|{baduri2}|{baduri3}
+    @@baduri = '(' + @@baduri1 + '|' + @@baduri2 + '|' + @@baduri3 + ')'
+    @@regex_uri = Regexp.new(@@uri)
+    # Compiled for completeness; the methods above do not consult it.
+    @@regex_baduri = Regexp.new(@@baduri)
+  end
+end

data/lib/content_urls/parsers/html_parser.rb ADDED

@@ -0,0 +1,150 @@
+require 'nokogiri'
+class ContentUrls
+  # +HtmlParser+ finds and rewrites URLs in HTML content.
+  #
+  # === Implementation note:
+  # The methods in this class use Nokogiri to identify URLs.  Nokogiri cleans HTML code when rewriting, so expect some changes to rewritten content.
+  class HtmlParser
+    # Returns the URLs found in the HTML content.
+    #
+    # @param [String] content the HTML content.
+    # @return [Array] the unique URLs found in the content; empty when the content cannot be parsed.
+    #
+    # @example Parse HTML code for URLs
+    #   html = '<html><a href="index.htm">Click me</a></html>'
+    #   ContentUrls::HtmlParser.urls(html).each do |url|
+    #     puts "Found URL: #{url}"
+    #   end
+    #   # => "Found URL: index.htm"
+    #
+    def self.urls(content)
+      found = []
+      # Walk the document with a block that hands every URL back unchanged; when the content
+      # cannot be parsed, rewrite_each_url returns nil without yielding and the list stays empty.
+      rewrite_each_url(content) { |url| found << url; url }
+      found.uniq
+    end
+    # Rewrites each URL in the HTML content by calling the supplied block with each URL.
+    # A block result of nil, or one equal to the original URL, leaves that URL untouched.
+    #
+    # @param [String] content the HTML content.
+    # @yield [url] each URL found in the content.
+    # @return [String, nil] the document as serialized by Nokogiri, or nil when +content+ is nil
+    #   or cannot be parsed.
+    #
+    # @example Rewrite URLs in HTML code
+    #   html = '<html><a href="index.htm">Click me</a></html>'
+    #   html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'}
+    #   puts "Rewritten: #{html}"
+    #   # => "Rewritten: <html><a href="index.php">Click me</a></html>"
+    #
+    def self.rewrite_each_url(content, &block)
+      doc = begin
+        content && Nokogiri::HTML(content)
+      rescue StandardError
+        nil
+      end
+      return nil unless doc
+      # TODO: handle href attribute of base tag
+      #  - should href URL be changed?
+      #  - should relative URLs be modified using base?
+      #  - how should rewritten relative URLs be handled?
+      @@parser_definition.each do |_type, definition|
+        attribute = definition[:attribute]
+        doc.search(definition[:xpath]).each do |obj|
+          # use tag attribute if provided, otherwise the node's text content
+          value = attribute ? obj[attribute] : obj.content
+          next if value.nil? || value.strip.empty?
+          if definition.has_key?(:parser)  # embedded CSS/JavaScript: delegate to that parser
+            parser = ContentUrls.get_parser(definition[:parser])
+            next if parser.nil?
+            rewritten = parser.rewrite_each_url(value) do |url|
+              replacement = yield url
+              replacement.nil? ? url : replacement.to_s
+            end
+            next if rewritten.nil? || rewritten == value
+            # Store the rewritten text back into the document; otherwise it would be lost.
+            if attribute
+              obj[attribute] = rewritten
+            else
+              obj.content = rewritten
+            end
+          elsif attribute  # rewrite the URL within the attribute
+            if definition.has_key?(:url_regex)  # use regex to obtain URL
+              match = definition[:url_regex].match(value)
+              next unless match
+              url = yield match[:url]
+              next if url.nil? || url.to_s == match[:url]  # don't change URL
+              # Replace only the captured URL so the text around it (e.g. "5; url=") survives.
+              obj[attribute] = value[0...match.begin(:url)] + url.to_s + value[match.end(:url)..-1]
+            else  # value is the URL
+              next if value =~ /^#/  # do not capture anchors within the content being parsed
+              url = yield value
+              next if url.nil? || url.to_s == value  # don't change URL
+              obj.set_attribute(attribute, url.to_s)
+            end
+          else
+            $stderr.puts "WARNING: unable to rewrite URL for #{value.to_s}"
+          end
+        end
+      end
+      doc.to_s
+    end  # rewrite_each
+    # Where URLs live in an HTML document. Each entry has an :xpath selecting the nodes, plus:
+    #   :attribute  the attribute holding the value (when absent the node's content is used)
+    #   :url_regex  extracts the URL from an attribute value through a named capture :url
+    #   :parser     media type of the parser used to find URLs inside the value
+    @@parser_definition = {
+      a_href: {
+        xpath: "//a[@href]",
+        attribute: 'href'
+      },
+      area_href: {
+        xpath: "//area[@href]",
+        attribute: 'href'
+      },
+      body_background: {
+        xpath: "//body[@background]",
+        attribute: 'background'
+      },
+      embed_src: {
+        xpath: "//embed[@src]",
+        attribute: 'src'
+      },
+      img_src: {
+        xpath: "//img[@src]",
+        attribute: 'src'
+      },
+      link_href: {
+        xpath: "//link[@href]",
+        attribute: 'href'
+      },
+      meta_content: {
+        xpath: "//meta[((@http-equiv='location') or (@http-equiv='refresh'))
+          and @content and contains(@content,';')
+          and number(substring-before(@content,';'))=substring-before(@content,';')]",
+        attribute: 'content',
+        url_regex: %r{^\d+\s*;\s*url\s*=\s*(?<quote>['"]?)(?<url>[^'"]+)\k<quote>$}i  # must return named capture of :url containing URL
+      },
+      object_data: {
+        xpath: "//object[@data]",
+        attribute: 'data'
+      },
+      script_src: {
+        xpath: "//script[@src]",
+        attribute: 'src'
+      },
+      style_attribute: {
+        xpath: "//*[@style]",
+        attribute: 'style',
+        parser: 'text/css'
+      },
+      style_tag: {
+        xpath: "//style",
+        parser: 'text/css'
+      },
+      javascript: {
+        xpath: "//script[(@type='application/javascript')
+          or (@type='text/javascript')
+          or (@language='javascript')]/text()",
+        parser: 'application/x-javascript'
+       }
+    }
+  end
+end

data/lib/content_urls/parsers/java_script_parser.rb ADDED

@@ -0,0 +1,64 @@
+require 'uri'
+class ContentUrls
+  # +JavaScriptParser+ finds and rewrites URLs in JavaScript content.
+  #
+  # === Implementation note:
+  # The methods in this class identify URLs by locating strings which match +URI+'s regexp.
+  class JavaScriptParser
+    # Returns the URLs found in the JavaScript content.
+    #
+    # @param [String] content the JavaScript content.
+    # @return [Array] the unique URLs found in the content.
+    #
+    # @example Parse JavaScript code for URLs
+    #   javascript = 'var link="http://example.com/"'
+    #   ContentUrls::JavaScriptParser.urls(javascript).each do |url|
+    #     puts "Found URL: #{url}"
+    #   end
+    #   # => "Found URL: http://example.com/"
+    def self.urls(content)
+      URI.extract(content).uniq
+    end
+    # Rewrites each URL in the JavaScript content by calling the supplied block with each URL.
+    # Every URL is yielded exactly once; a block result of nil leaves that URL unchanged.
+    #
+    # @param [String] content the JavaScript content.
+    # @yield [url] each URL found in the content.
+    # @return [String] the rewritten JavaScript content.
+    #
+    # @example Rewrite URLs in JavaScript code
+    #   javascript = 'var link="http://example.com/"'
+    #   javascript = ContentUrls::JavaScriptParser.rewrite_each_url(javascript) {|url| url.upcase}
+    #   puts "Rewritten: #{javascript}"
+    #   # => "Rewritten: var link="HTTP://EXAMPLE.COM/""
+    #
+    def self.rewrite_each_url(content, &block)
+      # Scanning resumes after each complete match, so an unchanged URL is not re-examined
+      # one character further on and reported again as a shorter pseudo-URL.
+      content.gsub(URI.regexp) do |url|
+        replacement = yield url
+        replacement.nil? ? url : replacement.to_s
+      end
+    end
+  end
+end

data/lib/content_urls/version.rb ADDED

@@ -0,0 +1,3 @@
+class ContentUrls
+  # Version string of the content_urls library.
+  VERSION = '0.1.0'
+end

data/spec/content_urls_spec.rb ADDED

@@ -0,0 +1,29 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+# Each group is labelled with the method under test. Passing the *result* of a method call to
+# +describe+ would label the group with nil, a URL string or an Array instead.
+describe ContentUrls, '.to_absolute' do
+  it "returns nil when url is nil" do
+    ContentUrls.to_absolute(nil, 'http://www.sample.com/').should eq nil
+  end
+  it "merges url to base_url" do
+    ContentUrls.to_absolute('index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/three/index.html'
+    ContentUrls.to_absolute('/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/index.html'
+    ContentUrls.to_absolute('/four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/four/index.html'
+    ContentUrls.to_absolute('../index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/index.html'
+    ContentUrls.to_absolute('../four/index.html', 'http://www.sample.com/one/two/three/').should eq 'http://www.sample.com/one/two/four/index.html'
+  end
+end
+describe ContentUrls, '.get_parser' do
+  it "returns nil when content type is unknown" do
+    ContentUrls.get_parser('bogus/bogus').should eq nil
+  end
+end
+describe ContentUrls, '.register_parser' do
+  it "returns the class for the content type" do
+    # Register inside the example so the registration is part of the test, not a load-time side effect.
+    ContentUrls.register_parser('some_parser_class', %r{^(content/test)\b})
+    ContentUrls.get_parser('content/test').should eq 'some_parser_class'
+  end
+end

data/spec/css_parser_spec.rb ADDED

@@ -0,0 +1,34 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+# Examples for ContentUrls::CssParser: empty and garbage input, URL extraction, and the
+# sample code shown in the parser's documentation.
+describe ContentUrls::CssParser do
+  it "should return no URLs given no content" do
+    found = ContentUrls::CssParser.urls('')
+    found.should eq []
+  end
+  it "should return no URLs given garbage content" do
+    found = ContentUrls::CssParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj  kladjsf lkfjalkdfj lkajdf9458094djjf')
+    found.should eq []
+  end
+  it "should return the URLs in the content" do
+    found = ContentUrls::CssParser.urls("body {background-image:url('image.png');}")
+    found.first.should eq 'image.png'
+  end
+  it "should execute the sample code for rewrite_each_url method" do
+    stylesheet = 'body { background: url(/images/rainbows.jpg) }'
+    stylesheet = ContentUrls::CssParser.rewrite_each_url(stylesheet) {|url| url.sub(/rainbows.jpg/, 'unicorns.jpg')}
+    printed = "Rewritten: #{stylesheet}" + "\n"
+    printed.should eq %Q{Rewritten: body { background: url(/images/unicorns.jpg) }\n}
+  end
+  it "should execute sample code for urls method" do
+    stylesheet = 'body { background: url(/images/rainbows.jpg) }'
+    lines = ContentUrls::CssParser.urls(stylesheet).map { |url| "Found URL: #{url}" + "\n" }
+    lines.join.should eq %Q{Found URL: /images/rainbows.jpg\n}
+  end
+end

data/spec/html_parser_spec.rb ADDED

@@ -0,0 +1,318 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+describe ContentUrls::HtmlParser do
+  it "should return no URLs when given no content" do
+    ContentUrls::HtmlParser.urls('').should eq []
+  end
+  it "should return no URLs when given garbage content" do
+    ContentUrls::HtmlParser.urls('j;alksdjfkladsjflkajdfaksdjfsdj  kladjsf lkfjalkdfj lkajdf9458094djjf').should eq []
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should return the URLs in the content" do
+    ContentUrls::HtmlParser.urls("<a href='index.html").first.should eq 'index.html'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 1 and return all a links" do
+html_sample_1 =<<SAMPLE_1
+<html>
+<head>
+  <title>HTML Sample 1</title>
+</head>
+<body>
+  <h1>HTML Sample 1</h1>
+  <a href="a-href-link-1.html"></a>
+  <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
+  <a href="/folder/a-href-link-3.html?a=1"></a>
+</body>
+</html>
+SAMPLE_1
+    urls = ContentUrls::HtmlParser.urls(html_sample_1)
+    urls.include?('a-href-link-1.html').should eq true
+    urls.include?('http://www.example.com/1/2/3/a-href-link-2.html').should eq true
+    urls.include?('/folder/a-href-link-3.html?a=1').should eq true
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 1 and rewrite all a links" do
+html_sample_1 =<<SAMPLE_1
+<html>
+<head>
+  <title>HTML Sample 1</title>
+</head>
+<body>
+  <h1>HTML Sample 1</h1>
+  <a href="a-href-link-1.html"></a>
+  <a href="http://www.example.com/1/2/3/a-href-link-2.html"></a>
+  <a href="/folder/a-href-link-3.html?a=1"></a>
+</body>
+</html>
+SAMPLE_1
+    content = ContentUrls::HtmlParser.rewrite_each_url(html_sample_1) do |url|
+      url = URI.parse url
+      url.path = url.path.sub(/\.html\b/, '.php')
+      url
+    end
+    urls = ContentUrls::HtmlParser.urls(content)
+    urls.include?('a-href-link-1.php').should eq true
+    urls.include?('http://www.example.com/1/2/3/a-href-link-2.php').should eq true
+    urls.include?('/folder/a-href-link-3.php?a=1').should eq true
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 2 and return all 'area href' URLs" do
+html_sample_2 =<<SAMPLE_2
+<html>
+<head>
+  <title>HTML Sample 2</title>
+</head>
+<body>
+  <h1>HTML Sample 2</h1>
+  <img src="sample.gif" width="200" height="200" alt="Click somewhere" usemap="#sample-map">
+  <map name="sample-map">
+    <area shape="rect" coords="0,0,100,100" href="area-href-link-1.html" alt="link 1">
+    <area shape="circle" coords="150,150,2" href="http://www.example.com/1/2/3/area-href-link-2.html" alt="link 2">
+    <area shape="circle" coords="100,180,1" href="/folder/area-href-link-3.html?a=1" alt="link 3">
+  </map>
+</body>
+</html>
+SAMPLE_2
+    urls = ContentUrls::HtmlParser.urls(html_sample_2)
+    urls.include?('area-href-link-1.html').should eq true
+    urls.include?('http://www.example.com/1/2/3/area-href-link-2.html').should eq true
+    urls.include?('/folder/area-href-link-3.html?a=1').should eq true
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 3 and return 'body background' URL" do
+html_sample_3 =<<SAMPLE_3
+<html>
+<head>
+  <title>HTML Sample 3</title>
+</head>
+<body background="/images/background.png">
+  <h1>HTML Sample 3</h1>
+</body>
+</html>
+SAMPLE_3
+    urls = ContentUrls::HtmlParser.urls(html_sample_3)
+    urls.first.should eq '/images/background.png'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 4 and return 'embed src' URL" do
+html_sample_4 =<<SAMPLE_4
+<html>
+<head>
+  <title>HTML Sample 4</title>
+</head>
+<body>
+  <h1>HTML Sample 4</h1>
+  <embed src="sample.swf" />
+</body>
+</html>
+SAMPLE_4
+    urls = ContentUrls::HtmlParser.urls(html_sample_4)
+    urls.first.should eq 'sample.swf'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 5 and return 'img src' URL" do
+html_sample_5 =<<SAMPLE_5
+<html>
+<head>
+  <title>HTML Sample 5</title>
+</head>
+<body>
+  <h1>HTML Sample 5</h1>
+  <img src="sample.gif">
+</body>
+</html>
+SAMPLE_5
+    urls = ContentUrls::HtmlParser.urls(html_sample_5)
+    urls.first.should eq 'sample.gif'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 6 and return 'link href' URL" do
+html_sample_6 =<<SAMPLE_6
+<html>
+<head>
+  <title>HTML Sample 6</title>
+  <link href="/index.php" REL="index">
+</head>
+<body>
+  <h1>HTML Sample 6</h1>
+</body>
+</html>
+SAMPLE_6
+    urls = ContentUrls::HtmlParser.urls(html_sample_6)
+    urls.first.should eq '/index.php'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 7 and return 'object data' URL" do
+html_sample_7 =<<SAMPLE_7
+<html>
+<head>
+  <title>HTML Sample 7</title>
+</head>
+<body>
+  <h1>HTML Sample 7</h1>
+  <object width="400" height="400" data="/stuff/example.swf"></object>
+</body>
+</html>
+SAMPLE_7
+    urls = ContentUrls::HtmlParser.urls(html_sample_7)
+    urls.first.should eq '/stuff/example.swf'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 8 and return 'script src' URL" do
+html_sample_8 =<<SAMPLE_8
+<html>
+<head>
+  <title>HTML Sample 8</title>
+</head>
+<body>
+  <h1>HTML Sample 8</h1>
+  <script language="javascript" src="../scripts/go.js"></script>
+</body>
+</html>
+SAMPLE_8
+    urls = ContentUrls::HtmlParser.urls(html_sample_8)
+    urls.first.should eq '../scripts/go.js'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 9 and return 'meta content' URL" do
+html_sample_9 =<<SAMPLE_9
+<html>
+<head>
+  <title>HTML Sample 9</title>
+  <meta http-equiv="refresh" content="5;URL='http://example.com/'">
+</head>
+<body>
+  <h1>HTML Sample 9</h1>
+</body>
+</html>
+SAMPLE_9
+    urls = ContentUrls::HtmlParser.urls(html_sample_9)
+    urls.first.should eq 'http://example.com/'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 10 and return URLs found within 'style' attributes" do
+html_sample_10 =<<SAMPLE_10
+<html>
+<head>
+  <title>HTML Sample 10</title>
+</head>
+<body style="background-image:url('background.jpg');">
+  <h1>HTML Sample 10</h1>
+</body>
+</html>
+SAMPLE_10
+    urls = ContentUrls::HtmlParser.urls(html_sample_10)
+    urls.first.should eq 'background.jpg'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 11 and return URLs found within 'style' tags" do
+html_sample_11 =<<SAMPLE_11
+<html>
+<head>
+  <title>HTML Sample 11</title>
+  <style type="text/css">
+body {background-image:url('/image/background.jpg');}
+  </style>
+</head>
+<body>
+  <h1>HTML Sample 11</h1>
+</body>
+</html>
+SAMPLE_11
+    urls = ContentUrls::HtmlParser.urls(html_sample_11)
+    urls.first.should eq '/image/background.jpg'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should parse HTML Sample 12 and return URLs found within 'script' tags" do
+html_sample_12 =<<SAMPLE_12
+<html>
+<head>
+  <title>HTML Sample 12</title>
+<script type="text/javascript">
+var link="http://www.sample.com/index.html"
+// ...
+</script>
+</head>
+<body>
+  <h1>HTML Sample 12</h1>
+</body>
+</html>
+SAMPLE_12
+    urls = ContentUrls::HtmlParser.urls(html_sample_12)
+    urls.first.should eq 'http://www.sample.com/index.html'
+  end
+end
+describe ContentUrls::HtmlParser do
+  it "should execute the sample code for rewrite_each_url method" do
+    #output = ''
+    html = '<html><a href="index.htm">Click me</a></html>'
+    html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'}
+    #output += "Rewritten: #{html}" + "\n"
+    #output.should eq %Q{Rewritten: <html><a href="index.php">Click me</a></html>\n}
+    ContentUrls::HtmlParser.urls(html).first.should eq 'index.php'  # Nokogiri rewrites HTML, instead check rewritten URL
+  end
+  it "should execute sample code for urls method" do
+    output = ''
+    html = '<html><a href="index.htm">Click me</a></html>'
+    ContentUrls::HtmlParser.urls(html).each do |url|
+      output += "Found URL: #{url}" + "\n"
+    end
+    output.should eq %Q{Found URL: index.htm\n}
+  end
+end

data/spec/java_script_parser_spec.rb ADDED

@@ -0,0 +1,31 @@
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+describe ContentUrls::JavaScriptParser do
+  it "should return no URLs given no content" do
+    ContentUrls::JavaScriptParser.urls('').should eq []
+  end
+end
+describe ContentUrls::JavaScriptParser do
+  it "should return the URLs in the content" do
+    ContentUrls::JavaScriptParser.urls('var link="http://www.sample.com/index.html"').first.should eq 'http://www.sample.com/index.html'
+  end
+end
+describe ContentUrls::JavaScriptParser do
+  it "should execute the sample code for rewrite_each_url method" do
+    output = ''
+    javascript = 'var link="http://example.com/"'
+    javascript = ContentUrls::JavaScriptParser.rewrite_each_url(javascript) {|url| url.upcase}
+    output += "Rewritten: #{javascript}" + "\n"
+    output.should eq %Q{Rewritten: var link="HTTP://EXAMPLE.COM/"\n}
+  end
+  it "should execute sample code for urls method" do
+    output = ''
+    javascript = 'var link="http://example.com/"'
+    ContentUrls::JavaScriptParser.urls(javascript).each do |url|
+      output += "Found URL: #{url}" + "\n"
+    end
+    output.should eq %Q{Found URL: http://example.com/\n}
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,12 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'rspec'
+require 'content_urls'
+# Requires supporting files with custom matchers and macros, etc,
+# in ./support/ and its subdirectories.
+Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
+RSpec.configure do |config|
+end

metadata ADDED

@@ -0,0 +1,195 @@
+--- !ruby/object:Gem::Specification
+name: content_urls
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Dennis Sutch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-10-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.8.0
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.12'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '3.12'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.8.4
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.8.4
+- !ruby/object:Gem::Dependency
+  name: rcov
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.9.2.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.9.2.2
+description: Parses various file types (HTML, CSS, JavaScript, ...) for URLs and provides
+  methods for iterating through URLs and changing URLs.
+email: dennis@sutch.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- .rspec
+- Gemfile
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- content_urls.gemspec
+- lib/content_urls.rb
+- lib/content_urls/parsers/css_parser.rb
+- lib/content_urls/parsers/html_parser.rb
+- lib/content_urls/parsers/java_script_parser.rb
+- lib/content_urls/version.rb
+- spec/content_urls_spec.rb
+- spec/css_parser_spec.rb
+- spec/html_parser_spec.rb
+- spec/java_script_parser_spec.rb
+- spec/spec_helper.rb
+homepage: http://github.com/sutch/content_urls
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Find and rewrite URLs in different types of content.
+test_files: []
+has_rdoc: