RubyGems - url_parser - Versions diffs - 0.4.0 → 0.5.0 - Mend

url_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/.travis.yml +7 -0
data/CHANGELOG.md +20 -0
data/Gemfile +4 -0
data/Guardfile +40 -7
data/LICENSE.txt +1 -1
data/README.md +301 -5
data/Rakefile +5 -0
data/lib/url_parser.rb +93 -286
data/lib/url_parser/db.yml +77 -0
data/lib/url_parser/domain.rb +102 -0
data/lib/url_parser/model.rb +233 -0
data/lib/url_parser/option_setter.rb +47 -0
data/lib/url_parser/parser.rb +206 -0
data/lib/url_parser/uri.rb +206 -0
data/lib/url_parser/version.rb +1 -1
data/spec/spec_helper.rb +83 -6
data/spec/support/.gitkeep +0 -0
data/spec/support/helpers.rb +7 -0
data/spec/url_parser/domain_spec.rb +163 -0
data/spec/url_parser/model_spec.rb +426 -0
data/spec/url_parser/option_setter_spec.rb +71 -0
data/spec/url_parser/parser_spec.rb +515 -0
data/spec/url_parser/uri_spec.rb +570 -0
data/spec/url_parser_spec.rb +93 -387
data/url_parser.gemspec +5 -6
metadata +39 -29

data/lib/url_parser.rb CHANGED

@@ -1,321 +1,128 @@
-require "url_parser/version"
-require "domainatrix"
-require "postrank-uri"
+require "yaml"
+require "gem_config"
 require "addressable/uri"
-require "digest/sha1"
-class Array
-  def self.wrap(object)
-    if object.nil?
-      []
-    elsif object.respond_to?(:to_ary)
-      object.to_ary || [object]
-    else
-      [object]
-    end
-  end unless respond_to?(:wrap)
-end
+require "url_parser/version"
+require "url_parser/option_setter"
+require "url_parser/domain"
+require "url_parser/model"
+require "url_parser/parser"
+require "url_parser/uri"
 module UrlParser
+  include GemConfig::Base
-  # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
-  SCHEMES = [
-    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https',
-    'imap', 'magnet', 'mailto', 'mms', 'news', 'nntp', 'prospero',
-    'rsync', 'rtsp', 'rtspu', 'sftp', 'shttp', 'sip', 'sips',
-    'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
-    # Unofficial schemes
-    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk',
-    'irc', 'ircs', 'irc6', 'itms', 'mms', 'msnim', 'mvn', 'skype',
-    'ssh', 'smb', 'svn', 'ymsg', 'webcal'
-  ]
-  DEFAULT_SCHEMES = [
-    'http', 'https', 'ftp', 'mailto', 'file', 'ssh', 'feed',
-    'cvs', 'git', 'mvn', 'nntp', 'shttp', 'svn', 'webcal'
-  ]
-  module Error; end
-  class InvalidScheme
-    include UrlParser::Error
+  with_configuration do
+    has :default_scheme, classes: [ String, NilClass ], default: 'http'
+    has :scheme_map, classes: Hash, default: Hash.new
+    has :embedded_params, classes: Array, default: %w(u url)
   end
-  def self.call(text, options = {})
-    urls = []
-    PostRank::URI.extract(text).each do |url|
-      urls << new(url, options)
-    end
-    urls
-  end
+  module Error; end
-  def self.new(url, options = {})
-    Base.new(url, options)
+  class LibraryError < StandardError
+    include Error
   end
-  class Base
-    attr_reader :url, :original_url, :raise_errors
+  RequiresAddressableURI  = Class.new(LibraryError)
+  RequiresUrlParserDomain = Class.new(LibraryError)
-    attr_accessor :errors
+  DB = YAML.load_file(File.join(File.dirname(__FILE__), '/url_parser/db.yml'))
-    def initialize(url, options = {})
-      @schemes        = options.fetch(:schemes) { UrlParser::DEFAULT_SCHEMES }
-      @clean          = options.fetch(:clean) { false }
-      @raise_errors   = options.fetch(:raise_errors) { false }
-      @errors         = []
-      @original_url   = url
-      @url            = @clean ? clean(url) : parse(url)
-      prepare
-    end
-    def schemes
-      Array.wrap(@schemes)
-    end
-    def clean!
-      @parser = nil
-      @url = clean(url)
-      @clean = true
-      self
-    end
+  def self.new(uri, options = {})
+    warn "[DEPRECATION] `.new` is deprecated. Please use `.parse` instead."
+    parse(uri, options)
+  end
-    def parser
-      tag_errors do
-        @parser ||= Domainatrix.parse(to_s)
+  module_function
+  # Encode a string
+  #
+  # Adapted from ERB::Util.url_encode
+  #
+  def escape(uri, options = {})
+    uri.to_s.dup
+      .force_encoding(Encoding::ASCII_8BIT)
+      .gsub(/[^a-zA-Z0-9_\-.]/n) do
+        sprintf("%%%02X", Regexp.last_match[0].unpack("C")[0])
       end
-    end
-    def to_s
-      return '' if errors.any?
-      url.to_s
-    end
+  end
-    def hash(options = {})
-      return nil if errors.any?
-      clean = options.fetch(:clean) { nil }
-      if clean.nil?
-        Digest::SHA1.hexdigest(url.to_s)
-      else
-        Digest::SHA1.hexdigest(
-          clean ? clean(original_url) : parse(original_url)
+  # Decode a string
+  #
+  # Adapted from CGI::unescape
+  #
+  # See also http://tools.ietf.org/html/rfc3986#section-2.3
+  #
+  def unescape(uri, options = {})
+    encoding = options.fetch(:encoding) { Encoding::UTF_8 }
+    query_spaces = proc do
+      if Regexp.last_match[6]
+        Regexp.last_match[0].sub(
+          Regexp.last_match[6],
+          Regexp.last_match[6].tr('+', ' ')
         )
-      end
-    end
-    def valid?
-      errors.empty?
-    end
-    def join(relative_path)
-      return nil if errors.any?
-      UrlParser.new(
-        Addressable::URI.join(url, relative_path).to_s
-      )
-    end
-    # URI Components
-    def scheme
-      return nil if errors.any?
-      url.scheme
-    end
-    def username
-      return nil if errors.any?
-      url.user
-    end
-    alias_method :user, :username
-    def password
-      return nil if errors.any?
-      url.password
-    end
-    def userinfo
-      return nil if errors.any?
-      url.userinfo
-    end
-    def www
-      return nil if errors.any?
-      return nil if parser.subdomain.empty?
-      parts = slice_domain.split('.')
-      parts.first =~ /www?\d*/ ? parts.shift : nil
-    end
-    def subdomain
-      return nil if errors.any?
-      return nil if parser.subdomain.empty?
-      parts = slice_domain.split('.')
-      parts.shift if parts.first =~ /www?\d*/
-      parts.compact.join('.')
-    end
-    def subdomains
-      return nil if errors.any?
-      return nil if parser.subdomain.empty?
-      [ www, subdomain ].compact.join('.')
-    end
-    def domain_name
-      return nil if errors.any?
-      parser.domain.empty? ? nil : parser.domain
-    end
-    def domain
-      return nil if errors.any?
-      if parser.domain_with_public_suffix.empty?
-        nil
       else
-        parser.domain_with_public_suffix
+        Regexp.last_match[0]
       end
     end
-    def tld
-      return nil if errors.any?
-      tld = parser.public_suffix
-      tld.empty? ? nil : tld
-    end
-    def hostname
-      return nil if errors.any?
-      url.host
-    end
-    def port
-      return nil if errors.any?
-      url.port
-    end
-    def host
-      return nil if errors.any?
-      name = [ hostname, port ].compact.join(':')
-      name.empty? ? nil : name
-    end
-    def origin
-      return nil if errors.any?
-      url.origin == "null" ? nil : url.origin
-    end
-    def authority
-      return nil if errors.any?
-      url.authority
-    end
-    def site
-      return nil if errors.any?
-      url.site
-    end
-    def directory
-      return nil if errors.any?
-      parts = path.split('/')
-      return '/' if parts.empty?
-      parts.pop unless segment.to_s.empty?
-      parts.unshift('') unless parts.first.to_s.empty?
-      parts.compact.join('/')
-    end
-    def path
-      return nil if errors.any?
-      url.path
-    end
-    def segment
-      return nil if errors.any?
-      path =~ /\/\z/ ? nil : path.split('/').last
-    end
-    def filename
-      return nil if errors.any?
-      return 'index.html' if segment.to_s.empty?
-      return '' if suffix.to_s.empty?
-      segment
-    end
-    def suffix
-      return nil if errors.any?
-      ext = File.extname(path)
-      ext[0] = '' if ext[0] == '.'
-      ext.empty? ? nil : ext
-    end
-    def query
-      return nil if errors.any?
-      url.query
-    end
-    def query_values
-      return {} if errors.any?
-      url.query_values.to_h
+    decode_chars = proc do
+      [Regexp.last_match[1].delete('%')].pack('H*')
     end
-    def fragment
-      return nil if errors.any?
-      url.fragment
-    end
+    string = uri.to_s
-    def resource
-      return nil if errors.any?
-      name = [
-        [ segment, query ].compact.join('?'), fragment
-      ].compact.join('#')
-      name.empty? ? nil : name
-    end
+    str = string.dup
+      .gsub(Addressable::URI::URIREGEX, &query_spaces)
+      .force_encoding(Encoding::ASCII_8BIT)
+      .gsub(/((?:%[0-9a-fA-F]{2})+)/, &decode_chars)
+      .force_encoding(encoding)
-    def relative?
-      return nil if errors.any?
-      url.relative?
-    end
-    def absolute?
-      return nil if errors.any?
-      url.absolute?
-    end
+    str.valid_encoding? ? str : str.force_encoding(string.encoding)
+  end
-    def localhost?
-      return nil if errors.any?
-      !!(hostname =~ /(\A|\.)localhost\z/)
-    end
+  def parse(uri, options = {}, &blk)
+    URI.new(uri, options, &blk)
+  end
-    private
+  def unembed(uri, options = {}, &blk)
+    URI.new(uri, options.merge(unembed: true), &blk)
+  end
-    def slice_domain
-      parser.subdomain.tap{ |s| s.slice!(domain) }
-    end
+  def canonicalize(uri, options = {}, &blk)
+    URI.new(uri, options.merge(canonicalize: true), &blk)
+  end
-    def tag_errors
-      yield
-    rescue Exception => error
-      unless error.singleton_class.include?(UrlParser::Error)
-        error.extend(UrlParser::Error)
-      end
-      @errors << error
-      raise if raise_errors
-    end
+  def normalize(uri, options = {}, &blk)
+    URI.new(uri, options.merge(normalize: true), &blk)
+  end
-    def parse(url)
-      tag_errors do
-        PostRank::URI.parse(url, raw: true)
-      end
-    end
+  def clean(uri, options = {}, &blk)
+    URI.new(uri, options.merge(clean: true), &blk)
+  end
-    def clean(url)
-      tag_errors do
-        PostRank::URI.clean(url, raw: true)
-      end
+  # Wraps its argument in an array unless it is already an array
+  #
+  # See: activesupport/lib/active_support/core_ext/array/wrap.rb, line 36
+  #
+  def wrap(object)
+    if object.nil?
+      []
+    elsif object.respond_to?(:to_ary)
+      object.to_ary || [object]
+    else
+      [object]
     end
+  end
-    # Initialize parser to ensure no errors are raised
-    #
-    def prepare
-      parser
+  def tag_errors
+    yield
+  rescue StandardError => error
+    unless error.singleton_class.include?(UrlParser::Error)
+      error.extend(UrlParser::Error)
     end
+    raise error
   end
 end

data/lib/url_parser/db.yml ADDED

@@ -0,0 +1,77 @@
+---
+:global:
+- _openstat       # Yandex openstat param
+- awesm           # awe.sm tracker
+- gclid           # Google Analytics click ID
+- mc_cid          # Mailchimp campaign unique ID
+- mc_eid          # Mailchimp campaign member email unique ID
+- PHPSESSID       # Legacy PHP session identifier
+- sms_ss          # addthis.com tracker
+- utm_campaign    # Urchin / Google Analytics campaign name
+- utm_content     # Urchin / Google Analytics campaign content
+- utm_medium      # Urchin / Google Analytics campaign medium
+- utm_nooverride  # Urchin nooverride param
+- utm_reader      # Urchin reader param
+- utm_source      # Urchin / Google Analytics campaign source
+- utm_term        # Urchin / Google Analytics campaign term
+- utm_type        # Urchin type param
+- xtor            # AT Internet tracker
+:hosts:
+  allthingsd.com:
+  - mod
+  cbc.ca:
+  - ref
+  cnet.com:
+  - part
+  - subj
+  - tag
+  cnn.com:
+  - eref
+  diepresse.com:
+  - _vl_backlink
+  dw-world.de:
+  - maca
+  economist.com:
+  - fsrc
+  espn.com:
+  - campaign
+  - source
+  espn.go.com:
+  - campaign
+  - source
+  latimes.com:
+  - track
+  macworld.com:
+  - lsrc
+  nytimes.com:
+  - partner
+  - pagewanted
+  - emc
+  - _r
+  - ref
+  - src
+  repubblica.it:
+  - rss
+  theglobeandmail.com:
+  - cmpid
+  thestar.com:
+  - bn
+  usatoday.com:
+  - csp
+  waomarketing.com:
+  - nucrss
+  washingtonpost.com:
+  - nav
+  - wprss
+  welt.de:
+  - wtmc
+  wikipedia.org:
+  - source
+  wsj.com:
+  - mod
+  youtube.com:
+  - feature
+  - app
+  - ac
+  - src_vid
+  - annotation_id