url_parser 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/url_parser/version.rb +1 -1
- data/lib/url_parser.rb +149 -52
- data/spec/spec_helper.rb +1 -1
- data/spec/url_parser_spec.rb +206 -79
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: db620a681d6197369f31a483156df4163d2576fd
+  data.tar.gz: 00d0b29ded1f94326953bd5a1fece2003093d2fc
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 083dda35526897fae462b70cec4f84709dddd617a1c7f5d1f4d1dd830aac23d5cf59241bee008fe54ed231510b23c63f3e60d9125620b03e0b34c5757a6f4669
+  data.tar.gz: 92c747d882b57cb14c7e499d5d97d2b676df95d30147e49e15ee58ba99bb7c057c2e092cf4ed56ba9534037429e584fae770fe18283e4ceba1443d9ca7a75787
data/lib/url_parser/version.rb
CHANGED
data/lib/url_parser.rb
CHANGED
@@ -2,6 +2,7 @@ require "url_parser/version"
 require "domainatrix"
 require "postrank-uri"
 require "addressable/uri"
+require "digest/sha1"
 
 class Array
 
@@ -19,12 +20,13 @@ end
 
 module UrlParser
 
+
   module Error; end
 
-  def self.call(text)
+  def self.call(text, options = {})
     urls = []
     PostRank::URI.extract(text).each do |url|
-      urls << new(url)
+      urls << new(url, options)
     end
     urls
   end
@@ -54,107 +56,202 @@ module UrlParser
   attr_reader :url, :original_url
 
   def initialize(url, options = {})
+    @schemes = options.fetch(:schemes) { DEFAULT_SCHEMES }
+    @clean = options.fetch(:clean) { false }
+    @original_url = url
+    @url = @clean ? clean(url) : parse(url)
+  end
+
+  def schemes
+    Array.wrap(@schemes)
+  end
+
+  def parse(url)
+    tag_errors do
+      PostRank::URI.parse(url, raw: true)
+    end
+  end
+
+  def clean(url)
+    tag_errors do
+      PostRank::URI.clean(url, raw: true)
+    end
+  end
+
+  def parser
     tag_errors do
-      @
-      @preserve = !!options[:preserve]
-      @original_url = url
-      @url = @preserve ? url : PostRank::URI.clean(url)
+      @parser ||= Domainatrix.parse(to_s)
     end
   end
 
   def clean!
-    @preserve = false
     @parser = nil
-    @
-    @
+    @url = clean(url)
+    @clean = true
     self
   end
 
   def to_s
-    url
+    url.to_s
   end
 
-  def
-
+  def hash(options = {})
+    clean = options.fetch(:clean) { nil }
+    if clean.nil?
+      Digest::SHA1.hexdigest(url.to_s)
+    else
+      Digest::SHA1.hexdigest(
+        clean ? clean(original_url) : parse(original_url)
+      )
+    end
   end
 
-  def
-
-
-
+  def valid?
+    return true if localhost?
+    return false unless schemes.include?(scheme)
+    return false unless hostname =~ /\./
+    true
   end
 
+  def join(relative_path)
+    UrlParser.new(
+      Addressable::URI.join(url, relative_path).to_s
+    )
+  end
+
+  # URI Components
+
   def scheme
-
+    url.scheme
   end
 
-  def
-
+  def username
+    url.user
   end
+  alias_method :user, :username
 
   def password
-
+    url.password
   end
 
-  def
-
+  def userinfo
+    url.userinfo
+  end
+
+  def www
+    return parser.subdomain if parser.subdomain.empty?
+    parts = slice_domain.split('.')
+    parts.first =~ /www?\d*/ ? parts.shift : ""
+  end
+
+  def subdomain
+    return parser.subdomain if parser.subdomain.empty?
+    parts = slice_domain.split('.')
+    parts.shift if parts.first =~ /www?\d*/
+    parts.compact.join('.')
+  end
+
+  def subdomains
+    [ www, subdomain ].compact.join('.')
+  end
+
+  def domain_name
+    parser.domain
+  end
+
+  def domain
+    parser.domain_with_public_suffix
+  end
+
+  def tld
+    parser.public_suffix
+  end
+
+  def hostname
+    url.host
   end
 
   def port
-
+    url.port
+  end
+
+  def host
+    [ hostname, port ].compact.join(':')
+  end
+
+  def origin
+    url.origin
+  end
+
+  def authority
+    url.authority
+  end
+
+  def site
+    url.site
+  end
+
+  def directory
+    parts = path.split('/')
+    parts.pop unless segment.empty?
+    parts.unshift('') unless parts.first.empty?
+    parts.compact.join('/')
   end
 
   def path
-
+    url.path
   end
 
-  def
-
+  def segment
+    path =~ /\/\z/ ? '' : path.split('/').last
   end
 
-  def
-
+  def filename
+    return 'index.html' if segment.empty?
+    return '' if suffix.empty?
+    segment
+  end
+
+  def suffix
+    ext = File.extname(path)
+    ext[0] = '' if ext[0] == '.'
+    ext
+  end
+
+  def query
+    url.query
   end
 
   def query_values
-
+    url.query_values.to_h
   end
 
-  def
-
-    return false if uri.nil?
-    return false unless schemes.include?(scheme)
-    return false unless host =~ /\./
-    true
+  def fragment
+    url.fragment
   end
 
-  def
-
-    @parser ||= Domainatrix.parse(url)
-  end
+  def resource
+    [ [ segment, query ].compact.join('?'), fragment ].compact.join('#')
   end
 
-  def
-
+  def relative?
+    url.relative?
   end
 
-  def
-
-      parts = parser.subdomain.tap{ |s| s.slice!(domain) }.split('.')
-      parts.shift if parts.first =~ /www?\d*/
-      (parts << domain).join('.')
-    else
-      domain
-    end
+  def absolute?
+    url.absolute?
   end
 
-  def
-
-    UrlParser.new(joined_url, preserve: true)
+  def localhost?
+    !!(hostname =~ /(\A|\.)localhost\z/)
   end
 
   private
 
+  def slice_domain
+    parser.subdomain.tap{ |s| s.slice!(domain) }
+  end
+
   def tag_errors
     yield
   rescue Exception => error
data/spec/spec_helper.rb
CHANGED
@@ -14,7 +14,7 @@ require "url_parser"
 RSpec.configure do |config|
   config.run_all_when_everything_filtered = true
   config.filter_run :focus
-
+  config.raise_errors_for_deprecations!
   # Run specs in random order to surface order dependencies. If you find an
   # order dependency and want to debug it, you can fix the order by providing
   # the seed, which is printed after each run.
data/spec/url_parser_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
 
 describe UrlParser do
 
-  let(:parser) { UrlParser.new(link) }
+  let(:parser) { UrlParser.new(link, clean: true) }
 
   it "must be defined" do
     expect(UrlParser::VERSION).not_to be_nil
@@ -12,10 +12,11 @@ describe UrlParser do
 
     let(:link) { 'http://example.com/' }
     let(:text) { "there is a #{link} in here" }
-    let(:extractor) { UrlParser.call(text) }
+    let(:extractor) { UrlParser.call(text, clean: true) }
 
     it "extracts urls from text into an array" do
-      expect(extractor.collect(&:url))
+      expect(extractor.collect(&:url).collect(&:to_s))
+        .to include link
     end
 
     it "initializes each url with the parser" do
@@ -26,10 +27,10 @@ describe UrlParser do
 
   context "::new" do
 
-    let(:link) { 'http://example.com/' }
+    let(:link) { 'http://example.com/path' }
 
     it "initializes a parser with a url" do
-      expect(parser.
+      expect(parser.to_s).to eq link
     end
 
     it "cannot initialize invalid urls" do
@@ -37,11 +38,11 @@ describe UrlParser do
     end
 
     it "adds http by default" do
-      expect(UrlParser.new('example.com').
+      expect(UrlParser.new('example.com/path').to_s).to eq link
     end
 
     it "adds http to protocol-less urls" do
-      expect(UrlParser.new('//example.com').
+      expect(UrlParser.new('//example.com/path').to_s).to eq link
     end
 
     it "any errors raised inherit from UrlParser::Error" do
@@ -52,17 +53,30 @@ describe UrlParser do
 
   context "options" do
 
-    context ":
+    context ":clean" do
 
       let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
 
-      it "
-        expect(parser.
+      it "when true cleans the url" do
+        expect(parser.to_s).not_to eq parser.original_url
+      end
+
+      it "when true it normalizes the url" do
+        [
+          'http://igvita.com/',
+          'http://igvita.com///',
+          'http://igvita.com/../?#',
+          'http://igvita.com/a/../?',
+          'http://igvita.com/a/../?utm_source%3Danalytics'
+        ].each do |url|
+          expect(UrlParser.new(url, clean: true).to_s)
+            .to eq 'http://igvita.com/'
+        end
       end
 
-      it "does not clean the url
-
-
+      it "does not clean the url by default" do
+        expect(UrlParser.new(link).to_s)
+          .to eq PostRank::URI.parse(parser.original_url).to_s
       end
 
     end
@@ -71,129 +85,147 @@ describe UrlParser do
 
   end
 
-  context "#
+  context "#original_url" do
 
     let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
-    let(:parser) { UrlParser.new(link, preserve: true) }
 
-
-
-    it "normalizes the url" do
-      expect(parser.url).to eq 'http://link.to/?a=b'
+    it "preserves the url input" do
+      expect(parser.original_url).to eq link
     end
 
-
-      expect(parser.instance_variable_get(:@uri)).to be_nil
-    end
+  end
 
-
-
+  context "#url" do
+
+    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+
+    it "returns a url" do
+      expect(parser.url).to be_a Addressable::URI
     end
 
   end
 
-  context "#
+  context "#schemes" do
+
+    it "returns an array of allowed schemes" do
+      parser = UrlParser.new('telnet://some.com', schemes: 'telnet')
+      expect(parser.schemes).to be_an Array
+    end
 
   end
 
-  context "#
+  context "#parse" do
+
+    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
 
-    it "
-      expect(
+    it "calls postrank-uri's parse function" do
+      expect(PostRank::URI).to receive :parse
+      UrlParser.new(link, clean: false)
+    end
+
+    it "tags errors" do
+      parser = UrlParser.new(link, clean: true)
+      expect(PostRank::URI).to receive(:parse).and_raise(StandardError)
+      expect{ parser.parse(link) }.to raise_error UrlParser::Error
     end
 
   end
 
-  context "#
+  context "#clean" do
 
-
-      expect(UrlParser.new('bullshit')).not_to be_valid
-    end
+    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
 
-    it "
-      expect(
+    it "calls postrank-uri's clean function" do
+      expect(PostRank::URI).to receive :clean
+      UrlParser.new(link, clean: true)
     end
 
-    it "
-
+    it "tags errors" do
+      parser = UrlParser.new(link, clean: false)
+      expect(PostRank::URI).to receive(:clean).and_raise(StandardError)
+      expect{ parser.clean(link) }.to raise_error UrlParser::Error
     end
 
-
-
+  end
+
+  context "#parser" do
+
+    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+
+    it "calls postrank-uri's clean function" do
+      parser = UrlParser.new(link, clean: true)
+      expect(Domainatrix).to receive(:parse).with(parser.to_s)
+      parser.parser
    end
 
-    it "
-
+    it "tags errors" do
+      parser = UrlParser.new(link, clean: false)
+      expect(Domainatrix).to receive(:parse).and_raise(StandardError)
+      expect{ parser.parser }.to raise_error UrlParser::Error
     end
 
   end
 
-  context "#
+  context "#clean!" do
 
     let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+    let(:parser) { UrlParser.new(link) }
 
-    it "
-
+    it "normalizes the url" do
+      parser.clean!
+      expect(parser.to_s).to eq 'http://link.to/?a=b'
+    end
+
+    it "resets the parser" do
+      expect{
+        parser.clean!
+      }.to change{
+        parser.parser
+      }
     end
 
   end
 
-  context "#
+  context "#to_s" do
 
-    let(:link) { '
+    let(:link) { 'http://example.com/' }
 
-    it "returns a url" do
-      expect(parser.
-    end
-
-    it "attempts to clean and normalize urls" do
-      [
-        'http://igvita.com/',
-        'http://igvita.com///',
-        'http://igvita.com/../?#',
-        'http://igvita.com/a/../?',
-        'http://igvita.com/a/../?utm_source%3Danalytics'
-      ].each do |url|
-        expect(UrlParser.new(url).url)
-          .to eq 'http://igvita.com/'
-      end
+    it "returns a string representation of the url" do
+      expect(parser.to_s).to eq 'http://example.com/'
     end
 
   end
 
-  context "#
+  context "#hash" do
 
-    let(:link) { '
+    let(:link) { 'http://example.com/' }
 
-    it "
-      expect(parser.
+    it "hashes the url string" do
+      expect(parser.hash).to eq Digest::SHA1.hexdigest(link)
     end
 
   end
 
-  context "#
+  context "#valid?" do
 
-
+    it "returns false if the url is invalid" do
+      expect(UrlParser.new('bullshit')).not_to be_valid
+    end
 
-    it "returns
-      expect(
+    it "returns false if the url scheme is not in the options" do
+      expect(UrlParser.new('telnet://some.com')).not_to be_valid
     end
 
-    it "returns
-
-      expect(url.subdomain).to eq 'github.com'
+    it "returns true if the url scheme is in the options" do
+      expect(UrlParser.new('telnet://some.com', schemes: ['telnet'])).to be_valid
     end
 
-    it "
-
-      expect(parser.subdomain).to eq 'energy.ca.gov'
+    it "returns true if the url is valid" do
+      expect(UrlParser.new('http://example.com/')).to be_valid
     end
 
-    it "
-
-      parser = UrlParser.new("http://#{www}.energy.ca.gov/")
-      expect(parser.subdomain).to eq 'energy.ca.gov'
-    end
+    it "returns true for localhost" do
+      expect(UrlParser.new('localhost:5000')).to be_valid
    end
 
   end
@@ -235,4 +267,99 @@ describe UrlParser do
 
   end
 
+  # http://medialize.github.io/URI.js/about-uris.html
+  #
+  context "uri components" do
+
+    let(:link) do
+      'foo://username:password@ww2.foo.bar.example.com:123/hello/world/there.html?name=ferret#foo'
+    end
+
+    let(:parser) { UrlParser.new(link, clean: false) }
+
+    it { expect(parser.scheme).to eq 'foo' }
+    it { expect(parser.username).to eq 'username' }
+    it { expect(parser.password).to eq 'password' }
+    it { expect(parser.userinfo).to eq 'username:password' }
+    it { expect(parser.www).to eq 'ww2' }
+    it { expect(parser.subdomain).to eq 'foo.bar' }
+    it { expect(parser.subdomains).to eq 'ww2.foo.bar' }
+    it { expect(parser.domain_name).to eq 'example' }
+    it { expect(parser.domain).to eq 'example.com' }
+    it { expect(parser.tld).to eq 'com' }
+    it { expect(parser.hostname).to eq 'ww2.foo.bar.example.com' }
+    it { expect(parser.port).to eq 123 }
+    it { expect(parser.host).to eq 'ww2.foo.bar.example.com:123' }
+    it { expect(parser.origin).to eq 'foo://ww2.foo.bar.example.com:123' }
+    it { expect(parser.authority).to eq 'username:password@ww2.foo.bar.example.com:123' }
+    it { expect(parser.site).to eq 'foo://username:password@ww2.foo.bar.example.com:123' }
+    it { expect(parser.directory).to eq '/hello/world' }
+    it { expect(parser.path).to eq '/hello/world/there.html' }
+    it { expect(parser.segment).to eq 'there.html' }
+    it { expect(parser.filename).to eq 'there.html' }
+    it { expect(parser.suffix).to eq 'html' }
+    it { expect(parser.query).to eq 'name=ferret' }
+    it { expect(parser.query_values['name']).to eq 'ferret' }
+    it { expect(parser.fragment).to eq 'foo' }
+    it { expect(parser.resource).to eq 'there.html?name=ferret#foo' }
+
+  end
+
+  context "localhost?" do
+
+    let(:link) { 'localhost:5000' }
+
+    it "returns true for localhost" do
+      expect(parser).to be_localhost
+    end
+
+  end
+
+  context "#domain_name" do
+
+    let(:link) { 'https://github.com/pauldix/domainatrix' }
+
+    it "returns the domain name without the suffix" do
+      expect(parser.domain_name).to eq 'github'
+    end
+
+  end
+
+  context "#domain" do
+
+    let(:link) { 'https://github.com/pauldix/domainatrix' }
+
+    it "returns the domain name with suffix" do
+      expect(parser.domain).to eq 'github.com'
+    end
+
+  end
+
+  context "#subdomain" do
+
+    let(:link) { 'http://foo.bar.pauldix.co.uk/asdf.html?q=arg' }
+
+    it "returns all subdomains" do
+      expect(parser.subdomain).to eq 'foo.bar'
+    end
+
+    it "returns an empty string if there is no subdomain" do
+      url = UrlParser.new('https://github.com/')
+      expect(url.subdomain).to eq ''
+    end
+
+    it "does not include www as part of the subdomain" do
+      parser = UrlParser.new("http://www.energy.ca.gov/")
+      expect(parser.subdomain).to eq 'energy'
+    end
+
+    it "does not include any variation of www as part of the subdomain" do
+      [ 'ww2', 'www2', 'ww23', 'www23' ].each do |www|
+        parser = UrlParser.new("http://#{www}.energy.ca.gov/")
+        expect(parser.subdomain).to eq 'energy'
+      end
+    end
+
+  end
+
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: url_parser
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Matt Solt
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-
+date: 2014-08-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler