RubyGems - url_parser - Versions diffs - 0.4.0 → 0.5.0 - Mend

url_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.ruby-gemset +1 -0
data/.ruby-version +1 -0
data/.travis.yml +7 -0
data/CHANGELOG.md +20 -0
data/Gemfile +4 -0
data/Guardfile +40 -7
data/LICENSE.txt +1 -1
data/README.md +301 -5
data/Rakefile +5 -0
data/lib/url_parser.rb +93 -286
data/lib/url_parser/db.yml +77 -0
data/lib/url_parser/domain.rb +102 -0
data/lib/url_parser/model.rb +233 -0
data/lib/url_parser/option_setter.rb +47 -0
data/lib/url_parser/parser.rb +206 -0
data/lib/url_parser/uri.rb +206 -0
data/lib/url_parser/version.rb +1 -1
data/spec/spec_helper.rb +83 -6
data/spec/support/.gitkeep +0 -0
data/spec/support/helpers.rb +7 -0
data/spec/url_parser/domain_spec.rb +163 -0
data/spec/url_parser/model_spec.rb +426 -0
data/spec/url_parser/option_setter_spec.rb +71 -0
data/spec/url_parser/parser_spec.rb +515 -0
data/spec/url_parser/uri_spec.rb +570 -0
data/spec/url_parser_spec.rb +93 -387
data/url_parser.gemspec +5 -6
metadata +39 -29

data/spec/url_parser_spec.rb CHANGED

@@ -1,481 +1,187 @@
 require 'spec_helper'
-describe UrlParser do
-  let(:parser) { UrlParser.new(link, clean: true) }
+RSpec.describe UrlParser do
   it "must be defined" do
     expect(UrlParser::VERSION).not_to be_nil
   end
-  context "::SCHEMES" do
-    it { expect( UrlParser::SCHEMES).to be_an Array }
-  end
-  context "::DEFAULT_SCHEMES" do
-    it { expect( UrlParser::DEFAULT_SCHEMES).to be_an Array }
-  end
-  context "::call" do
-    let(:link) { 'http://example.com/' }
-    let(:text) { "there is a #{link} in here" }
-    let(:extractor) { UrlParser.call(text, clean: true) }
-    it "extracts urls from text into an array" do
-      expect(extractor.collect(&:url).collect(&:to_s))
-        .to include link
-    end
-    it "initializes each url with the parser" do
-      expect(extractor.first).to be_a UrlParser::Base
-    end
-  end
-  context "::new" do
-    let(:link) { 'http://example.com/path' }
+  context "configuration" do
-    it "initializes a parser with a url" do
-      expect(parser.to_s).to eq link
-    end
-    it "adds http by default" do
-      expect(UrlParser.new('example.com/path').to_s).to eq link
-    end
-    it "adds http to protocol-less urls" do
-      expect(UrlParser.new('//example.com/path').to_s).to eq link
-    end
-    it "cannot initialize invalid urls" do
-      expect(UrlParser.new('http:||bra.ziz').url).to be_nil
-    end
-    it "catches errors from invalid urls" do
-      expect(UrlParser.new('http:||bra.ziz').errors).not_to be_empty
-    end
-    context "options" do
-      context ":clean" do
-        let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
-        it "when true cleans the url" do
-          expect(parser.to_s).not_to eq parser.original_url
-        end
-        it "when true it normalizes the url" do
-          [
-            'http://igvita.com/',
-            'http://igvita.com///',
-            'http://igvita.com/../?#',
-            'http://igvita.com/a/../?',
-            'http://igvita.com/a/../?utm_source%3Danalytics'
-          ].each do |url|
-            expect(UrlParser.new(url, clean: true).to_s)
-              .to eq 'http://igvita.com/'
-          end
-        end
-        it "does not clean the url by default" do
-          expect(UrlParser.new(link).to_s)
-            .to eq PostRank::URI.parse(parser.original_url).to_s
-        end
+    context ":embedded_params" do
+      it "sets the unembed param keys" do
+        described_class.configuration.embedded_params = [ 'ref' ]
+        uri = UrlParser.unembed('https://www.upwork.com/leaving?ref=https%3A%2F%2Fwww.example.com')
+        expect(uri.to_s).to eq 'https://www.example.com/'
+        described_class.configuration.reset
       end
-      context ":raise_errors" do
-        it "raises instead of catching errors" do
-          expect{
-            UrlParser.new('http:||bra.ziz', raise_errors: true)
-          }.to raise_error
-        end
+    end
-        it "any errors raised inherit from UrlParser::Error" do
-          expect{
-            UrlParser.new('http:||bra.ziz', raise_errors: true)
-          }.to raise_error UrlParser::Error
-        end
+    context ":default_scheme" do
+      it "sets a default scheme if one is not present" do
+        described_class.configuration.default_scheme = 'https'
+        uri = UrlParser.parse('example.com')
+        expect(uri.to_s).to eq 'https://example.com/'
+        described_class.configuration.reset
       end
     end
-  end
+    context ":scheme_map" do
-  context "#original_url" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+      it "replaces scheme keys in the map with the corresponding value" do
+        described_class.configuration.scheme_map = { 'feed' => 'http' }
+        uri = UrlParser.parse('feed://feeds.feedburner.com/YourBlog')
+        expect(uri.to_s).to eq 'http://feeds.feedburner.com/YourBlog'
+        described_class.configuration.reset
+      end
-    it "preserves the url input" do
-      expect(parser.original_url).to eq link
     end
   end
-  context "#url" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+  context ".tag_errors" do
-    it "returns a url" do
-      expect(parser.url).to be_a Addressable::URI
+    it "tags StandardError exceptions" do
+      expect{
+        described_class.tag_errors{ raise StandardError }
+      }.to raise_error UrlParser::Error
     end
-  end
-  context "#schemes" do
-    it "returns an array of allowed schemes" do
-      parser = UrlParser.new('telnet://some.com', schemes: 'telnet')
-      expect(parser.schemes).to be_an Array
+    it "does not tag errors that do not inherit from StandardError", :disable_raise_error_warning do
+      expect{
+        described_class.tag_errors{ raise Exception }
+      }.not_to raise_error UrlParser::Error
     end
   end
-  context "#parse" do
+  context ".new" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
-    it "calls postrank-uri's parse function" do
-      expect(PostRank::URI).to receive :parse
-      UrlParser.new(link, clean: false)
+    it "is deprecated" do
+      expect(described_class).to receive(:warn)
+      described_class.new('http://example.com')
     end
-    it "tags errors when set to raise errors" do
-      parser = UrlParser.new(link, clean: true, raise_errors: true)
-      expect(PostRank::URI).to receive(:parse).and_raise(StandardError)
-      expect{ parser.send(:parse, link) }.to raise_error UrlParser::Error
+    it "calls .parse" do
+      expect(described_class).to receive(:warn)
+      expect(described_class).to receive(:parse)
+      described_class.new('http://example.com')
     end
   end
-  context "#clean" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
+  context ".escape" do
-    it "calls postrank-uri's clean function" do
-      expect(PostRank::URI).to receive :clean
-      UrlParser.new(link, clean: true)
+    it "encodes a string" do
+      expect(described_class.escape('id=1')).to eq 'id%3D1'
     end
-    it "tags errors" do
-      parser = UrlParser.new(link, clean: false, raise_errors: true)
-      expect(PostRank::URI).to receive(:clean).and_raise(StandardError)
-      expect{ parser.send(:clean, link) }.to raise_error UrlParser::Error
+    it "escapes spaces as %20" do
+      expect(described_class.escape('id= 1')).to eq 'id%3D%201'
     end
   end
-  context "#parser" do
+  context ".unescape" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
-    it "calls postrank-uri's clean function" do
-      expect(Domainatrix).to receive(:parse).with(parser.to_s)
-      UrlParser.new(link, clean: true)
+    it "decodes a string" do
+      expect(described_class.unescape('id%3D1')).to eq 'id=1'
     end
-    it "tags errors" do
-      expect(Domainatrix).to receive(:parse).and_raise(StandardError)
-      expect{
-        UrlParser.new(link, clean: false, raise_errors: true)
-      }.to raise_error UrlParser::Error
+    it "unescapes spaces" do
+      expect(described_class.unescape('id%3D%201')).to eq 'id= 1'
     end
-  end
-  context "#clean!" do
-    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
-    let(:parser) { UrlParser.new(link) }
+    context "accept improperly encoded strings" do
-    it "normalizes the url" do
-      parser.clean!
-      expect(parser.to_s).to eq 'http://link.to/?a=b'
-    end
-    it "resets the parser" do
-      expect{
-        parser.clean!
-      }.to change{
-        parser.parser
-      }
-    end
+      it "by unencoding spaces in the query encoded as '+'" do
+        expect(described_class.unescape('?id=+1')).to eq '?id= 1'
+      end
-  end
+      it "by unencoding spaces in the query encoded as '+'" do
+        expect(described_class.unescape('?id%3D+1')).to eq '?id= 1'
+      end
-  context "#to_s" do
+      it "by unencoding spaces in the query encoded as '%20'" do
+        expect(described_class.unescape('?id=%201')).to eq '?id= 1'
+      end
-    let(:link) { 'http://example.com/' }
+      it "but does not unencode '+' to spaces in paths" do
+        expect(described_class.unescape('/foo+bar?id=foo+bar')).to eq '/foo+bar?id=foo bar'
+      end
-    it "returns a string representation of the url" do
-      expect(parser.to_s).to eq 'http://example.com/'
     end
   end
-  context "#hash" do
+  context ".parse" do
-    let(:link) { 'http://example.com/' }
-    it "hashes the url string" do
-      expect(parser.hash).to eq Digest::SHA1.hexdigest(link)
+    it "returns an instance of UrlParser::URI" do
+      expect(described_class.parse('http://example.com')).to be_a UrlParser::URI
     end
   end
-  context "#valid?" do
+  context ".unembed" do
-    it "returns true if there are no errors" do
-      expect(UrlParser.new('http://example.com')).to be_valid
+    it "returns an instance of UrlParser::URI" do
+      expect(described_class.unembed('http://example.com')).to be_a UrlParser::URI
     end
-    it "returns false if there are errors" do
-      expect(UrlParser.new('http:||bra.ziz')).not_to be_valid
+    it "parses the URI with the :unembed option enabled" do
+      expect(UrlParser::URI).to receive(:new).with('#', hash_including(unembed: true))
+      described_class.unembed('#')
     end
   end
-  # Thanks to http://stackoverflow.com/a/4864170
-  #
-  context "#join" do
-    let(:link) { 'http://foo.com/zee/zaw/zoom.html' }
-    it "properly combines a url and and relative url" do
-      {
-        'http://zork.com/'                 => 'http://zork.com/',
-        'http://zork.com/#id'              => 'http://zork.com/#id',
-        'http://zork.com/bar'              => 'http://zork.com/bar',
-        'http://zork.com/bar#id'           => 'http://zork.com/bar#id',
-        'http://zork.com/bar/'             => 'http://zork.com/bar/',
-        'http://zork.com/bar/#id'          => 'http://zork.com/bar/#id',
-        'http://zork.com/bar/jim.html'     => 'http://zork.com/bar/jim.html',
-        'http://zork.com/bar/jim.html#id'  => 'http://zork.com/bar/jim.html#id',
-        '/bar'                             => 'http://foo.com/bar',
-        '/bar#id'                          => 'http://foo.com/bar#id',
-        '/bar/'                            => 'http://foo.com/bar/',
-        '/bar/#id'                         => 'http://foo.com/bar/#id',
-        '/bar/jim.html'                    => 'http://foo.com/bar/jim.html',
-        '/bar/jim.html#id'                 => 'http://foo.com/bar/jim.html#id',
-        'jim.html'                         => 'http://foo.com/zee/zaw/jim.html',
-        'jim.html#id'                      => 'http://foo.com/zee/zaw/jim.html#id',
-        '../jim.html'                      => 'http://foo.com/zee/jim.html',
-        '../jim.html#id'                   => 'http://foo.com/zee/jim.html#id',
-        '../'                              => 'http://foo.com/zee/',
-        '../#id'                           => 'http://foo.com/zee/#id',
-        '#id'                              => 'http://foo.com/zee/zaw/zoom.html#id'
-      }.each do |relative_url, expected_result|
-        expect(parser.join(relative_url).to_s).to eq expected_result
-      end
+  context ".canonicalize" do
+    it "returns an instance of UrlParser::URI" do
+      expect(described_class.canonicalize('http://example.com')).to be_a UrlParser::URI
     end
-  end
-  # http://medialize.github.io/URI.js/about-uris.html
-  #
-  context "uri components" do
-    let(:parser) { UrlParser.new(link, clean: false) }
-    context "when all are present" do
-      let(:link) do
-        'https://username:password@ww2.foo.bar.example.com:123/hello/world/there.html?name=ferret#foo'
-      end
-      it { expect(parser.errors).to be_empty }
-      it { expect(parser).to be_valid }
-      it { expect(parser.scheme).to eq 'https' }
-      it { expect(parser.username).to eq 'username' }
-      it { expect(parser.password).to eq 'password' }
-      it { expect(parser.userinfo).to eq 'username:password' }
-      it { expect(parser.www).to eq 'ww2' }
-      it { expect(parser.subdomain).to eq 'foo.bar' }
-      it { expect(parser.subdomains).to eq 'ww2.foo.bar' }
-      it { expect(parser.domain_name).to eq 'example' }
-      it { expect(parser.domain).to eq 'example.com' }
-      it { expect(parser.tld).to eq 'com' }
-      it { expect(parser.hostname).to eq 'ww2.foo.bar.example.com' }
-      it { expect(parser.port).to eq 123 }
-      it { expect(parser.host).to eq 'ww2.foo.bar.example.com:123' }
-      it { expect(parser.origin).to eq 'https://ww2.foo.bar.example.com:123' }
-      it { expect(parser.authority).to eq 'username:password@ww2.foo.bar.example.com:123' }
-      it { expect(parser.site).to eq 'https://username:password@ww2.foo.bar.example.com:123' }
-      it { expect(parser.directory).to eq '/hello/world' }
-      it { expect(parser.path).to eq '/hello/world/there.html' }
-      it { expect(parser.segment).to eq 'there.html' }
-      it { expect(parser.filename).to eq 'there.html' }
-      it { expect(parser.suffix).to eq 'html' }
-      it { expect(parser.query).to eq 'name=ferret' }
-      it { expect(parser.query_values['name']).to eq 'ferret' }
-      it { expect(parser.fragment).to eq 'foo' }
-      it { expect(parser.resource).to eq 'there.html?name=ferret#foo' }
-    end
-    context "when none are present" do
-      let(:link) { '/' }
-      it { expect(parser.errors).to be_empty }
-      it { expect(parser.scheme).to be_nil }
-      it { expect(parser.username).to be_nil }
-      it { expect(parser.password).to be_nil }
-      it { expect(parser.userinfo).to be_nil }
-      it { expect(parser.www).to be_nil }
-      it { expect(parser.subdomain).to be_nil }
-      it { expect(parser.subdomains).to be_nil }
-      it { expect(parser.domain_name).to be_nil }
-      it { expect(parser.domain).to be_nil }
-      it { expect(parser.tld).to be_nil }
-      it { expect(parser.hostname).to be_nil }
-      it { expect(parser.port).to be_nil }
-      it { expect(parser.host).to be_nil }
-      it { expect(parser.origin).to be_nil }
-      it { expect(parser.authority).to be_nil }
-      it { expect(parser.site).to be_nil }
-      it { expect(parser.directory).to eq '/' }
-      it { expect(parser.path).to eq '/' }
-      it { expect(parser.segment).to be_nil }
-      it { expect(parser.filename).to eq 'index.html' }
-      it { expect(parser.suffix).to be_nil }
-      it { expect(parser.query).to be_nil }
-      it { expect(parser.query_values['name']).to be_nil }
-      it { expect(parser.fragment).to be_nil }
-      it { expect(parser.resource).to be_nil }
-    end
-    context "when empty" do
-      let(:link) { '' }
-      it { expect(parser.errors).to be_empty }
-      it { expect(parser.scheme).to be_nil }
-      it { expect(parser.username).to be_nil }
-      it { expect(parser.password).to be_nil }
-      it { expect(parser.userinfo).to be_nil }
-      it { expect(parser.www).to be_nil }
-      it { expect(parser.subdomain).to be_nil }
-      it { expect(parser.subdomains).to be_nil }
-      it { expect(parser.domain_name).to be_nil }
-      it { expect(parser.domain).to be_nil }
-      it { expect(parser.tld).to be_nil }
-      it { expect(parser.hostname).to be_nil }
-      it { expect(parser.port).to be_nil }
-      it { expect(parser.host).to be_nil }
-      it { expect(parser.origin).to be_nil }
-      it { expect(parser.authority).to be_nil }
-      it { expect(parser.site).to be_nil }
-      it { expect(parser.directory).to eq '/' }
-      it { expect(parser.path).to eq '' }
-      it { expect(parser.segment).to be_nil }
-      it { expect(parser.filename).to eq 'index.html' }
-      it { expect(parser.suffix).to be_nil }
-      it { expect(parser.query).to be_nil }
-      it { expect(parser.query_values['name']).to be_nil }
-      it { expect(parser.fragment).to be_nil }
-      it { expect(parser.resource).to be_nil }
-    end
-    context "when invalid" do
-      let(:link) { 'http://#content-zone' }
-      it { expect(parser.errors).not_to be_empty }
-      it { expect(parser.scheme).to be_nil }
-      it { expect(parser.username).to be_nil }
-      it { expect(parser.password).to be_nil }
-      it { expect(parser.userinfo).to be_nil }
-      it { expect(parser.www).to be_nil }
-      it { expect(parser.subdomain).to be_nil }
-      it { expect(parser.subdomains).to be_nil }
-      it { expect(parser.domain_name).to be_nil }
-      it { expect(parser.domain).to be_nil }
-      it { expect(parser.tld).to be_nil }
-      it { expect(parser.hostname).to be_nil }
-      it { expect(parser.port).to be_nil }
-      it { expect(parser.host).to be_nil }
-      it { expect(parser.origin).to be_nil }
-      it { expect(parser.authority).to be_nil }
-      it { expect(parser.site).to be_nil }
-      it { expect(parser.directory).to be_nil }
-      it { expect(parser.path).to be_nil }
-      it { expect(parser.segment).to be_nil }
-      it { expect(parser.filename).to be_nil }
-      it { expect(parser.suffix).to be_nil }
-      it { expect(parser.query).to be_nil }
-      it { expect(parser.query_values['name']).to be_nil }
-      it { expect(parser.fragment).to be_nil }
-      it { expect(parser.resource).to be_nil }
+    it "parses the URI with the :canonicalize option enabled" do
+      expect(UrlParser::URI).to receive(:new).with('#', hash_including(canonicalize: true))
+      described_class.canonicalize('#')
     end
   end
-  context "localhost?" do
+  context ".normalize" do
-    let(:link) { 'localhost:5000' }
-    it "returns true for localhost" do
-      expect(parser).to be_localhost
+    it "returns an instance of UrlParser::URI" do
+      expect(described_class.normalize('http://example.com')).to be_a UrlParser::URI
     end
-  end
-  context "#domain_name" do
-    let(:link) { 'https://github.com/pauldix/domainatrix' }
-    it "returns the domain name without the suffix" do
-      expect(parser.domain_name).to eq 'github'
+    it "parses the URI with the :normalize option enabled" do
+      expect(UrlParser::URI).to receive(:new).with('#', hash_including(normalize: true))
+      described_class.normalize('#')
     end
   end
-  context "#domain" do
-    let(:link) { 'https://github.com/pauldix/domainatrix' }
+  context ".clean" do
-    it "returns the domain name with suffix" do
-      expect(parser.domain).to eq 'github.com'
+    it "returns an instance of UrlParser::URI" do
+      expect(described_class.clean('http://example.com')).to be_a UrlParser::URI
     end
-  end
-  context "#subdomain" do
-    let(:link) { 'http://foo.bar.pauldix.co.uk/asdf.html?q=arg' }
-    it "returns all subdomains" do
-      expect(parser.subdomain).to eq 'foo.bar'
+    it "parses the URI with the :clean option enabled" do
+      expect(UrlParser::URI).to receive(:new).with('#', hash_including(clean: true))
+      described_class.clean('#')
     end
-    it "returns nil if there is no subdomain" do
-      url = UrlParser.new('https://github.com/')
-      expect(url.subdomain).to be_nil
-    end
+  end
-    it "does not include www as part of the subdomain" do
-      parser = UrlParser.new("http://www.energy.ca.gov/")
-      expect(parser.subdomain).to eq 'energy'
-    end
+  context ".wrap" do
-    it "does not include any variation of www as part of the subdomain" do
-      [ 'ww2', 'www2', 'ww23', 'www23' ].each do |www|
-        parser = UrlParser.new("http://#{www}.energy.ca.gov/")
-        expect(parser.subdomain).to eq 'energy'
-      end
+    it "converts nil to an array" do
+      expect(described_class.wrap(nil)).to eq([])
     end
   end