url_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d88f4309a1787a5ed3f004e60a85e5f4a5e26765
4
+ data.tar.gz: 78146dbfb19dbec7f5f9169fef026a7b2448c43d
5
+ SHA512:
6
+ metadata.gz: 99801e22611a0d7c78b576e01aacb097c32852fa99775674524b2bf1f000988e64cb00766eaeec083ce16b0cb7c10c9b29745dff7186fca43135276e467c8a05
7
+ data.tar.gz: 4df106696e71b773da7bc7e7997ac8ade0cb4002327cbb107c1b436c420373b1ce55abcd0eed67d0b328385c7d35d190bc921e5e64d264ef26c4ce7232817389
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in url_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Matt Solt
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # UrlParser
2
+
3
+ Combine PostRank-URI, Domainatrix, and other Ruby url parsing libraries into a common interface.
4
+
5
+ See also:
6
+ - https://github.com/pauldix/domainatrix
7
+ - https://github.com/postrank-labs/postrank-uri
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'url_parser'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install url_parser
22
+
23
+ ## Usage
24
+
25
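+ Parse a single URL with `UrlParser.new('http://example.com/')`, or extract every URL found in a block of text with `UrlParser.call(text)`; each result exposes readers such as `url`, `original_url`, `host`, `domain`, `subdomain` and `valid?`.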
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it ( https://github.com/activefx/url_parser/fork )
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,3 @@
1
+ module UrlParser
2
+ VERSION = "0.1.0"
3
+ end
data/lib/url_parser.rb ADDED
@@ -0,0 +1,150 @@
1
+ require "url_parser/version"
2
+ require "domainatrix"
3
+ require "postrank-uri"
4
+ require "addressable/uri"
5
+
6
+ class Array
7
+   # Coerces any object into an array. Only defined when Array does not already respond to :wrap, so an existing implementation is left alone.
8
+   def self.wrap(object)
9
+     if object.nil?
10
+       [] # nil wraps to an empty array
11
+     elsif object.respond_to?(:to_ary)
12
+       object.to_ary || [object] # fall back to a one-element array when to_ary returns nil
13
+     else
14
+       [object] # any other object becomes a one-element array
15
+     end
16
+   end unless respond_to?(:wrap)
17
+
18
+ end
19
+
20
+ module UrlParser
21
+
22
+ module Error; end
23
+
24
+   def self.call(text) # Returns an array with one parser (see UrlParser.new) per URL that PostRank::URI.extract finds in +text+.
25
+     urls = []
26
+     PostRank::URI.extract(text).each do |url|
27
+       urls << new(url) # UrlParser.new with default options
28
+     end
29
+     urls
30
+   end
31
+
32
+   def self.new(url, options = {}) # Convenience constructor so callers can write UrlParser.new(url) on the module itself.
33
+     Base.new(url, options) # returns a UrlParser::Base instance
34
+   end
35
+
36
+ class Base
37
+
38
+ # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
39
+ MAJOR_SCHEMES = [
40
+ 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
41
+ 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
42
+ 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
43
+ 'wais',
44
+ # Unofficial schemes
45
+ 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
46
+ 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', 'mvn'
47
+ ]
48
+
49
+ DEFAULT_SCHEMES = [
50
+ 'http', 'https', 'ftp', 'mailto', 'file', 'ssh', 'feed',
51
+ 'cvs', 'git', 'mvn', 'nntp', 'shttp', 'svn'
52
+ ]
53
+
54
+ attr_reader :url, :original_url
55
+
56
+     def initialize(url, options = {}) # options: :schemes (allowed schemes, defaults to DEFAULT_SCHEMES) and :preserve (truthy skips cleaning)
57
+       tag_errors do
58
+         @schemes      = options.fetch(:schemes) { DEFAULT_SCHEMES } # block form: an explicitly passed value, even nil, is kept
59
+         @preserve     = !!options[:preserve] # coerced to a strict boolean
60
+         @original_url = url # the input exactly as given
61
+         @url          = @preserve ? url : PostRank::URI.clean(url) # errors raised while cleaning are tagged with UrlParser::Error
62
+       end
63
+     end
64
+
65
+     def schemes # Allowed schemes, always returned as an array.
66
+       Array.wrap(@schemes) # @schemes may be a single value or nil; wrap normalises both
67
+     end
68
+
69
+     def uri # Addressable::URI for #url, or nil when parsing raises.
70
+       tag_errors do
71
+         @uri ||= Addressable::URI.parse(url) rescue nil # best-effort: a failed parse yields nil; nil is not memoized, so the parse is retried on the next call
72
+       end
73
+     end
74
+
75
+     def scheme
76
+       uri.scheme if uri # nil when #uri is nil
77
+     end
78
+
79
+     def user
80
+       uri.user if uri # nil when #uri is nil
81
+     end
82
+
83
+     def password
84
+       uri.password if uri # nil when #uri is nil
85
+     end
86
+
87
+     def host
88
+       uri.host if uri # nil when #uri is nil
89
+     end
90
+
91
+     def port
92
+       uri.port if uri # nil when #uri is nil
93
+     end
94
+
95
+     def path
96
+       uri.path if uri # nil when #uri is nil
97
+     end
98
+
99
+     def query
100
+       uri.query if uri # raw query string; nil when #uri is nil
101
+     end
102
+
103
+     def fragment
104
+       uri.fragment if uri # nil when #uri is nil
105
+     end
106
+
107
+     def query_values # Query parameters as a hash.
108
+       uri ? uri.query_values.to_h : {} # {} when #uri is nil; to_h is applied to whatever uri.query_values returns
109
+     end
110
+
111
+     def valid? # True for localhost, or for a parseable URL with an allowed scheme and a dotted host.
112
+       return true if domain == 'localhost' # localhost is accepted before any scheme or host check
113
+       return false if uri.nil? # the URL could not be parsed
114
+       return false unless schemes.include?(scheme) # scheme must be one of the allowed schemes
115
+       return false unless host =~ /\./ # host must contain at least one dot
116
+       true
117
+     end
118
+
119
+     def parser # Memoized Domainatrix parse of #url, used by #domain and #subdomain.
120
+       tag_errors do
121
+         @parser ||= Domainatrix.parse(url) # anything raised here is tagged with UrlParser::Error
122
+       end
123
+     end
124
+
125
+     def domain # Registered domain including its public suffix, e.g. 'github.com'.
126
+       parser.domain_with_public_suffix # delegated to Domainatrix
127
+     end
128
+
129
+ def subdomain
130
+ unless parser.subdomain.empty?
131
+ parts = parser.subdomain.tap{ |s| s.slice!(domain) }.split('.')
132
+ parts.shift if parts.first =~ /www?\d*/
133
+ (parts << domain).join('.')
134
+ else
135
+ domain
136
+ end
137
+ end
138
+
139
+ private
140
+
141
+     def tag_errors # Runs the given block and tags anything it raises with the UrlParser::Error marker module.
142
+       yield
143
+     rescue Exception => error # every exception class is caught here, but only to tag it; it is always re-raised below
144
+       error.extend(UrlParser::Error) # callers can rescue UrlParser::Error while the original exception class is preserved
145
+       raise # re-raises the same, now tagged, exception
146
+     end
147
+
148
+ end
149
+
150
+ end
@@ -0,0 +1,22 @@
1
+ require "rspec"
2
+
3
+ require "url_parser"
4
+
5
+ # This file was generated by the `rspec --init` command. Conventionally, all
6
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
7
+ # Require this file using `require "spec_helper"` to ensure that it is only
8
+ # loaded once.
9
+ #
10
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
11
+ RSpec.configure do |config|
12
+ config.run_all_when_everything_filtered = true
13
+ config.filter_run :focus
14
+
15
+ # Run specs in random order to surface order dependencies. If you find an
16
+ # order dependency and want to debug it, you can fix the order by providing
17
+ # the seed, which is printed after each run.
18
+ # --seed 1234
19
+ config.order = 'random'
20
+ end
21
+
22
+
@@ -0,0 +1,176 @@
1
+ require 'spec_helper'
2
+
3
+ describe UrlParser do
4
+
5
+ let(:parser) { UrlParser.new(link) }
6
+
7
+ it "must be defined" do
8
+ expect(UrlParser::VERSION).not_to be_nil
9
+ end
10
+
11
+ context "::call" do
12
+
13
+ let(:link) { 'http://example.com/' }
14
+ let(:text) { "there is a #{link} in here" }
15
+ let(:extractor) { UrlParser.call(text) }
16
+
17
+ it "extracts urls from text into an array" do
18
+ expect(extractor.collect(&:url)).to include link
19
+ end
20
+
21
+ it "initializes each url with the parser" do
22
+ expect(extractor.first).to be_a UrlParser::Base
23
+ end
24
+
25
+ end
26
+
27
+ context "::new" do
28
+
29
+ let(:link) { 'http://example.com/' }
30
+
31
+ it "initializes a parser with a url" do
32
+ expect(parser.url).to eq link
33
+ end
34
+
35
+ it "cannot initialize invalid urls" do
36
+ expect{ UrlParser.new('http:||bra.ziz') }.to raise_error
37
+ end
38
+
39
+ it "adds http by default" do
40
+ expect(UrlParser.new('example.com').url).to eq link
41
+ end
42
+
43
+ it "adds http to protocol-less urls" do
44
+ expect(UrlParser.new('//example.com').url).to eq link
45
+ end
46
+
47
+ it "any errors raised inherit from UrlParser::Error" do
48
+ expect{
49
+ UrlParser.new('http:||bra.ziz')
50
+ }.to raise_error UrlParser::Error
51
+ end
52
+
53
+ context "options" do
54
+
55
+ context ":preserve" do
56
+
57
+ let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
58
+
59
+ it "is false by default" do
60
+ expect(parser.url).not_to eq parser.original_url
61
+ end
62
+
63
+ it "does not clean the url when true" do
64
+ parser = UrlParser.new(link, preserve: true)
65
+ expect(parser.url).to eq parser.original_url
66
+ end
67
+
68
+ end
69
+
70
+ end
71
+
72
+ end
73
+
74
+ context "#uri" do
75
+
76
+ it "returns a parsed uri" do
77
+ expect(UrlParser.new('http://example.com').uri).to be_a Addressable::URI
78
+ end
79
+
80
+ end
81
+
82
+ context "#valid?" do
83
+
84
+ it "returns false if the url is invalid" do
85
+ expect(UrlParser.new('bullshit')).not_to be_valid
86
+ end
87
+
88
+ it "returns false if the url scheme is not in the options" do
89
+ expect(UrlParser.new('telnet://some.com')).not_to be_valid
90
+ end
91
+
92
+ it "returns true if the url scheme is in the options" do
93
+ expect(UrlParser.new('telnet://some.com', schemes: ['telnet'])).to be_valid
94
+ end
95
+
96
+ it "returns true if the url is valid" do
97
+ expect(UrlParser.new('http://example.com/')).to be_valid
98
+ end
99
+
100
+ it "returns true for localhost" do
101
+ expect(UrlParser.new('localhost:5000')).to be_valid
102
+ end
103
+
104
+ end
105
+
106
+ context "#original_url" do
107
+
108
+ let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
109
+
110
+ it "preserves the url input" do
111
+ expect(parser.original_url).to eq link
112
+ end
113
+
114
+ end
115
+
116
+ context "#url" do
117
+
118
+ let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }
119
+
120
+ it "returns a url" do
121
+ expect(parser.url).to eq 'http://link.to/?a=b'
122
+ end
123
+
124
+ it "attempts to clean and normalize urls" do
125
+ [
126
+ 'http://igvita.com/',
127
+ 'http://igvita.com///',
128
+ 'http://igvita.com/../?#',
129
+ 'http://igvita.com/a/../?',
130
+ 'http://igvita.com/a/../?utm_source%3Danalytics'
131
+ ].each do |url|
132
+ expect(UrlParser.new(url).url)
133
+ .to eq 'http://igvita.com/'
134
+ end
135
+ end
136
+
137
+ end
138
+
139
+ context "#domain" do
140
+
141
+ let(:link) { 'https://github.com/pauldix/domainatrix' }
142
+
143
+ it "returns the domain name with suffix" do
144
+ expect(parser.domain).to eq 'github.com'
145
+ end
146
+
147
+ end
148
+
149
+ context "#subdomain" do
150
+
151
+ let(:link) { 'http://foo.bar.pauldix.co.uk/asdf.html?q=arg' }
152
+
153
+ it "returns all subdomains with suffix" do
154
+ expect(parser.subdomain).to eq 'foo.bar.pauldix.co.uk'
155
+ end
156
+
157
+ it "returns only the domain if there is no subdomain" do
158
+ url = UrlParser.new('https://github.com/')
159
+ expect(url.subdomain).to eq 'github.com'
160
+ end
161
+
162
+ it "does not include www as part of the subdomain" do
163
+ parser = UrlParser.new("http://www.energy.ca.gov/")
164
+ expect(parser.subdomain).to eq 'energy.ca.gov'
165
+ end
166
+
167
+ it "does not include any variation of www as part of the subdomain" do
168
+ [ 'ww2', 'www2', 'ww23', 'www23' ].each do |www|
169
+ parser = UrlParser.new("http://#{www}.energy.ca.gov/")
170
+ expect(parser.subdomain).to eq 'energy.ca.gov'
171
+ end
172
+ end
173
+
174
+ end
175
+
176
+ end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'url_parser/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "url_parser"
8
+ spec.version = UrlParser::VERSION
9
+ spec.authors = ["Matt Solt"]
10
+ spec.email = ["mattsolt@gmail.com"]
11
+ spec.summary = %q{Combine PostRank-URI, Domainatrix, and other Ruby url parsing libraries into a common interface.}
12
+ spec.description = %q{Uses PostRank-URI to clean, Addressable to break into components, and Domainatrix to determine domain and subdomain.}
13
+ spec.homepage = "https://github.com/activefx/url_parser"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.6"
22
+ spec.add_development_dependency "rake", "~> 10"
23
+ spec.add_development_dependency "rspec", "~> 3.0"
24
+
25
+ spec.add_dependency "domainatrix", ">= 0.0.11"
26
+ spec.add_dependency "postrank-uri", "~> 1.0"
27
+ spec.add_dependency "addressable", "~> 2.3"
28
+ end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matt Solt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-08-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: domainatrix
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.11
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.0.11
69
+ - !ruby/object:Gem::Dependency
70
+ name: postrank-uri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: addressable
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '2.3'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '2.3'
97
+ description: Uses PostRank-URI to clean, Addressable to break into components, and
98
+ Domainatrix to determine domain and subdomain.
99
+ email:
100
+ - mattsolt@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - lib/url_parser.rb
112
+ - lib/url_parser/version.rb
113
+ - spec/spec_helper.rb
114
+ - spec/url_parser_spec.rb
115
+ - url_parser.gemspec
116
+ homepage: https://github.com/activefx/url_parser
117
+ licenses:
118
+ - MIT
119
+ metadata: {}
120
+ post_install_message:
121
+ rdoc_options: []
122
+ require_paths:
123
+ - lib
124
+ required_ruby_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 2.2.2
137
+ signing_key:
138
+ specification_version: 4
139
+ summary: Combine PostRank-URI, Domainatrix, and other Ruby url parsing libraries into
140
+ a common interface.
141
+ test_files:
142
+ - spec/spec_helper.rb
143
+ - spec/url_parser_spec.rb