url_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'digest/sha1'
require 'forwardable'
require 'resolv'
|
3
|
+
|
4
|
+
module UrlParser
  # Facade over a parsed URI.
  #
  # The input is handed to UrlParser::Parser, the parser result is wrapped in
  # a UrlParser::Model, and every reader listed in COMPONENTS is delegated to
  # that model. The remaining methods answer questions about the options the
  # instance was built with and about the parsed value itself.
  class URI
    extend Forwardable

    # Matches a hostname that is "localhost" or ends with a ".localhost" label.
    LOCALHOST_REGEXP = /(\A|\.)localhost\z/

    # Readers delegated to the model. Frozen so the list cannot be mutated
    # after the delegators below have been generated from it.
    COMPONENTS = [
      :scheme,              # Top level URI naming structure / protocol.
      :username,            # Username portion of the userinfo.
      :user,                # Alias for #username.
      :password,            # Password portion of the userinfo.
      :userinfo,            # URI username and password for authentication.
      :hostname,            # Fully qualified domain name or IP address.
      :naked_hostname,      # Hostname without any ww? prefix.
      :port,                # Port number.
      :host,                # Hostname and port.
      :www,                 # The ww? portion of the subdomain.
      :tld,                 # Returns the top level domain portion, aka the extension.
      :top_level_domain,    # Alias for #tld.
      :extension,           # Alias for #tld.
      :sld,                 # Returns the second level domain portion, aka the domain part.
      :second_level_domain, # Alias for #sld.
      :domain_name,         # Alias for #sld.
      :trd,                 # Returns the third level domain portion, aka the subdomain part.
      :third_level_domain,  # Alias for #trd.
      :subdomains,          # Alias for #trd.
      :naked_trd,           # Any non-ww? subdomains.
      :naked_subdomain,     # Alias for #naked_trd.
      :domain,              # The domain name with the tld.
      :subdomain,           # All subdomains, include ww?.
      :origin,              # Scheme and host.
      :authority,           # Userinfo and host.
      :site,                # Scheme, userinfo, and host.
      :path,                # Directory and segment.
      :segment,             # Last portion of the path.
      :directory,           # Any directories following the site within the URI.
      :filename,            # Segment if a file extension is present.
      :suffix,              # The file extension of the filename.
      :query,               # Params and values as a string.
      :query_values,        # A hash of params and values.
      :fragment,            # Fragment identifier.
      :resource,            # Path, query, and fragment.
      :location             # Directory and resource - everything after the site.
    ].freeze

    def_delegators :@model, *COMPONENTS

    def_delegator :@model, :parsed_domain
    def_delegator :parsed_domain, :labels

    attr_reader :input, :uri, :options

    # @param uri     the value to parse; stored untouched as #input.
    # @param options [Hash] passed through UrlParser::OptionSetter; the
    #   resulting hash always carries +raw: false+.
    # @param blk     optional block forwarded to the parser in place of the
    #   default one built from the options (see #block_builder).
    def initialize(uri, options = {}, &blk)
      @input   = uri
      @options = set_options(options, &blk)
      @block   = blk || block_builder
      @uri     = UrlParser::Parser.call(@input, @options, &@block)
      @model   = UrlParser::Model.new(@uri)
    end

    # @return [Boolean] whether the :unescape option is set.
    def unescaped?
      !!options[:unescape]
    end

    # @return [true] always.
    def parsed?
      true
    end

    # @return [Boolean] whether the :unembed option is set.
    def unembedded?
      !!options[:unembed]
    end

    # @return [Boolean] whether the :canonicalize option is set.
    def canonicalized?
      !!options[:canonicalize]
    end

    # @return [Boolean] whether the :normalize option is set.
    def normalized?
      !!options[:normalize]
    end

    # True when the :clean option is set, or when every individual step
    # (unescape, parse, unembed, canonicalize, normalize) is enabled.
    #
    # @return [Boolean]
    def cleaned?
      !!options[:clean] || (
        unescaped? &&
        parsed? &&
        unembedded? &&
        canonicalized? &&
        normalized?
      )
    end

    # Returns #raw when the instance is already #cleaned?; otherwise re-parses
    # the original input in raw mode with a block that calls +clean!+.
    def clean
      if cleaned?
        raw
      else
        UrlParser::Parser.call(@input, raw: true) { |uri| uri.clean! }
      end
    end

    # Cleans and converts into a naked hostname
    #
    # The naked hostname and the location are joined, parsed in raw mode with
    # a +clean!+ block, and a leading "scheme://" is replaced by "//".
    def canonical
      opts = { raw: true }
      # Interpolation rather than String#+ so that a nil component is treated
      # as an empty string instead of raising.
      # NOTE(review): assumes the delegated readers can return nil (e.g. for
      # an input without a hostname) - confirm against UrlParser::Model.
      curi = "#{naked_hostname}#{location}"

      cleaned = UrlParser::Parser.call(curi, opts) do |uri|
        uri.clean!
      end
      cleaned.sub(%r{\A[a-z]+://}i, '//')
    end

    # @return [Boolean] true when built as cleaned, or when the string form
    #   already equals the result of #clean.
    def clean?
      cleaned? || to_s == clean
    end

    def relative?
      uri.relative?
    end

    def absolute?
      uri.absolute?
    end

    # @return [Boolean] whether the hostname matches LOCALHOST_REGEXP.
    def localhost?
      !!(hostname.to_s[LOCALHOST_REGEXP])
    end

    # @return [String, nil] the hostname when it matches Resolv::IPv4::Regex.
    def ipv4
      hostname.to_s[Resolv::IPv4::Regex]
    end

    def ipv4?
      !!ipv4
    end

    # @return [String, nil] the host when it matches Resolv::IPv6::Regex.
    #
    # NOTE(review): this reads #host (described above as hostname and port)
    # while #ipv4 reads #hostname - confirm the difference is intentional.
    def ipv6
      host.to_s[Resolv::IPv6::Regex]
    end

    def ipv6?
      !!ipv6
    end

    def ip_address?
      ipv4? || ipv6?
    end

    # @return [Boolean] true when the host is not localhost and has no ww? part.
    def naked?
      !localhost? && www.nil?
    end

    # @return [String] string form of the parsed URI.
    def raw
      uri.to_s
    end
    alias_method :to_s, :raw

    # @return [String] hexadecimal SHA1 digest of #raw.
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    # NOTE(review): this replaces Object#hash with a method returning a String.
    # Ruby expects #hash to return an Integer, so instances are unlikely to
    # work as Hash keys or Set members. Kept as-is for backward compatibility.
    alias_method :hash, :sha1

    # Compares the cleaned form of this URI with the cleaned form of +uri+.
    def ==(uri)
      clean == self.class.new(uri, clean: true).clean
    end

    # Compares the canonical form of this URI with the canonical form of +uri+.
    def =~(uri)
      canonical == self.class.new(uri, clean: true).canonical
    end

    # Builds a new instance from +uri+, using this URI's string form as the
    # :base_uri option and reusing this instance's options and block.
    def +(uri)
      self.class.new(uri.to_s, options.merge(base_uri: to_s), &@block)
    end
    alias_method :join, :+

    # False for nil input or a relative URI; true for IP addresses and
    # localhost; otherwise defers to the parsed domain's own validity check.
    #
    # @return [Boolean]
    def valid?
      return false if input.nil? || relative?
      return true if ip_address? || localhost?

      parsed_domain.valid?
    end

    private

    # Runs the options through UrlParser::OptionSetter and forces raw: false.
    def set_options(opts = {}, &blk)
      UrlParser::OptionSetter
        .new(opts, &blk)
        .to_hash
        .merge(raw: false)
    end

    # Default parser block: a single +clean!+ when #cleaned?, otherwise each
    # individual step whose predicate is true, in a fixed order.
    def block_builder
      proc do |uri|
        if cleaned?
          uri.clean!
        else
          uri.unescape!     if unescaped?
          uri.parse!        if parsed?
          uri.unembed!      if unembedded?
          uri.canonicalize! if canonicalized?
          uri.normalize!    if normalized?
        end
      end
    end
  end
end
|
data/lib/url_parser/version.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
2
|
+
|
3
|
+
require 'codeclimate-test-reporter'
|
4
|
+
CodeClimate::TestReporter.start
|
5
|
+
|
1
6
|
require "rspec"
|
7
|
+
|
2
8
|
begin
|
3
9
|
require "pry"
|
4
10
|
rescue LoadError; end
|
@@ -7,19 +13,90 @@ require "url_parser"
|
|
7
13
|
|
8
14
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
9
15
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
10
|
-
#
|
11
|
-
# loaded
|
16
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause this
|
17
|
+
# file to always be loaded, without a need to explicitly require it in any files.
|
18
|
+
#
|
19
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
20
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
21
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
22
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
23
|
+
# a separate helper file that requires the additional dependencies and performs
|
24
|
+
# the additional setup, and require it from the spec files that actually need it.
|
25
|
+
#
|
26
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
27
|
+
# users commonly want.
|
12
28
|
#
|
13
29
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
14
30
|
RSpec.configure do |config|
|
15
|
-
|
31
|
+
|
32
|
+
# rspec-expectations config goes here. You can use an alternate
|
33
|
+
# assertion/expectation library such as wrong or the stdlib/minitest
|
34
|
+
# assertions if you prefer.
|
35
|
+
config.expect_with :rspec do |expectations|
|
36
|
+
# This option will default to `true` in RSpec 4. It makes the `description`
|
37
|
+
# and `failure_message` of custom matchers include text for helper methods
|
38
|
+
# defined using `chain`, e.g.:
|
39
|
+
# be_bigger_than(2).and_smaller_than(4).description
|
40
|
+
# # => "be bigger than 2 and smaller than 4"
|
41
|
+
# ...rather than:
|
42
|
+
# # => "be bigger than 2"
|
43
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
44
|
+
end
|
45
|
+
|
46
|
+
# rspec-mocks config goes here. You can use an alternate test double
|
47
|
+
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
48
|
+
config.mock_with :rspec do |mocks|
|
49
|
+
# Prevents you from mocking or stubbing a method that does not exist on
|
50
|
+
# a real object. This is generally recommended, and will default to
|
51
|
+
# `true` in RSpec 4.
|
52
|
+
mocks.verify_partial_doubles = true
|
53
|
+
end
|
54
|
+
|
55
|
+
# These two settings work together to allow you to limit a spec run
|
56
|
+
# to individual examples or groups you care about by tagging them with
|
57
|
+
# `:focus` metadata. When nothing is tagged with `:focus`, all examples
|
58
|
+
# get run.
|
16
59
|
config.filter_run :focus
|
17
|
-
config.
|
60
|
+
config.run_all_when_everything_filtered = true
|
61
|
+
|
62
|
+
# Limits the available syntax to the non-monkey patched syntax that is recommended.
|
63
|
+
# For more details, see:
|
64
|
+
# - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
|
65
|
+
# - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
66
|
+
# - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
|
67
|
+
config.disable_monkey_patching!
|
68
|
+
|
69
|
+
# This setting enables warnings. It's recommended, but in some cases may
|
70
|
+
# be too noisy due to issues in dependencies.
|
71
|
+
# config.warnings = true
|
72
|
+
|
73
|
+
# Many RSpec users commonly either run the entire suite or an individual
|
74
|
+
# file, and it's useful to allow more verbose output when running an
|
75
|
+
# individual spec file.
|
76
|
+
# if config.files_to_run.one?
|
77
|
+
# Use the documentation formatter for detailed output,
|
78
|
+
# unless a formatter has already been configured
|
79
|
+
# (e.g. via a command-line flag).
|
80
|
+
# config.default_formatter = 'doc'
|
81
|
+
# end
|
82
|
+
|
83
|
+
# Print the 10 slowest examples and example groups at the
|
84
|
+
# end of the spec run, to help surface which specs are running
|
85
|
+
# particularly slow.
|
86
|
+
# config.profile_examples = 10
|
87
|
+
|
18
88
|
# Run specs in random order to surface order dependencies. If you find an
|
19
89
|
# order dependency and want to debug it, you can fix the order by providing
|
20
90
|
# the seed, which is printed after each run.
|
21
91
|
# --seed 1234
|
22
|
-
config.order =
|
23
|
-
end
|
92
|
+
config.order = :random
|
24
93
|
|
94
|
+
# Seed global randomization in this process using the `--seed` CLI option.
|
95
|
+
# Setting this allows you to use `--seed` to deterministically reproduce
|
96
|
+
# test failures related to randomization by passing the same `--seed` value
|
97
|
+
# as the one that triggered the failure.
|
98
|
+
Kernel.srand config.seed
|
99
|
+
|
100
|
+
end
|
25
101
|
|
102
|
+
# Load every Ruby file under spec/support. Dir[] does not guarantee an
# ordering on every Ruby version/filesystem, so sort for a stable load order.
Dir[File.dirname(__FILE__) + "/support/**/*.rb"].sort.each { |f| require f }
|
File without changes
|
@@ -0,0 +1,7 @@
|
|
1
|
+
RSpec.configure do |c|
  # Examples tagged with :disable_raise_error_warning run with RSpec's
  # "potential false positives" warning switched off. The previous setting is
  # captured first and put back afterwards (even if the example call raises),
  # instead of unconditionally forcing the warning back on.
  c.around(:each, :disable_raise_error_warning) do |example|
    expectations = RSpec::Expectations.configuration
    previous = expectations.warn_about_potential_false_positives?
    expectations.warn_about_potential_false_positives = false
    begin
      example.call
    ensure
      expectations.warn_about_potential_false_positives = previous
    end
  end
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for UrlParser::Domain: input normalisation in .new, the label/suffix
# readers, and the validation rules reported through #valid? and #errors.
RSpec.describe UrlParser::Domain do

  context ".new" do

    it "downcases the input" do
      instance = described_class.new('EXAMPLE.COM')
      expect(instance.original).to eq 'example.com'
    end

    it "removes the root label from absolute domains" do
      instance = described_class.new('example.com.')
      expect(instance.original).to eq 'example.com'
    end

    it "sets #original as the input string" do
      instance = described_class.new("💩.la")
      expect(instance.original).to eq "💩.la"
    end

    it "sets the name as a string containing only ASCII characters" do
      instance = described_class.new("💩.la")
      expect(instance.name).to eq "xn--ls8h.la"
    end

  end

  context "#labels" do

    it "returns an array of domain parts" do
      instance = described_class.new('www.my.example.com')
      expect(instance.labels).to eq(["com", "example", "my", "www"])
    end

  end

  context "#suffix" do

    it "when domain is valid, returns a PublicSuffix::Domain" do
      instance = described_class.new('my.example.com')
      expect(instance.suffix).to be_a PublicSuffix::Domain
    end

    it "with a PublicSuffix::Domain, a call to #to_s returns the domain" do
      instance = described_class.new('my.example.com')
      expect(instance.suffix.to_s).to eq 'my.example.com'
    end

    it "when domain is invalid, returns a OpenStruct" do
      instance = described_class.new('//')
      expect(instance.suffix).to be_a OpenStruct
    end

    it "when domain is invalid, a call to #to_s returns an empty string" do
      instance = described_class.new('//')
      expect(instance.suffix.to_s).to eq ''
    end

  end

  context "#tld" do

    it "when domain is valid, returns the top level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.tld).to eq 'com'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.tld).to be_nil
    end

  end

  context "#sld" do

    it "when domain is valid, returns the second level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.sld).to eq 'example'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.sld).to be_nil
    end

  end

  context "#trd" do

    it "when domain is valid, returns the third level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.trd).to eq 'www.my'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.trd).to be_nil
    end

  end

  context "#valid?" do

    it "does not fail on an empty string" do
      instance = described_class.new("")
      expect(instance).not_to be_valid
    end

    it "is false when containing invalid characters" do
      instance = described_class.new('my&example.com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "contains invalid characters"
    end

    it "is true with a valid suffix" do
      instance = described_class.new('example.co.uk')
      expect(instance).to be_valid
    end

    it "is false with an invalid suffix" do
      instance = described_class.new('//')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "'//' is not a valid domain"
    end

    it "is true with 127 labels or less" do
      instance = described_class.new('.'*126+'com')
      expect(instance).to be_valid
    end

    it "is false when exceeding 127 labels" do
      instance = described_class.new('.'*127+'com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds 127 labels"
    end

    it "is true when no labels are greater than 63 characters" do
      instance = described_class.new('a'*63+'.com')
      expect(instance).to be_valid
    end

    it "is false with labels greater than 63 characters" do
      instance = described_class.new('a'*64+'.com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds maximum label length of 63 characters"
    end

    # 5 labels x 49 characters + 4 separating dots + ".com" = 253 characters.
    it "is true with 253 ASCII characters or less" do
      instance = described_class.new('a'*49+'.'+'b'*49+'.'+'c'*49+'.'+'d'*49+'.'+'e'*49+'.com')
      expect(instance).to be_valid
    end

    # Same labels with ".aero" instead of ".com" = 254 characters.
    it "is false when exceeding 253 ASCII characters" do
      instance = described_class.new('a'*49+'.'+'b'*49+'.'+'c'*49+'.'+'d'*49+'.'+'e'*49+'.aero')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds 253 ASCII characters"
    end

  end

end
|