url_parser 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
@@ -0,0 +1,206 @@
|
|
1
|
+
require 'forwardable'
|
2
|
+
require 'resolv'
|
3
|
+
|
4
|
+
module UrlParser
  # Wraps a parsed URI and exposes its components together with helpers for
  # cleaning, comparing, joining, and validating it.
  #
  # Parsing is handed to UrlParser::Parser, option normalisation to
  # UrlParser::OptionSetter, and every component reader listed in COMPONENTS
  # is delegated to a UrlParser::Model built from the parsed URI.
  class URI
    extend Forwardable

    # Matches "localhost" either as the whole hostname or as its last label.
    LOCALHOST_REGEXP = /(\A|\.)localhost\z/

    # Component readers delegated to the model. Frozen so the public list
    # cannot be mutated after the delegators have been generated from it.
    COMPONENTS = [
      :scheme,              # Top level URI naming structure / protocol.
      :username,            # Username portion of the userinfo.
      :user,                # Alias for #username.
      :password,            # Password portion of the userinfo.
      :userinfo,            # URI username and password for authentication.
      :hostname,            # Fully qualified domain name or IP address.
      :naked_hostname,      # Hostname without any ww? prefix.
      :port,                # Port number.
      :host,                # Hostname and port.
      :www,                 # The ww? portion of the subdomain.
      :tld,                 # Returns the top level domain portion, aka the extension.
      :top_level_domain,    # Alias for #tld.
      :extension,           # Alias for #tld.
      :sld,                 # Returns the second level domain portion, aka the domain part.
      :second_level_domain, # Alias for #sld.
      :domain_name,         # Alias for #sld.
      :trd,                 # Returns the third level domain portion, aka the subdomain part.
      :third_level_domain,  # Alias for #trd.
      :subdomains,          # Alias for #trd.
      :naked_trd,           # Any non-ww? subdomains.
      :naked_subdomain,     # Alias for #naked_trd.
      :domain,              # The domain name with the tld.
      :subdomain,           # All subdomains, include ww?.
      :origin,              # Scheme and host.
      :authority,           # Userinfo and host.
      :site,                # Scheme, userinfo, and host.
      :path,                # Directory and segment.
      :segment,             # Last portion of the path.
      :directory,           # Any directories following the site within the URI.
      :filename,            # Segment if a file extension is present.
      :suffix,              # The file extension of the filename.
      :query,               # Params and values as a string.
      :query_values,        # A hash of params and values.
      :fragment,            # Fragment identifier.
      :resource,            # Path, query, and fragment.
      :location             # Directory and resource - everything after the site.
    ].freeze

    def_delegators :@model, *COMPONENTS

    def_delegator :@model, :parsed_domain
    def_delegator :parsed_domain, :labels

    attr_reader :input, :uri, :options

    # Parses +uri+ and builds the component model.
    #
    # uri     - the raw input; kept untouched in #input.
    # options - Hash run through UrlParser::OptionSetter (see #set_options).
    # blk     - optional block forwarded to UrlParser::Parser.call in place of
    #           the default processing block built by #block_builder.
    def initialize(uri, options = {}, &blk)
      @input   = uri
      @options = set_options(options, &blk)
      @block   = blk || block_builder
      @uri     = UrlParser::Parser.call(@input, @options, &@block)
      @model   = UrlParser::Model.new(@uri)
    end

    # True when the :unescape option is set.
    def unescaped?
      !!options[:unescape]
    end

    # Always true: an instance only exists once its input has been parsed.
    def parsed?
      true
    end

    # True when the :unembed option is set.
    def unembedded?
      !!options[:unembed]
    end

    # True when the :canonicalize option is set.
    def canonicalized?
      !!options[:canonicalize]
    end

    # True when the :normalize option is set.
    def normalized?
      !!options[:normalize]
    end

    # True when the :clean option is set, or when every individual
    # processing step was requested through its own option.
    def cleaned?
      !!options[:clean] || (
        unescaped? &&
        parsed? &&
        unembedded? &&
        canonicalized? &&
        normalized?
      )
    end

    # Returns the cleaned form of the URI. An already-cleaned instance just
    # returns #raw; otherwise the original input is parsed again with clean!
    # applied (raw: true presumably makes the parser return a String, since
    # the result is compared against #to_s in #clean? -- confirm in Parser).
    def clean
      if cleaned?
        raw
      else
        UrlParser::Parser.call(@input, raw: true) { |uri| uri.clean! }
      end
    end

    # Cleans and converts into a naked hostname
    #
    # The naked hostname and location are joined by interpolation rather than
    # String#+, so a missing hostname or location contributes an empty string
    # instead of raising. Any leading "scheme://" on the cleaned result is
    # replaced with "//".
    def canonical
      opts = { raw: true }
      curi = "#{naked_hostname}#{location}"

      UrlParser::Parser.call(curi, opts) do |uri|
        uri.clean!
      end.sub(%r{\A[a-z]+://}i, '//')
    end

    # True when the instance was cleaned at construction, or when its string
    # form already equals the cleaned form.
    def clean?
      cleaned? || self.to_s == clean
    end

    # Delegates to the parsed URI object.
    def relative?
      uri.relative?
    end

    # Delegates to the parsed URI object.
    def absolute?
      uri.absolute?
    end

    # True when the hostname is, or ends in the label, "localhost".
    def localhost?
      !!(hostname.to_s[LOCALHOST_REGEXP])
    end

    # The portion of the hostname matching Resolv::IPv4::Regex, or nil.
    def ipv4
      hostname.to_s[Resolv::IPv4::Regex]
    end

    def ipv4?
      !!ipv4
    end

    # The portion of the host matching Resolv::IPv6::Regex, or nil.
    #
    # NOTE(review): this reads #host (described above as "hostname and port")
    # while #ipv4 reads #hostname -- confirm the difference is intentional.
    def ipv6
      host.to_s[Resolv::IPv6::Regex]
    end

    def ipv6?
      !!ipv6
    end

    def ip_address?
      ipv4? || ipv6?
    end

    # True when the host is not localhost and carries no ww? prefix.
    def naked?
      !localhost? && www.nil?
    end

    # String form of the parsed URI.
    def raw
      uri.to_s
    end
    alias_method :to_s, :raw

    # SHA1 hex digest of #raw. Digest::SHA1 is not required in this file and
    # is expected to have been loaded elsewhere.
    def sha1
      Digest::SHA1.hexdigest(raw)
    end
    # NOTE(review): this replaces Object#hash with a String digest, whereas
    # Ruby expects #hash to return an Integer when an object is used as a
    # Hash key -- confirm instances are never used that way.
    alias_method :hash, :sha1

    # Equality on the cleaned form: +uri+ is parsed with clean: true and the
    # two cleaned strings are compared.
    def ==(uri)
      clean == self.class.new(uri, clean: true).clean
    end

    # Looser match on the canonical form (see #canonical).
    def =~(uri)
      canonical == self.class.new(uri, clean: true).canonical
    end

    # Builds a new instance from +uri+ with this URI passed along as the
    # :base_uri option, reusing this instance's options and processing block.
    def +(uri)
      self.class.new(uri.to_s, options.merge({ base_uri: self.to_s }), &@block)
    end
    alias_method :join, :+

    # False for nil input or a relative URI; true for IP addresses and
    # localhost; otherwise defers to the parsed domain's own validation.
    def valid?
      return false if input.nil? || relative?
      return true if ip_address? || localhost?
      parsed_domain.valid?
    end

    private

    # Normalises the user supplied options through UrlParser::OptionSetter and
    # forces raw: false for the parse performed in #initialize.
    def set_options(opts = {}, &blk)
      UrlParser::OptionSetter
        .new(opts, &blk)
        .to_hash
        .merge(raw: false)
    end

    # Default processing block handed to the parser: either a single clean!
    # or only the individual steps that were enabled through the options.
    def block_builder
      proc do |uri|
        if cleaned?
          uri.clean!
        else
          uri.unescape!     if unescaped?
          uri.parse!        if parsed?
          uri.unembed!      if unembedded?
          uri.canonicalize! if canonicalized?
          uri.normalize!    if normalized?
        end
      end
    end

  end
end
|
data/lib/url_parser/version.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
2
|
+
|
3
|
+
require 'codeclimate-test-reporter'
|
4
|
+
CodeClimate::TestReporter.start
|
5
|
+
|
1
6
|
require "rspec"
|
7
|
+
|
2
8
|
begin
|
3
9
|
require "pry"
|
4
10
|
rescue LoadError; end
|
@@ -7,19 +13,90 @@ require "url_parser"
|
|
7
13
|
|
8
14
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
9
15
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
10
|
-
#
|
11
|
-
# loaded
|
16
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause this
|
17
|
+
# file to always be loaded, without a need to explicitly require it in any files.
|
18
|
+
#
|
19
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
20
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
21
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
22
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
23
|
+
# a separate helper file that requires the additional dependencies and performs
|
24
|
+
# the additional setup, and require it from the spec files that actually need it.
|
25
|
+
#
|
26
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
27
|
+
# users commonly want.
|
12
28
|
#
|
13
29
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
14
30
|
RSpec.configure do |rspec|

  # Settings for rspec-expectations. A different assertion library (wrong,
  # stdlib/minitest assertions, ...) could be plugged in here instead.
  rspec.expect_with :rspec do |expect_config|
    # Make custom matcher `description` and `failure_message` output include
    # the text of helper methods defined with `chain`, e.g.
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # instead of
    #     # => "be bigger than 2"
    # RSpec 4 will turn this on by default.
    expect_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # Settings for rspec-mocks. Swap `mock_with` to use another test double
  # library (bogus, mocha, ...).
  rspec.mock_with :rspec do |mock_config|
    # Refuse to mock or stub methods that do not exist on the real object.
    # Generally recommended; RSpec 4 will turn this on by default.
    mock_config.verify_partial_doubles = true
  end

  # Together these let a run be narrowed to examples or groups tagged with
  # `:focus`; when nothing carries the tag, everything runs.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true

  # Restrict the DSL to the recommended non-monkey-patched syntax. See:
  #   - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
  #   - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
  rspec.disable_monkey_patching!

  # Enabling warnings is recommended, but can be too noisy when dependencies
  # emit warnings of their own.
  # rspec.warnings = true

  # When only a single spec file is run, more verbose output is often useful.
  # if rspec.files_to_run.one?
  #   # Use the documentation formatter for detailed output, unless a
  #   # formatter has already been configured (e.g. via a command-line flag).
  #   rspec.default_formatter = 'doc'
  # end

  # Report the 10 slowest examples and example groups at the end of the run
  # to help find particularly slow specs.
  # rspec.profile_examples = 10

  # Randomise spec order to surface order dependencies. To debug one, fix the
  # order by passing the seed printed after each run:
  #     --seed 1234
  rspec.order = :random

  # Seed global randomisation from the `--seed` CLI option so that failures
  # caused by randomisation can be reproduced by passing the same seed value.
  Kernel.srand rspec.seed

end

# Load every Ruby file found under spec/support.
support_glob = File.dirname(__FILE__) + "/support/**/*.rb"
Dir[support_glob].each { |helper| require helper }
|
File without changes
|
@@ -0,0 +1,7 @@
|
|
1
|
+
RSpec.configure do |c|
  # For examples tagged :disable_raise_error_warning, switch off
  # rspec-expectations' "potential false positive" warning while the example
  # runs, then put the setting back to whatever it was beforehand.
  c.around(:each, :disable_raise_error_warning) do |example|
    expectations = RSpec::Expectations.configuration
    # Remember the current value instead of assuming it was enabled, so a
    # suite that disabled the warning globally is left unchanged.
    previous = expectations.warn_about_potential_false_positives?
    expectations.warn_about_potential_false_positives = false
    begin
      example.call
    ensure
      # Restore even if running the example raises out of the hook.
      expectations.warn_about_potential_false_positives = previous
    end
  end
end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for UrlParser::Domain: input normalisation, label splitting, public
# suffix lookup, and the length/character validation rules.
RSpec.describe UrlParser::Domain do

  context ".new" do

    it "downcases the input" do
      instance = described_class.new('EXAMPLE.COM')
      expect(instance.original).to eq 'example.com'
    end

    it "removes the root label from absolute domains" do
      instance = described_class.new('example.com.')
      expect(instance.original).to eq 'example.com'
    end

    it "sets #original as the input string" do
      instance = described_class.new("💩.la")
      expect(instance.original).to eq "💩.la"
    end

    it "sets the name as a string containing only ASCII characters" do
      instance = described_class.new("💩.la")
      expect(instance.name).to eq "xn--ls8h.la"
    end

  end

  context "#labels" do

    # Labels are expected from the top level domain downwards.
    it "returns an array of domain parts" do
      instance = described_class.new('www.my.example.com')
      expect(instance.labels).to eq(["com", "example", "my", "www"])
    end

  end

  context "#suffix" do

    it "when domain is valid, returns a PublicSuffix::Domain" do
      instance = described_class.new('my.example.com')
      expect(instance.suffix).to be_a PublicSuffix::Domain
    end

    it "with a PublicSuffix::Domain, a call to #to_s returns the domain" do
      instance = described_class.new('my.example.com')
      expect(instance.suffix.to_s).to eq 'my.example.com'
    end

    it "when domain is invalid, returns a OpenStruct" do
      instance = described_class.new('//')
      expect(instance.suffix).to be_a OpenStruct
    end

    it "when domain is invalid, a call to #to_s returns an empty string" do
      instance = described_class.new('//')
      expect(instance.suffix.to_s).to eq ''
    end

  end

  context "#tld" do

    it "when domain is valid, returns the top level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.tld).to eq 'com'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.tld).to be_nil
    end

  end

  context "#sld" do

    it "when domain is valid, returns the second level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.sld).to eq 'example'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.sld).to be_nil
    end

  end

  context "#trd" do

    it "when domain is valid, returns the third level domain" do
      instance = described_class.new('www.my.example.com')
      expect(instance.trd).to eq 'www.my'
    end

    it "when domain is invalid, returns nil" do
      instance = described_class.new('//')
      expect(instance.trd).to be_nil
    end

  end

  context "#valid?" do

    it "does not fail on an empty string" do
      instance = described_class.new("")
      expect(instance).not_to be_valid
    end

    it "is false when containing invalid characters" do
      instance = described_class.new('my&example.com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "contains invalid characters"
    end

    it "is true with a valid suffix" do
      instance = described_class.new('example.co.uk')
      expect(instance).to be_valid
    end

    it "is false with an invalid suffix" do
      instance = described_class.new('//')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "'//' is not a valid domain"
    end

    it "is true with 127 labels or less" do
      instance = described_class.new('.'*126+'com')
      expect(instance).to be_valid
    end

    it "is false when exceeding 127 labels" do
      instance = described_class.new('.'*127+'com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds 127 labels"
    end

    it "is true when no labels are greater than 63 characters" do
      instance = described_class.new('a'*63+'.com')
      expect(instance).to be_valid
    end

    it "is false with labels greater than 63 characters" do
      instance = described_class.new('a'*64+'.com')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds maximum label length of 63 characters"
    end

    # 5 labels of 49 characters + 5 dots + "com" = 253 characters.
    it "is true with 253 ASCII characters or less" do
      instance = described_class.new('a'*49+'.'+'b'*49+'.'+'c'*49+'.'+'d'*49+'.'+'e'*49+'.com')
      expect(instance).to be_valid
    end

    # Same labels with the four character "aero" suffix = 254 characters.
    it "is false when exceeding 253 ASCII characters" do
      instance = described_class.new('a'*49+'.'+'b'*49+'.'+'c'*49+'.'+'d'*49+'.'+'e'*49+'.aero')
      expect(instance).not_to be_valid
      expect(instance.errors).to include "exceeds 253 ASCII characters"
    end

  end

end
|