url_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e2a74ec366bad04988dd1f34963428f2f926fe08
|
4
|
+
data.tar.gz: b3fdc8e738b103bc74820bb94ad2327d14ec6dc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0bd4a7b601dc635af88e0ff0a8a601de05ff6b4cbee00a5e479f0b23df4a447fbd854f10ea4e1164851d56de661df5e322878f032277e7da87b75cf94f8bd8e8
|
7
|
+
data.tar.gz: afb86d3bf4f117840bef16d2f43ca205ce7e7697416850b28b18c83a5d639444d2d5a167d93cf50dac955d6e4a04b0bd2ecac8c4224e8d3a1dce29406d1c8349
|
data/.gitignore
CHANGED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
url_parser
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.3.0
|
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
v0.5.0 / 2016-02-09
|
2
|
+
======================
|
3
|
+
|
4
|
+
* Updated README.md
|
5
|
+
* Added CHANGELOG.md
|
6
|
+
* Only tag errors that inherit from StandardError
|
7
|
+
* Deprecate UrlParser.new, is now UrlParser.parse
|
8
|
+
* Added UrlParser::URI#ipv4 and UrlParser::URI#ipv6 to return the actual values, if applicable
|
9
|
+
* Added [gem_config](https://github.com/krautcomputing/gem_config) for configurable library settings :embedded_params, :default_scheme, and :scheme_map, see README.md for usage
|
10
|
+
* Add UrlParser module functions .parse, .unembed, .normalize, .canonicalize, and .clean
|
11
|
+
* Add UrlParser::Domain to handle domain name validations
|
12
|
+
* Add UrlParser .escape and .unescape to encode and decode strings
|
13
|
+
* Add UrlParser::Parser class for unescaping, parsing, unembedding, canonicalization, normalization, and hashing URI strings
|
14
|
+
* Add UrlParser::URI#naked_hostname to return the entire hostname without any ww? prefix
|
15
|
+
* Refactored UrlParser::URI and UrlParser::Parser classes, see README.md for updated usage
|
16
|
+
* Added 'addressable' to gemspec
|
17
|
+
* Remove 'naught' gem dependency
|
18
|
+
* Remove 'activemodel' gem dependency
|
19
|
+
* Remove 'activesupport' gem dependency
|
20
|
+
* Remove 'postrank-uri' gem dependency
|
data/Gemfile
CHANGED
data/Guardfile
CHANGED
@@ -1,17 +1,50 @@
|
|
1
1
|
# A sample Guardfile
|
2
2
|
# More info at https://github.com/guard/guard#readme
|
3
3
|
|
4
|
+
## Uncomment and set this to only include directories you want to watch
|
5
|
+
# directories %w(app lib config test spec features)
|
6
|
+
|
7
|
+
## Uncomment to clear the screen before every task
|
8
|
+
# clearing :on
|
9
|
+
|
10
|
+
## Guard internally checks for changes in the Guardfile and exits.
|
11
|
+
## If you want Guard to automatically start up again, run guard in a
|
12
|
+
## shell loop, e.g.:
|
13
|
+
##
|
14
|
+
## $ while bundle exec guard; do echo "Restarting Guard..."; done
|
15
|
+
##
|
16
|
+
## Note: if you are using the `directories` clause above and you are not
|
17
|
+
## watching the project directory ('.'), then you will want to move
|
18
|
+
## the Guardfile to a watched dir and symlink it back, e.g.
|
19
|
+
#
|
20
|
+
# $ mkdir config
|
21
|
+
# $ mv Guardfile config/
|
22
|
+
# $ ln -s config/Guardfile .
|
23
|
+
#
|
24
|
+
# and, you'll have to watch "config/Guardfile" instead of "Guardfile"
|
25
|
+
|
4
26
|
# Note: The cmd option is now required due to the increasing number of ways
|
5
27
|
# rspec may be run, below are examples of the most common uses.
|
6
28
|
# * bundler: 'bundle exec rspec'
|
7
29
|
# * bundler binstubs: 'bin/rspec'
|
8
|
-
# * spring: 'bin/
|
30
|
+
# * spring: 'bin/rspec' (This will use spring if running and you have
|
9
31
|
# installed the spring binstubs per the docs)
|
10
|
-
# * zeus: 'zeus rspec' (requires the server to be started
|
32
|
+
# * zeus: 'zeus rspec' (requires the server to be started separately)
|
11
33
|
# * 'just' rspec: 'rspec'
|
12
|
-
guard :rspec, cmd: 'bundle exec rspec' do
|
13
|
-
watch(%r{^spec/.+_spec\.rb$})
|
14
|
-
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
15
|
-
watch('spec/spec_helper.rb') { "spec" }
|
16
|
-
end
|
17
34
|
|
35
|
+
guard :rspec, cmd: "bundle exec rspec" do
|
36
|
+
require "guard/rspec/dsl"
|
37
|
+
dsl = Guard::RSpec::Dsl.new(self)
|
38
|
+
|
39
|
+
# Feel free to open issues for suggestions and improvements
|
40
|
+
|
41
|
+
# RSpec files
|
42
|
+
rspec = dsl.rspec
|
43
|
+
watch(rspec.spec_helper) { rspec.spec_dir }
|
44
|
+
watch(rspec.spec_support) { rspec.spec_dir }
|
45
|
+
watch(rspec.spec_files)
|
46
|
+
|
47
|
+
# Ruby files
|
48
|
+
ruby = dsl.ruby
|
49
|
+
dsl.watch_spec_files_for(ruby.lib_files)
|
50
|
+
end
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
# UrlParser
|
2
2
|
|
3
|
-
|
3
|
+
[](https://rubygems.org/gems/url_parser)
|
4
|
+
[](http://travis-ci.org/activefx/url_parser)
|
5
|
+
[](https://codeclimate.com/github/activefx/url_parser)
|
6
|
+
[](https://codeclimate.com/github/activefx/url_parser/coverage)
|
7
|
+
[](https://gemnasium.com/activefx/url_parser)
|
4
8
|
|
5
|
-
|
6
|
-
- https://github.com/pauldix/domainatrix
|
7
|
-
- https://github.com/postrank-labs/postrank-uri
|
9
|
+
Extended URI capabilities built on top of Addressable::URI. Parse URIs into granular components, unescape encoded characters, extract embedded URIs, normalize URIs, handle canonical url generation, and validate domains. Inspired by [PostRank-URI](https://github.com/postrank-labs/postrank-uri) and [URI.js](https://github.com/medialize/URI.js).
|
8
10
|
|
9
11
|
## Installation
|
10
12
|
|
@@ -20,9 +22,303 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
$ gem install url_parser
|
22
24
|
|
25
|
+
## Example
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
uri = UrlParser.parse('foo://username:password@ww2.foo.bar.example.com:123/hello/world/there.html?name=ferret#foo')
|
29
|
+
|
30
|
+
uri.class #=> UrlParser::URI
|
31
|
+
uri.scheme #=> 'foo'
|
32
|
+
uri.username #=> 'username'
|
33
|
+
uri.user #=> 'username' # Alias for #username
|
34
|
+
uri.password #=> 'password'
|
35
|
+
uri.userinfo #=> 'username:password'
|
36
|
+
uri.hostname #=> 'ww2.foo.bar.example.com'
|
37
|
+
uri.naked_hostname #=> 'foo.bar.example.com'
|
38
|
+
uri.port #=> 123
|
39
|
+
uri.host #=> 'ww2.foo.bar.example.com:123'
|
40
|
+
uri.www #=> 'ww2'
|
41
|
+
uri.tld #=> 'com'
|
42
|
+
uri.top_level_domain #=> 'com' # Alias for #tld
|
43
|
+
uri.extension #=> 'com' # Alias for #tld
|
44
|
+
uri.sld #=> 'example'
|
45
|
+
uri.second_level_domain #=> 'example' # Alias for #sld
|
46
|
+
uri.domain_name #=> 'example' # Alias for #sld
|
47
|
+
uri.trd #=> 'ww2.foo.bar'
|
48
|
+
uri.third_level_domain #=> 'ww2.foo.bar' # Alias for #trd
|
49
|
+
uri.subdomains #=> 'ww2.foo.bar' # Alias for #trd
|
50
|
+
uri.naked_trd #=> 'foo.bar'
|
51
|
+
uri.naked_subdomain #=> 'foo.bar' # Alias for #naked_trd
|
52
|
+
uri.domain #=> 'example.com'
|
53
|
+
uri.subdomain #=> 'ww2.foo.bar.example.com'
|
54
|
+
uri.origin #=> 'foo://ww2.foo.bar.example.com:123'
|
55
|
+
uri.authority #=> 'username:password@ww2.foo.bar.example.com:123'
|
56
|
+
uri.site #=> 'foo://username:password@ww2.foo.bar.example.com:123'
|
57
|
+
uri.path #=> '/hello/world/there.html'
|
58
|
+
uri.segment #=> 'there.html'
|
59
|
+
uri.directory #=> '/hello/world'
|
60
|
+
uri.filename #=> 'there.html'
|
61
|
+
uri.suffix #=> 'html'
|
62
|
+
uri.query #=> 'name=ferret'
|
63
|
+
uri.query_values #=> { 'name' => 'ferret' }
|
64
|
+
uri.fragment #=> 'foo'
|
65
|
+
uri.resource #=> 'there.html?name=ferret#foo'
|
66
|
+
uri.location #=> '/hello/world/there.html?name=ferret#foo'
|
67
|
+
```
|
68
|
+
|
23
69
|
## Usage
|
24
70
|
|
25
|
-
|
71
|
+
### Parse
|
72
|
+
|
73
|
+
Parse takes the provided URI and breaks it down into its component parts. For a full list of the components provided, see [URI Data Model](#uri-data-model). If you provide an instance of Addressable::URI, it will consider the URI already parsed.
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
uri = UrlParser.parse('http://example.org/foo?bar=baz')
|
77
|
+
uri.class
|
78
|
+
#=> UrlParser::URI
|
79
|
+
```
|
80
|
+
|
81
|
+
Unembed, canonicalize, normalize, and clean all rely on parse.
|
82
|
+
|
83
|
+
### Unembed
|
84
|
+
|
85
|
+
Unembed searches the provided URI's query values for redirection urls. By default, it searches the `u` and `url` params, however you can configure custom params to search.
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
uri = UrlParser.unembed('http://energy.gov/exit?url=https%3A//twitter.com/energy')
|
89
|
+
uri.to_s
|
90
|
+
#=> "https://twitter.com/energy"
|
91
|
+
```
|
92
|
+
|
93
|
+
With custom embedded params keys:
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
uri = UrlParser.unembed('https://www.upwork.com/leaving?ref=https%3A%2F%2Fwww.example.com', embedded_params: [ 'u', 'url', 'ref' ])
|
97
|
+
uri.to_s
|
98
|
+
#=> "https://www.example.com/"
|
99
|
+
```
|
100
|
+
|
101
|
+
### Canonicalize
|
102
|
+
|
103
|
+
Canonicalize applies filters on param keys to remove common tracking params, attempting to make it easier to identify duplicate URIs. For a full list of params, see `db.yml`.
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
uri = UrlParser.canonicalize('https://en.wikipedia.org/wiki/Ruby_(programming_language)?source=ABCD&utm_source=EFGH')
|
107
|
+
uri.to_s
|
108
|
+
#=> "https://en.wikipedia.org/wiki/Ruby_(programming_language)?"
|
109
|
+
```
|
110
|
+
|
111
|
+
### Normalize
|
112
|
+
|
113
|
+
Normalize standardizes paths, query strings, anchors, whitespace, hostnames, and trailing slashes.
|
114
|
+
|
115
|
+
```ruby
|
116
|
+
# Normalize paths
|
117
|
+
uri = UrlParser.normalize('http://example.com/a/b/../../')
|
118
|
+
uri.to_s
|
119
|
+
#=> "http://example.com/"
|
120
|
+
|
121
|
+
# Normalize query strings
|
122
|
+
uri = UrlParser.normalize('http://example.com/?')
|
123
|
+
uri.to_s
|
124
|
+
#=> "http://example.com/"
|
125
|
+
|
126
|
+
# Normalize anchors
|
127
|
+
uri = UrlParser.normalize('http://example.com/#test')
|
128
|
+
uri.to_s
|
129
|
+
#=> "http://example.com/"
|
130
|
+
|
131
|
+
# Normalize whitespace
|
132
|
+
uri = UrlParser.normalize('http://example.com/a/../? #test')
|
133
|
+
uri.to_s
|
134
|
+
#=> "http://example.com/"
|
135
|
+
|
136
|
+
# Normalize hostnames
|
137
|
+
uri = UrlParser.normalize("💩.la")
|
138
|
+
uri.to_s
|
139
|
+
#=> "http://xn--ls8h.la/"
|
140
|
+
|
141
|
+
# Normalize trailing slashes
|
142
|
+
uri = UrlParser.normalize('http://example.com/a/b/')
|
143
|
+
uri.to_s
|
144
|
+
#=> "http://example.com/a/b"
|
145
|
+
```
|
146
|
+
|
147
|
+
### Clean
|
148
|
+
|
149
|
+
Clean combines parsing, unembedding, canonicalization, and normalization into a single call. It is designed to provide a method for cross-referencing identical urls.
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
uri = UrlParser.clean('http://example.com/a/../?url=https%3A//💩.la/&utm_source=google')
|
153
|
+
uri.to_s
|
154
|
+
#=> "https://xn--ls8h.la/"
|
155
|
+
|
156
|
+
uri = UrlParser.clean('https://en.wikipedia.org/wiki/Ruby_(programming_language)?source=ABCD&utm_source%3Danalytics')
|
157
|
+
uri.to_s
|
158
|
+
#=> "https://en.wikipedia.org/wiki/Ruby_(programming_language)"
|
159
|
+
```
|
160
|
+
|
161
|
+
## UrlParser::URI
|
162
|
+
|
163
|
+
Parsing a URI with UrlParser returns an instance of `UrlParser::URI`, with the following methods available:
|
164
|
+
|
165
|
+
### URI Data Model
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
* :scheme # Top level URI naming structure / protocol.
|
169
|
+
* :username # Username portion of the userinfo.
|
170
|
+
* :user # Alias for #username.
|
171
|
+
* :password # Password portion of the userinfo.
|
172
|
+
* :userinfo # URI username and password for authentication.
|
173
|
+
* :hostname # Fully qualified domain name or IP address.
|
174
|
+
* :naked_hostname # Hostname without any ww? prefix.
|
175
|
+
* :port # Port number.
|
176
|
+
* :host # Hostname and port.
|
177
|
+
* :www # The ww? portion of the subdomain.
|
178
|
+
* :tld # Returns the top level domain portion, aka the extension.
|
179
|
+
* :top_level_domain # Alias for #tld.
|
180
|
+
* :extension # Alias for #tld.
|
181
|
+
* :sld # Returns the second level domain portion, aka the domain part.
|
182
|
+
* :second_level_domain # Alias for #sld.
|
183
|
+
* :domain_name # Alias for #sld.
|
184
|
+
* :trd # Returns the third level domain portion, aka the subdomain part.
|
185
|
+
* :third_level_domain # Alias for #trd.
|
186
|
+
* :subdomains # Alias for #trd.
|
187
|
+
* :naked_trd # Any non-ww? subdomains.
|
188
|
+
* :naked_subdomain # Alias for #naked_trd.
|
189
|
+
* :domain # The domain name with the tld.
|
190
|
+
* :subdomain # All subdomains, including ww?.
|
191
|
+
* :origin # Scheme and host.
|
192
|
+
* :authority # Userinfo and host.
|
193
|
+
* :site # Scheme, userinfo, and host.
|
194
|
+
* :path # Directory and segment.
|
195
|
+
* :segment # Last portion of the path.
|
196
|
+
* :directory # Any directories following the site within the URI.
|
197
|
+
* :filename # Segment if a file extension is present.
|
198
|
+
* :suffix # The file extension of the filename.
|
199
|
+
* :query # Params and values as a string.
|
200
|
+
* :query_values # A hash of params and values.
|
201
|
+
* :fragment # Fragment identifier.
|
202
|
+
* :resource # Path, query, and fragment.
|
203
|
+
* :location # Directory and resource - everything after the site.
|
204
|
+
```
|
205
|
+
|
206
|
+
### Additional URI Methods
|
207
|
+
|
208
|
+
```ruby
|
209
|
+
uri = UrlParser.clean('#')
|
210
|
+
uri.unescaped? #=> true
|
211
|
+
uri.parsed? #=> true
|
212
|
+
uri.unembedded? #=> true
|
213
|
+
uri.canonicalized? #=> true
|
214
|
+
uri.normalized? #=> true
|
215
|
+
uri.cleaned? #=> true
|
216
|
+
|
217
|
+
# IP / localhost methods
|
218
|
+
uri.localhost?
|
219
|
+
uri.ip_address?
|
220
|
+
uri.ipv4?
|
221
|
+
uri.ipv6?
|
222
|
+
uri.ipv4 #=> returns IPv4 address if applicable
|
223
|
+
uri.ipv6 #=> returns IPv6 address if applicable
|
224
|
+
|
225
|
+
# UrlParser::URI#relative?
|
226
|
+
uri = UrlParser.parse('/')
|
227
|
+
uri.relative?
|
228
|
+
#=> true
|
229
|
+
|
230
|
+
# UrlParser::URI#absolute?
|
231
|
+
uri = UrlParser.parse('http://example.com/')
|
232
|
+
uri.absolute?
|
233
|
+
#=> true
|
234
|
+
|
235
|
+
# UrlParser::URI#clean - return a cleaned string
|
236
|
+
uri = UrlParser.parse('http://example.com/?utm_source=google')
|
237
|
+
uri.clean
|
238
|
+
#=> "http://example.com/"
|
239
|
+
|
240
|
+
# UrlParser::URI#canonical - cleans and strips the scheme
|
241
|
+
uri = UrlParser.parse('http://example.com/?utm_source%3Danalytics')
|
242
|
+
uri.canonical
|
243
|
+
#=> "//example.com/"
|
244
|
+
|
245
|
+
# Joining URIs
|
246
|
+
uri = UrlParser.parse('http://foo.com/zee/zaw/zoom.html')
|
247
|
+
joined_uri = uri + '/bar#id'
|
248
|
+
joined_uri.to_s
|
249
|
+
#=> "http://foo.com/bar#id"
|
250
|
+
|
251
|
+
# UrlParser::URI #raw / #to_s - return the URI as a string
|
252
|
+
uri = UrlParser.parse('http://example.com/')
|
253
|
+
uri.raw
|
254
|
+
#=> "http://example.com/"
|
255
|
+
|
256
|
+
# Compare URIs
|
257
|
+
# Taking into account the scheme:
|
258
|
+
uri = UrlParser.parse('http://example.com/a/../?')
|
259
|
+
uri == 'http://example.com/'
|
260
|
+
#=> true
|
261
|
+
uri == 'https://example.com/'
|
262
|
+
#=> false
|
263
|
+
|
264
|
+
# Ignoring the scheme:
|
265
|
+
uri =~ 'https://example.com/'
|
266
|
+
#=> true
|
267
|
+
|
268
|
+
# UrlParser::URI#valid? - checks if URI is absolute and domain is valid
|
269
|
+
uri = UrlParser.parse('http://example.qqq/')
|
270
|
+
uri.valid?
|
271
|
+
#=> false
|
272
|
+
```
|
273
|
+
|
274
|
+
## Configuration
|
275
|
+
|
276
|
+
### embedded_params
|
277
|
+
|
278
|
+
Set the params the unembed parser uses to search for embedded URIs. Default is `[ 'u', 'url' ]`. Set to an empty array to disable unembedding.
|
279
|
+
|
280
|
+
```ruby
|
281
|
+
UrlParser.configure do |config|
|
282
|
+
config.embedded_params = [ 'ref' ]
|
283
|
+
end
|
284
|
+
|
285
|
+
uri = UrlParser.unembed('https://www.upwork.com/leaving?ref=https%3A%2F%2Fwww.example.com')
|
286
|
+
uri.to_s
|
287
|
+
#=> "https://www.example.com/"
|
288
|
+
```
|
289
|
+
|
290
|
+
### default_scheme
|
291
|
+
|
292
|
+
Set a default scheme if one is not present. Can also be set to nil if there should not be a default scheme. Default is `'http'`.
|
293
|
+
|
294
|
+
```ruby
|
295
|
+
UrlParser.configure do |config|
|
296
|
+
config.default_scheme = 'https'
|
297
|
+
end
|
298
|
+
|
299
|
+
uri = UrlParser.parse('example.com')
|
300
|
+
uri.to_s
|
301
|
+
#=> "https://example.com/"
|
302
|
+
```
|
303
|
+
|
304
|
+
### scheme_map
|
305
|
+
|
306
|
+
Replace scheme keys in the 'map' with the corresponding value. Useful for replacing invalid or outdated schemes. Default is an empty hash.
|
307
|
+
|
308
|
+
```ruby
|
309
|
+
UrlParser.configure do |config|
|
310
|
+
config.scheme_map = { 'feed' => 'http' }
|
311
|
+
end
|
312
|
+
|
313
|
+
uri = UrlParser.parse('feed://feeds.feedburner.com/YourBlog')
|
314
|
+
uri.to_s
|
315
|
+
#=> "http://feeds.feedburner.com/YourBlog"
|
316
|
+
```
|
317
|
+
|
318
|
+
## TODO
|
319
|
+
|
320
|
+
* Extract URIs from text
|
321
|
+
* Enable custom rules for normalization, canonicalization, escaping, and extraction
|
26
322
|
|
27
323
|
## Contributing
|
28
324
|
|