url_parser 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +4 -0
- data/Guardfile +40 -7
- data/LICENSE.txt +1 -1
- data/README.md +301 -5
- data/Rakefile +5 -0
- data/lib/url_parser.rb +93 -286
- data/lib/url_parser/db.yml +77 -0
- data/lib/url_parser/domain.rb +102 -0
- data/lib/url_parser/model.rb +233 -0
- data/lib/url_parser/option_setter.rb +47 -0
- data/lib/url_parser/parser.rb +206 -0
- data/lib/url_parser/uri.rb +206 -0
- data/lib/url_parser/version.rb +1 -1
- data/spec/spec_helper.rb +83 -6
- data/spec/support/.gitkeep +0 -0
- data/spec/support/helpers.rb +7 -0
- data/spec/url_parser/domain_spec.rb +163 -0
- data/spec/url_parser/model_spec.rb +426 -0
- data/spec/url_parser/option_setter_spec.rb +71 -0
- data/spec/url_parser/parser_spec.rb +515 -0
- data/spec/url_parser/uri_spec.rb +570 -0
- data/spec/url_parser_spec.rb +93 -387
- data/url_parser.gemspec +5 -6
- metadata +39 -29
data/lib/url_parser.rb
CHANGED
@@ -1,321 +1,128 @@
|
|
1
|
-
require "
|
2
|
-
require "
|
3
|
-
require "postrank-uri"
|
1
|
+
require "yaml"
|
2
|
+
require "gem_config"
|
4
3
|
require "addressable/uri"
|
5
|
-
require "
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
[]
|
12
|
-
elsif object.respond_to?(:to_ary)
|
13
|
-
object.to_ary || [object]
|
14
|
-
else
|
15
|
-
[object]
|
16
|
-
end
|
17
|
-
end unless respond_to?(:wrap)
|
18
|
-
|
19
|
-
end
|
4
|
+
require "url_parser/version"
|
5
|
+
require "url_parser/option_setter"
|
6
|
+
require "url_parser/domain"
|
7
|
+
require "url_parser/model"
|
8
|
+
require "url_parser/parser"
|
9
|
+
require "url_parser/uri"
|
20
10
|
|
21
11
|
module UrlParser
|
12
|
+
include GemConfig::Base
|
22
13
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
'rsync', 'rtsp', 'rtspu', 'sftp', 'shttp', 'sip', 'sips',
|
28
|
-
'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
|
29
|
-
# Unofficial schemes
|
30
|
-
'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk',
|
31
|
-
'irc', 'ircs', 'irc6', 'itms', 'mms', 'msnim', 'mvn', 'skype',
|
32
|
-
'ssh', 'smb', 'svn', 'ymsg', 'webcal'
|
33
|
-
]
|
34
|
-
|
35
|
-
DEFAULT_SCHEMES = [
|
36
|
-
'http', 'https', 'ftp', 'mailto', 'file', 'ssh', 'feed',
|
37
|
-
'cvs', 'git', 'mvn', 'nntp', 'shttp', 'svn', 'webcal'
|
38
|
-
]
|
39
|
-
|
40
|
-
module Error; end
|
41
|
-
|
42
|
-
class InvalidScheme
|
43
|
-
include UrlParser::Error
|
14
|
+
with_configuration do
|
15
|
+
has :default_scheme, classes: [ String, NilClass ], default: 'http'
|
16
|
+
has :scheme_map, classes: Hash, default: Hash.new
|
17
|
+
has :embedded_params, classes: Array, default: %w(u url)
|
44
18
|
end
|
45
19
|
|
46
|
-
|
47
|
-
urls = []
|
48
|
-
PostRank::URI.extract(text).each do |url|
|
49
|
-
urls << new(url, options)
|
50
|
-
end
|
51
|
-
urls
|
52
|
-
end
|
20
|
+
module Error; end
|
53
21
|
|
54
|
-
|
55
|
-
|
22
|
+
class LibraryError < StandardError
|
23
|
+
include Error
|
56
24
|
end
|
57
25
|
|
58
|
-
|
59
|
-
|
60
|
-
attr_reader :url, :original_url, :raise_errors
|
26
|
+
RequiresAddressableURI = Class.new(LibraryError)
|
27
|
+
RequiresUrlParserDomain = Class.new(LibraryError)
|
61
28
|
|
62
|
-
|
29
|
+
DB = YAML.load_file(File.join(File.dirname(__FILE__), '/url_parser/db.yml'))
|
63
30
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
@errors = []
|
69
|
-
@original_url = url
|
70
|
-
@url = @clean ? clean(url) : parse(url)
|
71
|
-
prepare
|
72
|
-
end
|
73
|
-
|
74
|
-
def schemes
|
75
|
-
Array.wrap(@schemes)
|
76
|
-
end
|
77
|
-
|
78
|
-
def clean!
|
79
|
-
@parser = nil
|
80
|
-
@url = clean(url)
|
81
|
-
@clean = true
|
82
|
-
self
|
83
|
-
end
|
31
|
+
def self.new(uri, options = {})
|
32
|
+
warn "[DEPRECATION] `.new` is deprecated. Please use `.parse` instead."
|
33
|
+
parse(uri, options)
|
34
|
+
end
|
84
35
|
|
85
|
-
|
86
|
-
|
87
|
-
|
36
|
+
module_function
|
37
|
+
|
38
|
+
# Encode a string
|
39
|
+
#
|
40
|
+
# Adapted from ERB::Util.url_encode
|
41
|
+
#
|
42
|
+
def escape(uri, options = {})
|
43
|
+
uri.to_s.dup
|
44
|
+
.force_encoding(Encoding::ASCII_8BIT)
|
45
|
+
.gsub(/[^a-zA-Z0-9_\-.]/n) do
|
46
|
+
sprintf("%%%02X", Regexp.last_match[0].unpack("C")[0])
|
88
47
|
end
|
89
|
-
|
90
|
-
|
91
|
-
def to_s
|
92
|
-
return '' if errors.any?
|
93
|
-
url.to_s
|
94
|
-
end
|
48
|
+
end
|
95
49
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
50
|
+
# Decode a string
|
51
|
+
#
|
52
|
+
# Adapted from CGI::unescape
|
53
|
+
#
|
54
|
+
# See also http://tools.ietf.org/html/rfc3986#section-2.3
|
55
|
+
#
|
56
|
+
def unescape(uri, options = {})
|
57
|
+
encoding = options.fetch(:encoding) { Encoding::UTF_8 }
|
58
|
+
|
59
|
+
query_spaces = proc do
|
60
|
+
if Regexp.last_match[6]
|
61
|
+
Regexp.last_match[0].sub(
|
62
|
+
Regexp.last_match[6],
|
63
|
+
Regexp.last_match[6].tr('+', ' ')
|
104
64
|
)
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def valid?
|
109
|
-
errors.empty?
|
110
|
-
end
|
111
|
-
|
112
|
-
def join(relative_path)
|
113
|
-
return nil if errors.any?
|
114
|
-
UrlParser.new(
|
115
|
-
Addressable::URI.join(url, relative_path).to_s
|
116
|
-
)
|
117
|
-
end
|
118
|
-
|
119
|
-
# URI Components
|
120
|
-
|
121
|
-
def scheme
|
122
|
-
return nil if errors.any?
|
123
|
-
url.scheme
|
124
|
-
end
|
125
|
-
|
126
|
-
def username
|
127
|
-
return nil if errors.any?
|
128
|
-
url.user
|
129
|
-
end
|
130
|
-
alias_method :user, :username
|
131
|
-
|
132
|
-
def password
|
133
|
-
return nil if errors.any?
|
134
|
-
url.password
|
135
|
-
end
|
136
|
-
|
137
|
-
def userinfo
|
138
|
-
return nil if errors.any?
|
139
|
-
url.userinfo
|
140
|
-
end
|
141
|
-
|
142
|
-
def www
|
143
|
-
return nil if errors.any?
|
144
|
-
return nil if parser.subdomain.empty?
|
145
|
-
parts = slice_domain.split('.')
|
146
|
-
parts.first =~ /www?\d*/ ? parts.shift : nil
|
147
|
-
end
|
148
|
-
|
149
|
-
def subdomain
|
150
|
-
return nil if errors.any?
|
151
|
-
return nil if parser.subdomain.empty?
|
152
|
-
parts = slice_domain.split('.')
|
153
|
-
parts.shift if parts.first =~ /www?\d*/
|
154
|
-
parts.compact.join('.')
|
155
|
-
end
|
156
|
-
|
157
|
-
def subdomains
|
158
|
-
return nil if errors.any?
|
159
|
-
return nil if parser.subdomain.empty?
|
160
|
-
[ www, subdomain ].compact.join('.')
|
161
|
-
end
|
162
|
-
|
163
|
-
def domain_name
|
164
|
-
return nil if errors.any?
|
165
|
-
parser.domain.empty? ? nil : parser.domain
|
166
|
-
end
|
167
|
-
|
168
|
-
def domain
|
169
|
-
return nil if errors.any?
|
170
|
-
if parser.domain_with_public_suffix.empty?
|
171
|
-
nil
|
172
65
|
else
|
173
|
-
|
66
|
+
Regexp.last_match[0]
|
174
67
|
end
|
175
68
|
end
|
176
69
|
|
177
|
-
|
178
|
-
|
179
|
-
tld = parser.public_suffix
|
180
|
-
tld.empty? ? nil : tld
|
181
|
-
end
|
182
|
-
|
183
|
-
def hostname
|
184
|
-
return nil if errors.any?
|
185
|
-
url.host
|
186
|
-
end
|
187
|
-
|
188
|
-
def port
|
189
|
-
return nil if errors.any?
|
190
|
-
url.port
|
191
|
-
end
|
192
|
-
|
193
|
-
def host
|
194
|
-
return nil if errors.any?
|
195
|
-
name = [ hostname, port ].compact.join(':')
|
196
|
-
name.empty? ? nil : name
|
197
|
-
end
|
198
|
-
|
199
|
-
def origin
|
200
|
-
return nil if errors.any?
|
201
|
-
url.origin == "null" ? nil : url.origin
|
202
|
-
end
|
203
|
-
|
204
|
-
def authority
|
205
|
-
return nil if errors.any?
|
206
|
-
url.authority
|
207
|
-
end
|
208
|
-
|
209
|
-
def site
|
210
|
-
return nil if errors.any?
|
211
|
-
url.site
|
212
|
-
end
|
213
|
-
|
214
|
-
def directory
|
215
|
-
return nil if errors.any?
|
216
|
-
parts = path.split('/')
|
217
|
-
return '/' if parts.empty?
|
218
|
-
parts.pop unless segment.to_s.empty?
|
219
|
-
parts.unshift('') unless parts.first.to_s.empty?
|
220
|
-
parts.compact.join('/')
|
221
|
-
end
|
222
|
-
|
223
|
-
def path
|
224
|
-
return nil if errors.any?
|
225
|
-
url.path
|
226
|
-
end
|
227
|
-
|
228
|
-
def segment
|
229
|
-
return nil if errors.any?
|
230
|
-
path =~ /\/\z/ ? nil : path.split('/').last
|
231
|
-
end
|
232
|
-
|
233
|
-
def filename
|
234
|
-
return nil if errors.any?
|
235
|
-
return 'index.html' if segment.to_s.empty?
|
236
|
-
return '' if suffix.to_s.empty?
|
237
|
-
segment
|
238
|
-
end
|
239
|
-
|
240
|
-
def suffix
|
241
|
-
return nil if errors.any?
|
242
|
-
ext = File.extname(path)
|
243
|
-
ext[0] = '' if ext[0] == '.'
|
244
|
-
ext.empty? ? nil : ext
|
245
|
-
end
|
246
|
-
|
247
|
-
def query
|
248
|
-
return nil if errors.any?
|
249
|
-
url.query
|
250
|
-
end
|
251
|
-
|
252
|
-
def query_values
|
253
|
-
return {} if errors.any?
|
254
|
-
url.query_values.to_h
|
70
|
+
decode_chars = proc do
|
71
|
+
[Regexp.last_match[1].delete('%')].pack('H*')
|
255
72
|
end
|
256
73
|
|
257
|
-
|
258
|
-
return nil if errors.any?
|
259
|
-
url.fragment
|
260
|
-
end
|
74
|
+
string = uri.to_s
|
261
75
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
name.empty? ? nil : name
|
268
|
-
end
|
76
|
+
str = string.dup
|
77
|
+
.gsub(Addressable::URI::URIREGEX, &query_spaces)
|
78
|
+
.force_encoding(Encoding::ASCII_8BIT)
|
79
|
+
.gsub(/((?:%[0-9a-fA-F]{2})+)/, &decode_chars)
|
80
|
+
.force_encoding(encoding)
|
269
81
|
|
270
|
-
|
271
|
-
|
272
|
-
url.relative?
|
273
|
-
end
|
274
|
-
|
275
|
-
def absolute?
|
276
|
-
return nil if errors.any?
|
277
|
-
url.absolute?
|
278
|
-
end
|
82
|
+
str.valid_encoding? ? str : str.force_encoding(string.encoding)
|
83
|
+
end
|
279
84
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
end
|
85
|
+
def parse(uri, options = {}, &blk)
|
86
|
+
URI.new(uri, options, &blk)
|
87
|
+
end
|
284
88
|
|
285
|
-
|
89
|
+
def unembed(uri, options = {}, &blk)
|
90
|
+
URI.new(uri, options.merge(unembed: true), &blk)
|
91
|
+
end
|
286
92
|
|
287
|
-
|
288
|
-
|
289
|
-
|
93
|
+
def canonicalize(uri, options = {}, &blk)
|
94
|
+
URI.new(uri, options.merge(canonicalize: true), &blk)
|
95
|
+
end
|
290
96
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
unless error.singleton_class.include?(UrlParser::Error)
|
295
|
-
error.extend(UrlParser::Error)
|
296
|
-
end
|
297
|
-
@errors << error
|
298
|
-
raise if raise_errors
|
299
|
-
end
|
97
|
+
def normalize(uri, options = {}, &blk)
|
98
|
+
URI.new(uri, options.merge(normalize: true), &blk)
|
99
|
+
end
|
300
100
|
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
end
|
305
|
-
end
|
101
|
+
def clean(uri, options = {}, &blk)
|
102
|
+
URI.new(uri, options.merge(clean: true), &blk)
|
103
|
+
end
|
306
104
|
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
105
|
+
# Wraps its argument in an array unless it is already an array
|
106
|
+
#
|
107
|
+
# See: activesupport/lib/active_support/core_ext/array/wrap.rb, line 36
|
108
|
+
#
|
109
|
+
def wrap(object)
|
110
|
+
if object.nil?
|
111
|
+
[]
|
112
|
+
elsif object.respond_to?(:to_ary)
|
113
|
+
object.to_ary || [object]
|
114
|
+
else
|
115
|
+
[object]
|
311
116
|
end
|
117
|
+
end
|
312
118
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
119
|
+
def tag_errors
|
120
|
+
yield
|
121
|
+
rescue StandardError => error
|
122
|
+
unless error.singleton_class.include?(UrlParser::Error)
|
123
|
+
error.extend(UrlParser::Error)
|
317
124
|
end
|
318
|
-
|
125
|
+
raise error
|
319
126
|
end
|
320
127
|
|
321
128
|
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
---
|
2
|
+
:global:
|
3
|
+
- _openstat # Yandex openstat param
|
4
|
+
- awesm # awe.sm tracker
|
5
|
+
- gclid # Google Analytics click ID
|
6
|
+
- mc_cid # Mailchimp campaign unique ID
|
7
|
+
- mc_eid # Mailchimp campaign member email unique ID
|
8
|
+
- PHPSESSID # Legacy PHP session identifier
|
9
|
+
- sms_ss # addthis.com tracker
|
10
|
+
- utm_campaign # Urchin / Google Analytics campaign name
|
11
|
+
- utm_content # Urchin / Google Analytics campaign content
|
12
|
+
- utm_medium # Urchin / Google Analytics campaign medium
|
13
|
+
- utm_nooverride # Urchin nooverride param
|
14
|
+
- utm_reader # Urchin reader param
|
15
|
+
- utm_source # Urchin / Google Analytics campaign source
|
16
|
+
- utm_term # Urchin / Google Analytics campaign term
|
17
|
+
- utm_type # Urchin type param
|
18
|
+
- xtor # AT Internet tracker
|
19
|
+
:hosts:
|
20
|
+
allthingsd.com:
|
21
|
+
- mod
|
22
|
+
cbc.ca:
|
23
|
+
- ref
|
24
|
+
cnet.com:
|
25
|
+
- part
|
26
|
+
- subj
|
27
|
+
- tag
|
28
|
+
cnn.com:
|
29
|
+
- eref
|
30
|
+
diepresse.com:
|
31
|
+
- _vl_backlink
|
32
|
+
dw-world.de:
|
33
|
+
- maca
|
34
|
+
economist.com:
|
35
|
+
- fsrc
|
36
|
+
espn.com:
|
37
|
+
- campaign
|
38
|
+
- source
|
39
|
+
espn.go.com:
|
40
|
+
- campaign
|
41
|
+
- source
|
42
|
+
latimes.com:
|
43
|
+
- track
|
44
|
+
macworld.com:
|
45
|
+
- lsrc
|
46
|
+
nytimes.com:
|
47
|
+
- partner
|
48
|
+
- pagewanted
|
49
|
+
- emc
|
50
|
+
- _r
|
51
|
+
- ref
|
52
|
+
- src
|
53
|
+
repubblica.it:
|
54
|
+
- rss
|
55
|
+
theglobeandmail.com:
|
56
|
+
- cmpid
|
57
|
+
thestar.com:
|
58
|
+
- bn
|
59
|
+
usatoday.com:
|
60
|
+
- csp
|
61
|
+
waomarketing.com:
|
62
|
+
- nucrss
|
63
|
+
washingtonpost.com:
|
64
|
+
- nav
|
65
|
+
- wprss
|
66
|
+
welt.de:
|
67
|
+
- wtmc
|
68
|
+
wikipedia.org:
|
69
|
+
- source
|
70
|
+
wsj.com:
|
71
|
+
- mod
|
72
|
+
youtube.com:
|
73
|
+
- feature
|
74
|
+
- app
|
75
|
+
- ac
|
76
|
+
- src_vid
|
77
|
+
- annotation_id
|