url_parser 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,321 +1,128 @@
1
- require "url_parser/version"
2
- require "domainatrix"
3
- require "postrank-uri"
1
+ require "yaml"
2
+ require "gem_config"
4
3
  require "addressable/uri"
5
- require "digest/sha1"
6
-
7
- class Array
8
-
9
- def self.wrap(object)
10
- if object.nil?
11
- []
12
- elsif object.respond_to?(:to_ary)
13
- object.to_ary || [object]
14
- else
15
- [object]
16
- end
17
- end unless respond_to?(:wrap)
18
-
19
- end
4
+ require "url_parser/version"
5
+ require "url_parser/option_setter"
6
+ require "url_parser/domain"
7
+ require "url_parser/model"
8
+ require "url_parser/parser"
9
+ require "url_parser/uri"
20
10
 
21
11
  module UrlParser
12
+ include GemConfig::Base
22
13
 
23
- # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
24
- SCHEMES = [
25
- 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https',
26
- 'imap', 'magnet', 'mailto', 'mms', 'news', 'nntp', 'prospero',
27
- 'rsync', 'rtsp', 'rtspu', 'sftp', 'shttp', 'sip', 'sips',
28
- 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
29
- # Unofficial schemes
30
- 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk',
31
- 'irc', 'ircs', 'irc6', 'itms', 'mms', 'msnim', 'mvn', 'skype',
32
- 'ssh', 'smb', 'svn', 'ymsg', 'webcal'
33
- ]
34
-
35
- DEFAULT_SCHEMES = [
36
- 'http', 'https', 'ftp', 'mailto', 'file', 'ssh', 'feed',
37
- 'cvs', 'git', 'mvn', 'nntp', 'shttp', 'svn', 'webcal'
38
- ]
39
-
40
- module Error; end
41
-
42
- class InvalidScheme
43
- include UrlParser::Error
14
+ with_configuration do
15
+ has :default_scheme, classes: [ String, NilClass ], default: 'http'
16
+ has :scheme_map, classes: Hash, default: Hash.new
17
+ has :embedded_params, classes: Array, default: %w(u url)
44
18
  end
45
19
 
46
- def self.call(text, options = {})
47
- urls = []
48
- PostRank::URI.extract(text).each do |url|
49
- urls << new(url, options)
50
- end
51
- urls
52
- end
20
+ module Error; end
53
21
 
54
- def self.new(url, options = {})
55
- Base.new(url, options)
22
+ class LibraryError < StandardError
23
+ include Error
56
24
  end
57
25
 
58
- class Base
59
-
60
- attr_reader :url, :original_url, :raise_errors
26
+ RequiresAddressableURI = Class.new(LibraryError)
27
+ RequiresUrlParserDomain = Class.new(LibraryError)
61
28
 
62
- attr_accessor :errors
29
+ DB = YAML.load_file(File.join(File.dirname(__FILE__), '/url_parser/db.yml'))
63
30
 
64
- def initialize(url, options = {})
65
- @schemes = options.fetch(:schemes) { UrlParser::DEFAULT_SCHEMES }
66
- @clean = options.fetch(:clean) { false }
67
- @raise_errors = options.fetch(:raise_errors) { false }
68
- @errors = []
69
- @original_url = url
70
- @url = @clean ? clean(url) : parse(url)
71
- prepare
72
- end
73
-
74
- def schemes
75
- Array.wrap(@schemes)
76
- end
77
-
78
- def clean!
79
- @parser = nil
80
- @url = clean(url)
81
- @clean = true
82
- self
83
- end
31
+ def self.new(uri, options = {})
32
+ warn "[DEPRECATION] `.new` is deprecated. Please use `.parse` instead."
33
+ parse(uri, options)
34
+ end
84
35
 
85
- def parser
86
- tag_errors do
87
- @parser ||= Domainatrix.parse(to_s)
36
+ module_function
37
+
38
+ # Encode a string
39
+ #
40
+ # Adapted from ERB::Util.url_encode
41
+ #
42
+ def escape(uri, options = {})
43
+ uri.to_s.dup
44
+ .force_encoding(Encoding::ASCII_8BIT)
45
+ .gsub(/[^a-zA-Z0-9_\-.]/n) do
46
+ sprintf("%%%02X", Regexp.last_match[0].unpack("C")[0])
88
47
  end
89
- end
90
-
91
- def to_s
92
- return '' if errors.any?
93
- url.to_s
94
- end
48
+ end
95
49
 
96
- def hash(options = {})
97
- return nil if errors.any?
98
- clean = options.fetch(:clean) { nil }
99
- if clean.nil?
100
- Digest::SHA1.hexdigest(url.to_s)
101
- else
102
- Digest::SHA1.hexdigest(
103
- clean ? clean(original_url) : parse(original_url)
50
+ # Decode a string
51
+ #
52
+ # Adapted from CGI::unescape
53
+ #
54
+ # See also http://tools.ietf.org/html/rfc3986#section-2.3
55
+ #
56
+ def unescape(uri, options = {})
57
+ encoding = options.fetch(:encoding) { Encoding::UTF_8 }
58
+
59
+ query_spaces = proc do
60
+ if Regexp.last_match[6]
61
+ Regexp.last_match[0].sub(
62
+ Regexp.last_match[6],
63
+ Regexp.last_match[6].tr('+', ' ')
104
64
  )
105
- end
106
- end
107
-
108
- def valid?
109
- errors.empty?
110
- end
111
-
112
- def join(relative_path)
113
- return nil if errors.any?
114
- UrlParser.new(
115
- Addressable::URI.join(url, relative_path).to_s
116
- )
117
- end
118
-
119
- # URI Components
120
-
121
- def scheme
122
- return nil if errors.any?
123
- url.scheme
124
- end
125
-
126
- def username
127
- return nil if errors.any?
128
- url.user
129
- end
130
- alias_method :user, :username
131
-
132
- def password
133
- return nil if errors.any?
134
- url.password
135
- end
136
-
137
- def userinfo
138
- return nil if errors.any?
139
- url.userinfo
140
- end
141
-
142
- def www
143
- return nil if errors.any?
144
- return nil if parser.subdomain.empty?
145
- parts = slice_domain.split('.')
146
- parts.first =~ /www?\d*/ ? parts.shift : nil
147
- end
148
-
149
- def subdomain
150
- return nil if errors.any?
151
- return nil if parser.subdomain.empty?
152
- parts = slice_domain.split('.')
153
- parts.shift if parts.first =~ /www?\d*/
154
- parts.compact.join('.')
155
- end
156
-
157
- def subdomains
158
- return nil if errors.any?
159
- return nil if parser.subdomain.empty?
160
- [ www, subdomain ].compact.join('.')
161
- end
162
-
163
- def domain_name
164
- return nil if errors.any?
165
- parser.domain.empty? ? nil : parser.domain
166
- end
167
-
168
- def domain
169
- return nil if errors.any?
170
- if parser.domain_with_public_suffix.empty?
171
- nil
172
65
  else
173
- parser.domain_with_public_suffix
66
+ Regexp.last_match[0]
174
67
  end
175
68
  end
176
69
 
177
- def tld
178
- return nil if errors.any?
179
- tld = parser.public_suffix
180
- tld.empty? ? nil : tld
181
- end
182
-
183
- def hostname
184
- return nil if errors.any?
185
- url.host
186
- end
187
-
188
- def port
189
- return nil if errors.any?
190
- url.port
191
- end
192
-
193
- def host
194
- return nil if errors.any?
195
- name = [ hostname, port ].compact.join(':')
196
- name.empty? ? nil : name
197
- end
198
-
199
- def origin
200
- return nil if errors.any?
201
- url.origin == "null" ? nil : url.origin
202
- end
203
-
204
- def authority
205
- return nil if errors.any?
206
- url.authority
207
- end
208
-
209
- def site
210
- return nil if errors.any?
211
- url.site
212
- end
213
-
214
- def directory
215
- return nil if errors.any?
216
- parts = path.split('/')
217
- return '/' if parts.empty?
218
- parts.pop unless segment.to_s.empty?
219
- parts.unshift('') unless parts.first.to_s.empty?
220
- parts.compact.join('/')
221
- end
222
-
223
- def path
224
- return nil if errors.any?
225
- url.path
226
- end
227
-
228
- def segment
229
- return nil if errors.any?
230
- path =~ /\/\z/ ? nil : path.split('/').last
231
- end
232
-
233
- def filename
234
- return nil if errors.any?
235
- return 'index.html' if segment.to_s.empty?
236
- return '' if suffix.to_s.empty?
237
- segment
238
- end
239
-
240
- def suffix
241
- return nil if errors.any?
242
- ext = File.extname(path)
243
- ext[0] = '' if ext[0] == '.'
244
- ext.empty? ? nil : ext
245
- end
246
-
247
- def query
248
- return nil if errors.any?
249
- url.query
250
- end
251
-
252
- def query_values
253
- return {} if errors.any?
254
- url.query_values.to_h
70
+ decode_chars = proc do
71
+ [Regexp.last_match[1].delete('%')].pack('H*')
255
72
  end
256
73
 
257
- def fragment
258
- return nil if errors.any?
259
- url.fragment
260
- end
74
+ string = uri.to_s
261
75
 
262
- def resource
263
- return nil if errors.any?
264
- name = [
265
- [ segment, query ].compact.join('?'), fragment
266
- ].compact.join('#')
267
- name.empty? ? nil : name
268
- end
76
+ str = string.dup
77
+ .gsub(Addressable::URI::URIREGEX, &query_spaces)
78
+ .force_encoding(Encoding::ASCII_8BIT)
79
+ .gsub(/((?:%[0-9a-fA-F]{2})+)/, &decode_chars)
80
+ .force_encoding(encoding)
269
81
 
270
- def relative?
271
- return nil if errors.any?
272
- url.relative?
273
- end
274
-
275
- def absolute?
276
- return nil if errors.any?
277
- url.absolute?
278
- end
82
+ str.valid_encoding? ? str : str.force_encoding(string.encoding)
83
+ end
279
84
 
280
- def localhost?
281
- return nil if errors.any?
282
- !!(hostname =~ /(\A|\.)localhost\z/)
283
- end
85
+ def parse(uri, options = {}, &blk)
86
+ URI.new(uri, options, &blk)
87
+ end
284
88
 
285
- private
89
+ def unembed(uri, options = {}, &blk)
90
+ URI.new(uri, options.merge(unembed: true), &blk)
91
+ end
286
92
 
287
- def slice_domain
288
- parser.subdomain.tap{ |s| s.slice!(domain) }
289
- end
93
+ def canonicalize(uri, options = {}, &blk)
94
+ URI.new(uri, options.merge(canonicalize: true), &blk)
95
+ end
290
96
 
291
- def tag_errors
292
- yield
293
- rescue Exception => error
294
- unless error.singleton_class.include?(UrlParser::Error)
295
- error.extend(UrlParser::Error)
296
- end
297
- @errors << error
298
- raise if raise_errors
299
- end
97
+ def normalize(uri, options = {}, &blk)
98
+ URI.new(uri, options.merge(normalize: true), &blk)
99
+ end
300
100
 
301
- def parse(url)
302
- tag_errors do
303
- PostRank::URI.parse(url, raw: true)
304
- end
305
- end
101
+ def clean(uri, options = {}, &blk)
102
+ URI.new(uri, options.merge(clean: true), &blk)
103
+ end
306
104
 
307
- def clean(url)
308
- tag_errors do
309
- PostRank::URI.clean(url, raw: true)
310
- end
105
+ # Wraps its argument in an array unless it is already an array
106
+ #
107
+ # See: activesupport/lib/active_support/core_ext/array/wrap.rb, line 36
108
+ #
109
+ def wrap(object)
110
+ if object.nil?
111
+ []
112
+ elsif object.respond_to?(:to_ary)
113
+ object.to_ary || [object]
114
+ else
115
+ [object]
311
116
  end
117
+ end
312
118
 
313
- # Initialize parser to ensure no errors are raised
314
- #
315
- def prepare
316
- parser
119
+ def tag_errors
120
+ yield
121
+ rescue StandardError => error
122
+ unless error.singleton_class.include?(UrlParser::Error)
123
+ error.extend(UrlParser::Error)
317
124
  end
318
-
125
+ raise error
319
126
  end
320
127
 
321
128
  end
@@ -0,0 +1,77 @@
1
+ ---
2
+ :global:
3
+ - _openstat # Yandex openstat param
4
+ - awesm # awe.sm tracker
5
+ - gclid # Google Analytics click ID
6
+ - mc_cid # Mailchimp campaign unique ID
7
+ - mc_eid # Mailchimp campaign member email unique ID
8
+ - PHPSESSID # Legacy PHP session identifier
9
+ - sms_ss # addthis.com tracker
10
+ - utm_campaign # Urchin / Google Analytics campaign name
11
+ - utm_content # Urchin / Google Analytics campaign content
12
+ - utm_medium # Urchin / Google Analytics campaign medium
13
+ - utm_nooverride # Urchin nooverride param
14
+ - utm_reader # Urchin reader param
15
+ - utm_source # Urchin / Google Analytics campaign source
16
+ - utm_term # Urchin / Google Analytics campaign term
17
+ - utm_type # Urchin type param
18
+ - xtor # AT Internet tracker
19
+ :hosts:
20
+ allthingsd.com:
21
+ - mod
22
+ cbc.ca:
23
+ - ref
24
+ cnet.com:
25
+ - part
26
+ - subj
27
+ - tag
28
+ cnn.com:
29
+ - eref
30
+ diepresse.com:
31
+ - _vl_backlink
32
+ dw-world.de:
33
+ - maca
34
+ economist.com:
35
+ - fsrc
36
+ espn.com:
37
+ - campaign
38
+ - source
39
+ espn.go.com:
40
+ - campaign
41
+ - source
42
+ latimes.com:
43
+ - track
44
+ macworld.com:
45
+ - lsrc
46
+ nytimes.com:
47
+ - partner
48
+ - pagewanted
49
+ - emc
50
+ - _r
51
+ - ref
52
+ - src
53
+ repubblica.it:
54
+ - rss
55
+ theglobeandmail.com:
56
+ - cmpid
57
+ thestar.com:
58
+ - bn
59
+ usatoday.com:
60
+ - csp
61
+ waomarketing.com:
62
+ - nucrss
63
+ washingtonpost.com:
64
+ - nav
65
+ - wprss
66
+ welt.de:
67
+ - wtmc
68
+ wikipedia.org:
69
+ - source
70
+ wsj.com:
71
+ - mod
72
+ youtube.com:
73
+ - feature
74
+ - app
75
+ - ac
76
+ - src_vid
77
+ - annotation_id