uri_parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/.rvmrc +1 -0
- data/Gemfile +6 -0
- data/Rakefile +13 -0
- data/ext/uri_parser/basictypes.h +89 -0
- data/ext/uri_parser/extconf.h +6 -0
- data/ext/uri_parser/extconf.rb +50 -0
- data/ext/uri_parser/logging.h +5 -0
- data/ext/uri_parser/scoped_ptr.h +322 -0
- data/ext/uri_parser/string16.cc +95 -0
- data/ext/uri_parser/string16.h +194 -0
- data/ext/uri_parser/uri_parser.cc +87 -0
- data/ext/uri_parser/url_canon.h +872 -0
- data/ext/uri_parser/url_canon_etc.cc +392 -0
- data/ext/uri_parser/url_canon_fileurl.cc +215 -0
- data/ext/uri_parser/url_canon_host.cc +401 -0
- data/ext/uri_parser/url_canon_icu.cc +207 -0
- data/ext/uri_parser/url_canon_icu.h +63 -0
- data/ext/uri_parser/url_canon_internal.cc +427 -0
- data/ext/uri_parser/url_canon_internal.h +453 -0
- data/ext/uri_parser/url_canon_internal_file.h +157 -0
- data/ext/uri_parser/url_canon_ip.cc +737 -0
- data/ext/uri_parser/url_canon_ip.h +101 -0
- data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
- data/ext/uri_parser/url_canon_path.cc +380 -0
- data/ext/uri_parser/url_canon_pathurl.cc +128 -0
- data/ext/uri_parser/url_canon_query.cc +189 -0
- data/ext/uri_parser/url_canon_relative.cc +572 -0
- data/ext/uri_parser/url_canon_stdstring.h +134 -0
- data/ext/uri_parser/url_canon_stdurl.cc +211 -0
- data/ext/uri_parser/url_common.h +48 -0
- data/ext/uri_parser/url_file.h +108 -0
- data/ext/uri_parser/url_parse.cc +760 -0
- data/ext/uri_parser/url_parse.h +336 -0
- data/ext/uri_parser/url_parse_file.cc +243 -0
- data/ext/uri_parser/url_parse_internal.h +112 -0
- data/ext/uri_parser/url_util.cc +553 -0
- data/ext/uri_parser/url_util.h +222 -0
- data/lib/uri_parser.rb +28 -0
- data/lib/uri_parser/version.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/uri_parser_spec.rb +54 -0
- data/uri_parser.gemspec +26 -0
- metadata +117 -0
@@ -0,0 +1,760 @@
|
|
1
|
+
/* Based on nsURLParsers.cc from Mozilla
|
2
|
+
* -------------------------------------
|
3
|
+
* The contents of this file are subject to the Mozilla Public License Version
|
4
|
+
* 1.1 (the "License"); you may not use this file except in compliance with
|
5
|
+
* the License. You may obtain a copy of the License at
|
6
|
+
* http://www.mozilla.org/MPL/
|
7
|
+
*
|
8
|
+
* Software distributed under the License is distributed on an "AS IS" basis,
|
9
|
+
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
10
|
+
* for the specific language governing rights and limitations under the
|
11
|
+
* License.
|
12
|
+
*
|
13
|
+
* The Original Code is mozilla.org code.
|
14
|
+
*
|
15
|
+
* The Initial Developer of the Original Code is
|
16
|
+
* Netscape Communications Corporation.
|
17
|
+
* Portions created by the Initial Developer are Copyright (C) 1998
|
18
|
+
* the Initial Developer. All Rights Reserved.
|
19
|
+
*
|
20
|
+
* Contributor(s):
|
21
|
+
* Darin Fisher (original author)
|
22
|
+
*
|
23
|
+
* Alternatively, the contents of this file may be used under the terms of
|
24
|
+
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
25
|
+
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
26
|
+
* in which case the provisions of the GPL or the LGPL are applicable instead
|
27
|
+
* of those above. If you wish to allow use of your version of this file only
|
28
|
+
* under the terms of either the GPL or the LGPL, and not to allow others to
|
29
|
+
* use your version of this file under the terms of the MPL, indicate your
|
30
|
+
* decision by deleting the provisions above and replace them with the notice
|
31
|
+
* and other provisions required by the GPL or the LGPL. If you do not delete
|
32
|
+
* the provisions above, a recipient may use your version of this file under
|
33
|
+
* the terms of any one of the MPL, the GPL or the LGPL.
|
34
|
+
*
|
35
|
+
* ***** END LICENSE BLOCK ***** */
|
36
|
+
|
37
|
+
#include "url_parse.h"
|
38
|
+
|
39
|
+
#include <stdlib.h>
|
40
|
+
|
41
|
+
#include "logging.h"
|
42
|
+
#include "url_parse_internal.h"
|
43
|
+
|
44
|
+
namespace url_parse {
|
45
|
+
|
46
|
+
namespace {
|
47
|
+
|
48
|
+
// Returns true if the given character is a valid digit to use in a port.
|
49
|
+
inline bool IsPortDigit(char16 ch) {
|
50
|
+
return ch >= '0' && ch <= '9';
|
51
|
+
}
|
52
|
+
|
53
|
+
// Scans |spec| forward from |start_offset| and yields the index of the first
// character for which IsAuthorityTerminator() is true. When no such character
// exists before |spec_len|, the result is |spec_len| itself.
template<typename CHAR>
int FindNextAuthorityTerminator(const CHAR* spec,
                                int start_offset,
                                int spec_len) {
  int pos = start_offset;
  while (pos < spec_len && !IsAuthorityTerminator(spec[pos]))
    ++pos;

  // Anything at or past the end means "not found", which is always reported
  // as exactly |spec_len|.
  return pos < spec_len ? pos : spec_len;
}
|
66
|
+
|
67
|
+
template<typename CHAR>
|
68
|
+
void ParseUserInfo(const CHAR* spec,
|
69
|
+
const Component& user,
|
70
|
+
Component* username,
|
71
|
+
Component* password) {
|
72
|
+
// Find the first colon in the user section, which separates the username and
|
73
|
+
// password.
|
74
|
+
int colon_offset = 0;
|
75
|
+
while (colon_offset < user.len && spec[user.begin + colon_offset] != ':')
|
76
|
+
colon_offset++;
|
77
|
+
|
78
|
+
if (colon_offset < user.len) {
|
79
|
+
// Found separator: <username>:<password>
|
80
|
+
*username = Component(user.begin, colon_offset);
|
81
|
+
*password = MakeRange(user.begin + colon_offset + 1,
|
82
|
+
user.begin + user.len);
|
83
|
+
} else {
|
84
|
+
// No separator, treat everything as the username
|
85
|
+
*username = user;
|
86
|
+
*password = Component();
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
template<typename CHAR>
|
91
|
+
void ParseServerInfo(const CHAR* spec,
|
92
|
+
const Component& serverinfo,
|
93
|
+
Component* hostname,
|
94
|
+
Component* port_num) {
|
95
|
+
if (serverinfo.len == 0) {
|
96
|
+
// No server info, host name is empty.
|
97
|
+
hostname->reset();
|
98
|
+
port_num->reset();
|
99
|
+
return;
|
100
|
+
}
|
101
|
+
|
102
|
+
// If the host starts with a left-bracket, assume the entire host is an
|
103
|
+
// IPv6 literal. Otherwise, assume none of the host is an IPv6 literal.
|
104
|
+
// This assumption will be overridden if we find a right-bracket.
|
105
|
+
//
|
106
|
+
// Our IPv6 address canonicalization code requires both brackets to exist,
|
107
|
+
// but the ability to locate an incomplete address can still be useful.
|
108
|
+
int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1;
|
109
|
+
int colon = -1;
|
110
|
+
|
111
|
+
// Find the last right-bracket, and the last colon.
|
112
|
+
for (int i = serverinfo.begin; i < serverinfo.end(); i++) {
|
113
|
+
switch (spec[i]) {
|
114
|
+
case ']':
|
115
|
+
ipv6_terminator = i;
|
116
|
+
break;
|
117
|
+
case ':':
|
118
|
+
colon = i;
|
119
|
+
break;
|
120
|
+
}
|
121
|
+
}
|
122
|
+
|
123
|
+
if (colon > ipv6_terminator) {
|
124
|
+
// Found a port number: <hostname>:<port>
|
125
|
+
*hostname = MakeRange(serverinfo.begin, colon);
|
126
|
+
if (hostname->len == 0)
|
127
|
+
hostname->reset();
|
128
|
+
*port_num = MakeRange(colon + 1, serverinfo.end());
|
129
|
+
} else {
|
130
|
+
// No port: <hostname>
|
131
|
+
*hostname = serverinfo;
|
132
|
+
port_num->reset();
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
// Given an already-identified auth section, breaks it into its consituent
|
137
|
+
// parts. The port number will be parsed and the resulting integer will be
|
138
|
+
// filled into the given *port variable, or -1 if there is no port number or it
|
139
|
+
// is invalid.
|
140
|
+
template<typename CHAR>
|
141
|
+
void DoParseAuthority(const CHAR* spec,
|
142
|
+
const Component& auth,
|
143
|
+
Component* username,
|
144
|
+
Component* password,
|
145
|
+
Component* hostname,
|
146
|
+
Component* port_num) {
|
147
|
+
DCHECK(auth.is_valid()) << "We should always get an authority";
|
148
|
+
if (auth.len == 0) {
|
149
|
+
username->reset();
|
150
|
+
password->reset();
|
151
|
+
hostname->reset();
|
152
|
+
port_num->reset();
|
153
|
+
return;
|
154
|
+
}
|
155
|
+
|
156
|
+
// Search backwards for @, which is the separator between the user info and
|
157
|
+
// the server info.
|
158
|
+
int i = auth.begin + auth.len - 1;
|
159
|
+
while (i > auth.begin && spec[i] != '@')
|
160
|
+
i--;
|
161
|
+
|
162
|
+
if (spec[i] == '@') {
|
163
|
+
// Found user info: <user-info>@<server-info>
|
164
|
+
ParseUserInfo(spec, Component(auth.begin, i - auth.begin),
|
165
|
+
username, password);
|
166
|
+
ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len),
|
167
|
+
hostname, port_num);
|
168
|
+
} else {
|
169
|
+
// No user info, everything is server info.
|
170
|
+
username->reset();
|
171
|
+
password->reset();
|
172
|
+
ParseServerInfo(spec, auth, hostname, port_num);
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
template<typename CHAR>
|
177
|
+
void ParsePath(const CHAR* spec,
|
178
|
+
const Component& path,
|
179
|
+
Component* filepath,
|
180
|
+
Component* query,
|
181
|
+
Component* ref) {
|
182
|
+
// path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
|
183
|
+
|
184
|
+
// Special case when there is no path.
|
185
|
+
if (path.len == -1) {
|
186
|
+
filepath->reset();
|
187
|
+
query->reset();
|
188
|
+
ref->reset();
|
189
|
+
return;
|
190
|
+
}
|
191
|
+
DCHECK(path.len > 0) << "We should never have 0 length paths";
|
192
|
+
|
193
|
+
// Search for first occurrence of either ? or #.
|
194
|
+
int path_end = path.begin + path.len;
|
195
|
+
|
196
|
+
int query_separator = -1; // Index of the '?'
|
197
|
+
int ref_separator = -1; // Index of the '#'
|
198
|
+
for (int i = path.begin; i < path_end; i++) {
|
199
|
+
switch (spec[i]) {
|
200
|
+
case '?':
|
201
|
+
// Only match the query string if it precedes the reference fragment
|
202
|
+
// and when we haven't found one already.
|
203
|
+
if (ref_separator < 0 && query_separator < 0)
|
204
|
+
query_separator = i;
|
205
|
+
break;
|
206
|
+
case '#':
|
207
|
+
// Record the first # sign only.
|
208
|
+
if (ref_separator < 0)
|
209
|
+
ref_separator = i;
|
210
|
+
break;
|
211
|
+
}
|
212
|
+
}
|
213
|
+
|
214
|
+
// Markers pointing to the character after each of these corresponding
|
215
|
+
// components. The code below words from the end back to the beginning,
|
216
|
+
// and will update these indices as it finds components that exist.
|
217
|
+
int file_end, query_end;
|
218
|
+
|
219
|
+
// Ref fragment: from the # to the end of the path.
|
220
|
+
if (ref_separator >= 0) {
|
221
|
+
file_end = query_end = ref_separator;
|
222
|
+
*ref = MakeRange(ref_separator + 1, path_end);
|
223
|
+
} else {
|
224
|
+
file_end = query_end = path_end;
|
225
|
+
ref->reset();
|
226
|
+
}
|
227
|
+
|
228
|
+
// Query fragment: everything from the ? to the next boundary (either the end
|
229
|
+
// of the path or the ref fragment).
|
230
|
+
if (query_separator >= 0) {
|
231
|
+
file_end = query_separator;
|
232
|
+
*query = MakeRange(query_separator + 1, query_end);
|
233
|
+
} else {
|
234
|
+
query->reset();
|
235
|
+
}
|
236
|
+
|
237
|
+
// File path: treat an empty file path as no file path.
|
238
|
+
if (file_end != path.begin)
|
239
|
+
*filepath = MakeRange(path.begin, file_end);
|
240
|
+
else
|
241
|
+
filepath->reset();
|
242
|
+
}
|
243
|
+
|
244
|
+
template<typename CHAR>
|
245
|
+
bool DoExtractScheme(const CHAR* url,
|
246
|
+
int url_len,
|
247
|
+
Component* scheme) {
|
248
|
+
// Skip leading whitespace and control characters.
|
249
|
+
int begin = 0;
|
250
|
+
while (begin < url_len && ShouldTrimFromURL(url[begin]))
|
251
|
+
begin++;
|
252
|
+
if (begin == url_len)
|
253
|
+
return false; // Input is empty or all whitespace.
|
254
|
+
|
255
|
+
// Find the first colon character.
|
256
|
+
for (int i = begin; i < url_len; i++) {
|
257
|
+
if (url[i] == ':') {
|
258
|
+
*scheme = MakeRange(begin, i);
|
259
|
+
return true;
|
260
|
+
}
|
261
|
+
}
|
262
|
+
return false; // No colon found: no scheme
|
263
|
+
}
|
264
|
+
|
265
|
+
// Fills in all members of the Parsed structure except for the scheme.
|
266
|
+
//
|
267
|
+
// |spec| is the full spec being parsed, of length |spec_len|.
|
268
|
+
// |after_scheme| is the character immediately following the scheme (after the
|
269
|
+
// colon) where we'll begin parsing.
|
270
|
+
//
|
271
|
+
// Compatibility data points. I list "host", "path" extracted:
|
272
|
+
// Input IE6 Firefox Us
|
273
|
+
// ----- -------------- -------------- --------------
|
274
|
+
// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
|
275
|
+
// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
|
276
|
+
// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/"
|
277
|
+
// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/"
|
278
|
+
// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/"
|
279
|
+
//
|
280
|
+
// (*) Interestingly, although IE fails to load these URLs, its history
|
281
|
+
// canonicalizer handles them, meaning if you've been to the corresponding
|
282
|
+
// "http://foo.com/" link, it will be colored.
|
283
|
+
template <typename CHAR>
|
284
|
+
void DoParseAfterScheme(const CHAR* spec,
|
285
|
+
int spec_len,
|
286
|
+
int after_scheme,
|
287
|
+
Parsed* parsed) {
|
288
|
+
int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
|
289
|
+
int after_slashes = after_scheme + num_slashes;
|
290
|
+
|
291
|
+
// First split into two main parts, the authority (username, password, host,
|
292
|
+
// and port) and the full path (path, query, and reference).
|
293
|
+
Component authority;
|
294
|
+
Component full_path;
|
295
|
+
|
296
|
+
// Found "//<some data>", looks like an authority section. Treat everything
|
297
|
+
// from there to the next slash (or end of spec) to be the authority. Note
|
298
|
+
// that we ignore the number of slashes and treat it as the authority.
|
299
|
+
int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len);
|
300
|
+
authority = Component(after_slashes, end_auth - after_slashes);
|
301
|
+
|
302
|
+
if (end_auth == spec_len) // No beginning of path found.
|
303
|
+
full_path = Component();
|
304
|
+
else // Everything starting from the slash to the end is the path.
|
305
|
+
full_path = Component(end_auth, spec_len - end_auth);
|
306
|
+
|
307
|
+
// Now parse those two sub-parts.
|
308
|
+
DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
|
309
|
+
&parsed->host, &parsed->port);
|
310
|
+
ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
|
311
|
+
}
|
312
|
+
|
313
|
+
// The main parsing function for standard URLs. Standard URLs have a scheme,
|
314
|
+
// host, path, etc.
|
315
|
+
template<typename CHAR>
|
316
|
+
void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
|
317
|
+
DCHECK(spec_len >= 0);
|
318
|
+
|
319
|
+
// Strip leading & trailing spaces and control characters.
|
320
|
+
int begin = 0;
|
321
|
+
TrimURL(spec, &begin, &spec_len);
|
322
|
+
|
323
|
+
int after_scheme;
|
324
|
+
if (DoExtractScheme(spec, spec_len, &parsed->scheme)) {
|
325
|
+
after_scheme = parsed->scheme.end() + 1; // Skip past the colon.
|
326
|
+
} else {
|
327
|
+
// Say there's no scheme when there is no colon. We could also say that
|
328
|
+
// everything is the scheme. Both would produce an invalid URL, but this way
|
329
|
+
// seems less wrong in more cases.
|
330
|
+
parsed->scheme.reset();
|
331
|
+
after_scheme = begin;
|
332
|
+
}
|
333
|
+
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
|
334
|
+
}
|
335
|
+
|
336
|
+
// Initializes a path URL which is merely a scheme followed by a path. Examples
|
337
|
+
// include "about:foo" and "javascript:alert('bar');"
|
338
|
+
template<typename CHAR>
|
339
|
+
void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
|
340
|
+
// Get the non-path and non-scheme parts of the URL out of the way, we never
|
341
|
+
// use them.
|
342
|
+
parsed->username.reset();
|
343
|
+
parsed->password.reset();
|
344
|
+
parsed->host.reset();
|
345
|
+
parsed->port.reset();
|
346
|
+
parsed->query.reset();
|
347
|
+
parsed->ref.reset();
|
348
|
+
|
349
|
+
// Strip leading & trailing spaces and control characters.
|
350
|
+
int begin = 0;
|
351
|
+
TrimURL(spec, &begin, &spec_len);
|
352
|
+
|
353
|
+
// Handle empty specs or ones that contain only whitespace or control chars.
|
354
|
+
if (begin == spec_len) {
|
355
|
+
parsed->scheme.reset();
|
356
|
+
parsed->path.reset();
|
357
|
+
return;
|
358
|
+
}
|
359
|
+
|
360
|
+
// Extract the scheme, with the path being everything following. We also
|
361
|
+
// handle the case where there is no scheme.
|
362
|
+
if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
|
363
|
+
// Offset the results since we gave ExtractScheme a substring.
|
364
|
+
parsed->scheme.begin += begin;
|
365
|
+
|
366
|
+
// For compatability with the standard URL parser, we treat no path as
|
367
|
+
// -1, rather than having a length of 0 (we normally wouldn't care so
|
368
|
+
// much for these non-standard URLs).
|
369
|
+
if (parsed->scheme.end() == spec_len - 1)
|
370
|
+
parsed->path.reset();
|
371
|
+
else
|
372
|
+
parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len);
|
373
|
+
} else {
|
374
|
+
// No scheme found, just path.
|
375
|
+
parsed->scheme.reset();
|
376
|
+
parsed->path = MakeRange(begin, spec_len);
|
377
|
+
}
|
378
|
+
}
|
379
|
+
|
380
|
+
template<typename CHAR>
|
381
|
+
void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
|
382
|
+
DCHECK(spec_len >= 0);
|
383
|
+
|
384
|
+
// Get the non-path and non-scheme parts of the URL out of the way, we never
|
385
|
+
// use them.
|
386
|
+
parsed->username.reset();
|
387
|
+
parsed->password.reset();
|
388
|
+
parsed->host.reset();
|
389
|
+
parsed->port.reset();
|
390
|
+
parsed->ref.reset();
|
391
|
+
parsed->query.reset(); // May use this; reset for convenience.
|
392
|
+
|
393
|
+
// Strip leading & trailing spaces and control characters.
|
394
|
+
int begin = 0;
|
395
|
+
TrimURL(spec, &begin, &spec_len);
|
396
|
+
|
397
|
+
// Handle empty specs or ones that contain only whitespace or control chars.
|
398
|
+
if (begin == spec_len) {
|
399
|
+
parsed->scheme.reset();
|
400
|
+
parsed->path.reset();
|
401
|
+
return;
|
402
|
+
}
|
403
|
+
|
404
|
+
int path_begin = -1;
|
405
|
+
int path_end = -1;
|
406
|
+
|
407
|
+
// Extract the scheme, with the path being everything following. We also
|
408
|
+
// handle the case where there is no scheme.
|
409
|
+
if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
|
410
|
+
// Offset the results since we gave ExtractScheme a substring.
|
411
|
+
parsed->scheme.begin += begin;
|
412
|
+
|
413
|
+
if (parsed->scheme.end() != spec_len - 1) {
|
414
|
+
path_begin = parsed->scheme.end() + 1;
|
415
|
+
path_end = spec_len;
|
416
|
+
}
|
417
|
+
} else {
|
418
|
+
// No scheme found, just path.
|
419
|
+
parsed->scheme.reset();
|
420
|
+
path_begin = begin;
|
421
|
+
path_end = spec_len;
|
422
|
+
}
|
423
|
+
|
424
|
+
// Split [path_begin, path_end) into a path + query.
|
425
|
+
for (int i = path_begin; i < path_end; ++i) {
|
426
|
+
if (spec[i] == '?') {
|
427
|
+
parsed->query = MakeRange(i + 1, path_end);
|
428
|
+
path_end = i;
|
429
|
+
break;
|
430
|
+
}
|
431
|
+
}
|
432
|
+
|
433
|
+
// For compatability with the standard URL parser, treat no path as
|
434
|
+
// -1, rather than having a length of 0
|
435
|
+
if (path_begin == path_end) {
|
436
|
+
parsed->path.reset();
|
437
|
+
} else {
|
438
|
+
parsed->path = MakeRange(path_begin, path_end);
|
439
|
+
}
|
440
|
+
}
|
441
|
+
|
442
|
+
// Converts a port number in a string to an integer. We'd like to just call
|
443
|
+
// sscanf but our input is not NULL-terminated, which sscanf requires. Instead,
|
444
|
+
// we copy the digits to a small stack buffer (since we know the maximum number
|
445
|
+
// of digits in a valid port number) that we can NULL terminate.
|
446
|
+
template<typename CHAR>
|
447
|
+
int DoParsePort(const CHAR* spec, const Component& component) {
|
448
|
+
// Easy success case when there is no port.
|
449
|
+
const int kMaxDigits = 5;
|
450
|
+
if (!component.is_nonempty())
|
451
|
+
return PORT_UNSPECIFIED;
|
452
|
+
|
453
|
+
// Skip over any leading 0s.
|
454
|
+
Component digits_comp(component.end(), 0);
|
455
|
+
for (int i = 0; i < component.len; i++) {
|
456
|
+
if (spec[component.begin + i] != '0') {
|
457
|
+
digits_comp = MakeRange(component.begin + i, component.end());
|
458
|
+
break;
|
459
|
+
}
|
460
|
+
}
|
461
|
+
if (digits_comp.len == 0)
|
462
|
+
return 0; // All digits were 0.
|
463
|
+
|
464
|
+
// Verify we don't have too many digits (we'll be copying to our buffer so
|
465
|
+
// we need to double-check).
|
466
|
+
if (digits_comp.len > kMaxDigits)
|
467
|
+
return PORT_INVALID;
|
468
|
+
|
469
|
+
// Copy valid digits to the buffer.
|
470
|
+
char digits[kMaxDigits + 1]; // +1 for null terminator
|
471
|
+
for (int i = 0; i < digits_comp.len; i++) {
|
472
|
+
CHAR ch = spec[digits_comp.begin + i];
|
473
|
+
if (!IsPortDigit(ch)) {
|
474
|
+
// Invalid port digit, fail.
|
475
|
+
return PORT_INVALID;
|
476
|
+
}
|
477
|
+
digits[i] = static_cast<char>(ch);
|
478
|
+
}
|
479
|
+
|
480
|
+
// Null-terminate the string and convert to integer. Since we guarantee
|
481
|
+
// only digits, atoi's lack of error handling is OK.
|
482
|
+
digits[digits_comp.len] = 0;
|
483
|
+
int port = atoi(digits);
|
484
|
+
if (port > 65535)
|
485
|
+
return PORT_INVALID; // Out of range.
|
486
|
+
return port;
|
487
|
+
}
|
488
|
+
|
489
|
+
template<typename CHAR>
|
490
|
+
void DoExtractFileName(const CHAR* spec,
|
491
|
+
const Component& path,
|
492
|
+
Component* file_name) {
|
493
|
+
// Handle empty paths: they have no file names.
|
494
|
+
if (!path.is_nonempty()) {
|
495
|
+
file_name->reset();
|
496
|
+
return;
|
497
|
+
}
|
498
|
+
|
499
|
+
// Search backwards for a parameter, which is a normally unused field in a
|
500
|
+
// URL delimited by a semicolon. We parse the parameter as part of the
|
501
|
+
// path, but here, we don't want to count it. The last semicolon is the
|
502
|
+
// parameter. The path should start with a slash, so we don't need to check
|
503
|
+
// the first one.
|
504
|
+
int file_end = path.end();
|
505
|
+
for (int i = path.end() - 1; i > path.begin; i--) {
|
506
|
+
if (spec[i] == ';') {
|
507
|
+
file_end = i;
|
508
|
+
break;
|
509
|
+
}
|
510
|
+
}
|
511
|
+
|
512
|
+
// Now search backwards from the filename end to the previous slash
|
513
|
+
// to find the beginning of the filename.
|
514
|
+
for (int i = file_end - 1; i >= path.begin; i--) {
|
515
|
+
if (IsURLSlash(spec[i])) {
|
516
|
+
// File name is everything following this character to the end
|
517
|
+
*file_name = MakeRange(i + 1, file_end);
|
518
|
+
return;
|
519
|
+
}
|
520
|
+
}
|
521
|
+
|
522
|
+
// No slash found, this means the input was degenerate (generally paths
|
523
|
+
// will start with a slash). Let's call everything the file name.
|
524
|
+
*file_name = MakeRange(path.begin, file_end);
|
525
|
+
return;
|
526
|
+
}
|
527
|
+
|
528
|
+
template<typename CHAR>
|
529
|
+
bool DoExtractQueryKeyValue(const CHAR* spec,
|
530
|
+
Component* query,
|
531
|
+
Component* key,
|
532
|
+
Component* value) {
|
533
|
+
if (!query->is_nonempty())
|
534
|
+
return false;
|
535
|
+
|
536
|
+
int start = query->begin;
|
537
|
+
int cur = start;
|
538
|
+
int end = query->end();
|
539
|
+
|
540
|
+
// We assume the beginning of the input is the beginning of the "key" and we
|
541
|
+
// skip to the end of it.
|
542
|
+
key->begin = cur;
|
543
|
+
while (cur < end && spec[cur] != '&' && spec[cur] != '=')
|
544
|
+
cur++;
|
545
|
+
key->len = cur - key->begin;
|
546
|
+
|
547
|
+
// Skip the separator after the key (if any).
|
548
|
+
if (cur < end && spec[cur] == '=')
|
549
|
+
cur++;
|
550
|
+
|
551
|
+
// Find the value part.
|
552
|
+
value->begin = cur;
|
553
|
+
while (cur < end && spec[cur] != '&')
|
554
|
+
cur++;
|
555
|
+
value->len = cur - value->begin;
|
556
|
+
|
557
|
+
// Finally skip the next separator if any
|
558
|
+
if (cur < end && spec[cur] == '&')
|
559
|
+
cur++;
|
560
|
+
|
561
|
+
// Save the new query
|
562
|
+
*query = url_parse::MakeRange(cur, end);
|
563
|
+
return true;
|
564
|
+
}
|
565
|
+
|
566
|
+
} // namespace
|
567
|
+
|
568
|
+
// Constructs a Parsed without doing any explicit initialization here; each
// component member is left to its own default construction.
Parsed::Parsed() {
}
|
570
|
+
|
571
|
+
int Parsed::Length() const {
|
572
|
+
if (ref.is_valid())
|
573
|
+
return ref.end();
|
574
|
+
return CountCharactersBefore(REF, false);
|
575
|
+
}
|
576
|
+
|
577
|
+
// Returns the number of characters before the component |type|. For PORT,
// QUERY and REF, |include_delimiter| selects whether the result stops before
// the delimiter character preceding the component (true) or before the
// component text itself (false). If the requested component is absent, the
// position of the next component that does exist is used instead, falling
// back to the end of the last component seen.
int Parsed::CountCharactersBefore(ComponentType type,
                                  bool include_delimiter) const {
  if (type == SCHEME)
    return scheme.begin;

  // The scheme is followed by some characters such as "://" and we don't know
  // how many, so walk forward through the components until one is found.
  // |cur| tracks the position just past the last component seen; for a valid
  // scheme that also skips the ':' that ends it.
  int cur = scheme.is_valid() ? scheme.end() + 1 : 0;

  if (username.is_valid()) {
    if (type <= USERNAME)
      return username.begin;
    cur = username.end() + 1;  // Step over the '@' or ':' that follows.
  }

  if (password.is_valid()) {
    if (type <= PASSWORD)
      return password.begin;
    cur = password.end() + 1;  // Step over the '@' that follows.
  }

  if (host.is_valid()) {
    if (type <= HOST)
      return host.begin;
    cur = host.end();
  }

  if (port.is_valid()) {
    // An earlier component was wanted but is missing: stop before the
    // delimiter that introduces the port.
    if (type < PORT)
      return port.begin - 1;
    if (type == PORT)
      return include_delimiter ? port.begin - 1 : port.begin;
    cur = port.end();
  }

  if (path.is_valid()) {
    if (type <= PATH)
      return path.begin;
    cur = path.end();
  }

  if (query.is_valid()) {
    // Same delimiter handling as for the port.
    if (type < QUERY)
      return query.begin - 1;
    if (type == QUERY)
      return include_delimiter ? query.begin - 1 : query.begin;
    cur = query.end();
  }

  if (ref.is_valid()) {
    // Only a request for the ref itself, without its delimiter, stops at the
    // ref text. Every other request that reaches this point wanted something
    // earlier that was not found, so it stops before the ref's delimiter.
    const bool want_ref_text = (type == REF) && !include_delimiter;
    return want_ref_text ? ref.begin : ref.begin - 1;
  }

  return cur;
}
|
639
|
+
|
640
|
+
bool ExtractScheme(const char* url, int url_len, Component* scheme) {
|
641
|
+
return DoExtractScheme(url, url_len, scheme);
|
642
|
+
}
|
643
|
+
|
644
|
+
bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
|
645
|
+
return DoExtractScheme(url, url_len, scheme);
|
646
|
+
}
|
647
|
+
|
648
|
+
// This handles everything that may be an authority terminator, including
|
649
|
+
// backslash. For special backslash handling see DoParseAfterScheme.
|
650
|
+
bool IsAuthorityTerminator(char16 ch) {
|
651
|
+
return IsURLSlash(ch) || ch == '?' || ch == '#';
|
652
|
+
}
|
653
|
+
|
654
|
+
void ExtractFileName(const char* url,
|
655
|
+
const Component& path,
|
656
|
+
Component* file_name) {
|
657
|
+
DoExtractFileName(url, path, file_name);
|
658
|
+
}
|
659
|
+
|
660
|
+
void ExtractFileName(const char16* url,
|
661
|
+
const Component& path,
|
662
|
+
Component* file_name) {
|
663
|
+
DoExtractFileName(url, path, file_name);
|
664
|
+
}
|
665
|
+
|
666
|
+
bool ExtractQueryKeyValue(const char* url,
|
667
|
+
Component* query,
|
668
|
+
Component* key,
|
669
|
+
Component* value) {
|
670
|
+
return DoExtractQueryKeyValue(url, query, key, value);
|
671
|
+
}
|
672
|
+
|
673
|
+
bool ExtractQueryKeyValue(const char16* url,
|
674
|
+
Component* query,
|
675
|
+
Component* key,
|
676
|
+
Component* value) {
|
677
|
+
return DoExtractQueryKeyValue(url, query, key, value);
|
678
|
+
}
|
679
|
+
|
680
|
+
void ParseAuthority(const char* spec,
|
681
|
+
const Component& auth,
|
682
|
+
Component* username,
|
683
|
+
Component* password,
|
684
|
+
Component* hostname,
|
685
|
+
Component* port_num) {
|
686
|
+
DoParseAuthority(spec, auth, username, password, hostname, port_num);
|
687
|
+
}
|
688
|
+
|
689
|
+
void ParseAuthority(const char16* spec,
|
690
|
+
const Component& auth,
|
691
|
+
Component* username,
|
692
|
+
Component* password,
|
693
|
+
Component* hostname,
|
694
|
+
Component* port_num) {
|
695
|
+
DoParseAuthority(spec, auth, username, password, hostname, port_num);
|
696
|
+
}
|
697
|
+
|
698
|
+
int ParsePort(const char* url, const Component& port) {
|
699
|
+
return DoParsePort(url, port);
|
700
|
+
}
|
701
|
+
|
702
|
+
int ParsePort(const char16* url, const Component& port) {
|
703
|
+
return DoParsePort(url, port);
|
704
|
+
}
|
705
|
+
|
706
|
+
void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
|
707
|
+
DoParseStandardURL(url, url_len, parsed);
|
708
|
+
}
|
709
|
+
|
710
|
+
void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) {
|
711
|
+
DoParseStandardURL(url, url_len, parsed);
|
712
|
+
}
|
713
|
+
|
714
|
+
void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
|
715
|
+
DoParsePathURL(url, url_len, parsed);
|
716
|
+
}
|
717
|
+
|
718
|
+
void ParsePathURL(const char16* url, int url_len, Parsed* parsed) {
|
719
|
+
DoParsePathURL(url, url_len, parsed);
|
720
|
+
}
|
721
|
+
|
722
|
+
void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
|
723
|
+
DoParseMailtoURL(url, url_len, parsed);
|
724
|
+
}
|
725
|
+
|
726
|
+
void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) {
|
727
|
+
DoParseMailtoURL(url, url_len, parsed);
|
728
|
+
}
|
729
|
+
|
730
|
+
void ParsePathInternal(const char* spec,
|
731
|
+
const Component& path,
|
732
|
+
Component* filepath,
|
733
|
+
Component* query,
|
734
|
+
Component* ref) {
|
735
|
+
ParsePath(spec, path, filepath, query, ref);
|
736
|
+
}
|
737
|
+
|
738
|
+
void ParsePathInternal(const char16* spec,
|
739
|
+
const Component& path,
|
740
|
+
Component* filepath,
|
741
|
+
Component* query,
|
742
|
+
Component* ref) {
|
743
|
+
ParsePath(spec, path, filepath, query, ref);
|
744
|
+
}
|
745
|
+
|
746
|
+
void ParseAfterScheme(const char* spec,
|
747
|
+
int spec_len,
|
748
|
+
int after_scheme,
|
749
|
+
Parsed* parsed) {
|
750
|
+
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
|
751
|
+
}
|
752
|
+
|
753
|
+
void ParseAfterScheme(const char16* spec,
|
754
|
+
int spec_len,
|
755
|
+
int after_scheme,
|
756
|
+
Parsed* parsed) {
|
757
|
+
DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
|
758
|
+
}
|
759
|
+
|
760
|
+
} // namespace url_parse
|