uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,392 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Canonicalizers for random bits that aren't big enough for their own files.
31
+
32
+ #include <string.h>
33
+
34
+ #include "url_canon.h"
35
+ #include "url_canon_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
// Reports whether |ch| is one of the characters that get dropped when they
// show up in the middle of a URL: carriage return, line feed, or tab.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
46
+
47
+ // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
48
+ // It sucks that we have to do this, since this takes about 13% of the total URL
49
+ // canonicalization time.
50
+ template<typename CHAR>
51
+ const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
52
+ CanonOutputT<CHAR>* buffer,
53
+ int* output_len) {
54
+ // Fast verification that there's nothing that needs removal. This is the 99%
55
+ // case, so we want it to be fast and don't care about impacting the speed
56
+ // when we do find whitespace.
57
+ int found_whitespace = false;
58
+ for (int i = 0; i < input_len; i++) {
59
+ if (!IsRemovableURLWhitespace(input[i]))
60
+ continue;
61
+ found_whitespace = true;
62
+ break;
63
+ }
64
+
65
+ if (!found_whitespace) {
66
+ // Didn't find any whitespace, we don't need to do anything. We can just
67
+ // return the input as the output.
68
+ *output_len = input_len;
69
+ return input;
70
+ }
71
+
72
+ // Remove the whitespace into the new buffer and return it.
73
+ for (int i = 0; i < input_len; i++) {
74
+ if (!IsRemovableURLWhitespace(input[i]))
75
+ buffer->push_back(input[i]);
76
+ }
77
+ *output_len = buffer->length();
78
+ return buffer->data();
79
+ }
80
+
81
// Lookup table indexed by a 7-bit ASCII code. Each entry holds the canonical
// (lower-cased) form of that character when it may appear in a scheme, or 0
// when the character is not permitted in a scheme.
const char kSchemeCanonical[0x80] = {
    // 0x00-0x0f: control characters, all rejected.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10-0x1f: control characters, all rejected.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x2f: space through '/'; only '+', '-' and '.' are accepted.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
    // 0x30-0x3f: digits map to themselves; ':' through '?' are rejected.
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0,
    // 0x40-0x4f: '@' is rejected; 'A'-'O' are lower-cased.
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x5f: 'P'-'Z' are lower-cased; '[' through '_' are rejected.
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
    // 0x60-0x6f: '`' is rejected; 'a'-'o' map to themselves.
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x7f: 'p'-'z' map to themselves; '{' through DEL are rejected.
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
};
100
+
101
// Reports whether |c| may start a scheme, i.e. whether it is an ASCII letter
// of either case. This is computed rather than folded into the lookup table:
// it runs only once per URL, and the table stays easier to read without an
// extra flag bit in every entry.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
107
+
108
+ template<typename CHAR, typename UCHAR>
109
+ bool DoScheme(const CHAR* spec,
110
+ const url_parse::Component& scheme,
111
+ CanonOutput* output,
112
+ url_parse::Component* out_scheme) {
113
+ if (scheme.len <= 0) {
114
+ // Scheme is unspecified or empty, convert to empty by appending a colon.
115
+ *out_scheme = url_parse::Component(output->length(), 0);
116
+ output->push_back(':');
117
+ return true;
118
+ }
119
+
120
+ // The output scheme starts from the current position.
121
+ out_scheme->begin = output->length();
122
+
123
+ // Danger: it's important that this code does not strip any characters: it
124
+ // only emits the canonical version (be it valid or escaped) of each of
125
+ // the input characters. Stripping would put it out of sync with
126
+ // url_util::FindAndCompareScheme, which could cause some security checks on
127
+ // schemes to be incorrect.
128
+ bool success = true;
129
+ int end = scheme.end();
130
+ for (int i = scheme.begin; i < end; i++) {
131
+ UCHAR ch = static_cast<UCHAR>(spec[i]);
132
+ char replacement = 0;
133
+ if (ch < 0x80) {
134
+ if (i == scheme.begin) {
135
+ // Need to do a special check for the first letter of the scheme.
136
+ if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
137
+ replacement = kSchemeCanonical[ch];
138
+ } else {
139
+ replacement = kSchemeCanonical[ch];
140
+ }
141
+ }
142
+
143
+ if (replacement) {
144
+ output->push_back(replacement);
145
+ } else if (ch == '%') {
146
+ // Canonicalizing the scheme multiple times should lead to the same
147
+ // result. Since invalid characters will be escaped, we need to preserve
148
+ // the percent to avoid multiple escaping. The scheme will be invalid.
149
+ success = false;
150
+ output->push_back('%');
151
+ } else {
152
+ // Invalid character, store it but mark this scheme as invalid.
153
+ success = false;
154
+
155
+ // This will escape the output and also handle encoding issues.
156
+ // Ignore the return value since we already failed.
157
+ AppendUTF8EscapedChar(spec, &i, end, output);
158
+ }
159
+ }
160
+
161
+ // The output scheme ends with the the current position, before appending
162
+ // the colon.
163
+ out_scheme->len = output->length() - out_scheme->begin;
164
+ output->push_back(':');
165
+ return success;
166
+ }
167
+
168
+ // The username and password components reference ranges in the corresponding
169
+ // *_spec strings. Typically, these specs will be the same (we're
170
+ // canonicalizing a single source string), but may be different when
171
+ // replacing components.
172
+ template<typename CHAR, typename UCHAR>
173
+ bool DoUserInfo(const CHAR* username_spec,
174
+ const url_parse::Component& username,
175
+ const CHAR* password_spec,
176
+ const url_parse::Component& password,
177
+ CanonOutput* output,
178
+ url_parse::Component* out_username,
179
+ url_parse::Component* out_password) {
180
+ if (username.len <= 0 && password.len <= 0) {
181
+ // Common case: no user info. We strip empty username/passwords.
182
+ *out_username = url_parse::Component();
183
+ *out_password = url_parse::Component();
184
+ return true;
185
+ }
186
+
187
+ // Write the username.
188
+ out_username->begin = output->length();
189
+ if (username.len > 0) {
190
+ // This will escape characters not valid for the username.
191
+ AppendStringOfType(&username_spec[username.begin], username.len,
192
+ CHAR_USERINFO, output);
193
+ }
194
+ out_username->len = output->length() - out_username->begin;
195
+
196
+ // When there is a password, we need the separator. Note that we strip
197
+ // empty but specified passwords.
198
+ if (password.len > 0) {
199
+ output->push_back(':');
200
+ out_password->begin = output->length();
201
+ AppendStringOfType(&password_spec[password.begin], password.len,
202
+ CHAR_USERINFO, output);
203
+ out_password->len = output->length() - out_password->begin;
204
+ } else {
205
+ *out_password = url_parse::Component();
206
+ }
207
+
208
+ output->push_back('@');
209
+ return true;
210
+ }
211
+
212
+ // Helper functions for converting port integers to strings.
213
+ inline void WritePortInt(char* output, int output_len, int port) {
214
+ _itoa_s(port, output, output_len, 10);
215
+ }
216
+
217
+ // This function will prepend the colon if there will be a port.
218
+ template<typename CHAR, typename UCHAR>
219
+ bool DoPort(const CHAR* spec,
220
+ const url_parse::Component& port,
221
+ int default_port_for_scheme,
222
+ CanonOutput* output,
223
+ url_parse::Component* out_port) {
224
+ int port_num = url_parse::ParsePort(spec, port);
225
+ if (port_num == url_parse::PORT_UNSPECIFIED ||
226
+ port_num == default_port_for_scheme) {
227
+ *out_port = url_parse::Component();
228
+ return true; // Leave port empty.
229
+ }
230
+
231
+ if (port_num == url_parse::PORT_INVALID) {
232
+ // Invalid port: We'll copy the text from the input so the user can see
233
+ // what the error was, and mark the URL as invalid by returning false.
234
+ output->push_back(':');
235
+ out_port->begin = output->length();
236
+ AppendInvalidNarrowString(spec, port.begin, port.end(), output);
237
+ out_port->len = output->length() - out_port->begin;
238
+ return false;
239
+ }
240
+
241
+ // Convert port number back to an integer. Max port value is 5 digits, and
242
+ // the Parsed::ExtractPort will have made sure the integer is in range.
243
+ const int buf_size = 6;
244
+ char buf[buf_size];
245
+ WritePortInt(buf, buf_size, port_num);
246
+
247
+ // Append the port number to the output, preceeded by a colon.
248
+ output->push_back(':');
249
+ out_port->begin = output->length();
250
+ for (int i = 0; i < buf_size && buf[i]; i++)
251
+ output->push_back(buf[i]);
252
+
253
+ out_port->len = output->length() - out_port->begin;
254
+ return true;
255
+ }
256
+
257
+ template<typename CHAR, typename UCHAR>
258
+ void DoCanonicalizeRef(const CHAR* spec,
259
+ const url_parse::Component& ref,
260
+ CanonOutput* output,
261
+ url_parse::Component* out_ref) {
262
+ if (ref.len < 0) {
263
+ // Common case of no ref.
264
+ *out_ref = url_parse::Component();
265
+ return;
266
+ }
267
+
268
+ // Append the ref separator. Note that we need to do this even when the ref
269
+ // is empty but present.
270
+ output->push_back('#');
271
+ out_ref->begin = output->length();
272
+
273
+ // Now iterate through all the characters, converting to UTF-8 and validating.
274
+ int end = ref.end();
275
+ for (int i = ref.begin; i < end; i++) {
276
+ if (spec[i] == 0) {
277
+ // IE just strips NULLs, so we do too.
278
+ continue;
279
+ } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
280
+ // Unline IE seems to, we escape control characters. This will probably
281
+ // make the reference fragment unusable on a web page, but people
282
+ // shouldn't be using control characters in their anchor names.
283
+ AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
284
+ } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
285
+ // Normal ASCII characters are just appended.
286
+ output->push_back(static_cast<char>(spec[i]));
287
+ } else {
288
+ // Non-ASCII characters are appended unescaped, but only when they are
289
+ // valid. Invalid Unicode characters are replaced with the "invalid
290
+ // character" as IE seems to (ReadUTFChar puts the unicode replacement
291
+ // character in the output on failure for us).
292
+ unsigned code_point;
293
+ ReadUTFChar(spec, &i, end, &code_point);
294
+ AppendUTF8Value(code_point, output);
295
+ }
296
+ }
297
+
298
+ out_ref->len = output->length() - out_ref->begin;
299
+ }
300
+
301
+ } // namespace
302
+
303
+ const char* RemoveURLWhitespace(const char* input, int input_len,
304
+ CanonOutputT<char>* buffer,
305
+ int* output_len) {
306
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
307
+ }
308
+
309
+ const char16* RemoveURLWhitespace(const char16* input, int input_len,
310
+ CanonOutputT<char16>* buffer,
311
+ int* output_len) {
312
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
313
+ }
314
+
315
+ char CanonicalSchemeChar(char16 ch) {
316
+ if (ch >= 0x80)
317
+ return 0; // Non-ASCII is not supported by schemes.
318
+ return kSchemeCanonical[ch];
319
+ }
320
+
321
+ bool CanonicalizeScheme(const char* spec,
322
+ const url_parse::Component& scheme,
323
+ CanonOutput* output,
324
+ url_parse::Component* out_scheme) {
325
+ return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
326
+ }
327
+
328
+ bool CanonicalizeScheme(const char16* spec,
329
+ const url_parse::Component& scheme,
330
+ CanonOutput* output,
331
+ url_parse::Component* out_scheme) {
332
+ return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
333
+ }
334
+
335
+ bool CanonicalizeUserInfo(const char* username_source,
336
+ const url_parse::Component& username,
337
+ const char* password_source,
338
+ const url_parse::Component& password,
339
+ CanonOutput* output,
340
+ url_parse::Component* out_username,
341
+ url_parse::Component* out_password) {
342
+ return DoUserInfo<char, unsigned char>(
343
+ username_source, username, password_source, password,
344
+ output, out_username, out_password);
345
+ }
346
+
347
+ bool CanonicalizeUserInfo(const char16* username_source,
348
+ const url_parse::Component& username,
349
+ const char16* password_source,
350
+ const url_parse::Component& password,
351
+ CanonOutput* output,
352
+ url_parse::Component* out_username,
353
+ url_parse::Component* out_password) {
354
+ return DoUserInfo<char16, char16>(
355
+ username_source, username, password_source, password,
356
+ output, out_username, out_password);
357
+ }
358
+
359
+ bool CanonicalizePort(const char* spec,
360
+ const url_parse::Component& port,
361
+ int default_port_for_scheme,
362
+ CanonOutput* output,
363
+ url_parse::Component* out_port) {
364
+ return DoPort<char, unsigned char>(spec, port,
365
+ default_port_for_scheme,
366
+ output, out_port);
367
+ }
368
+
369
+ bool CanonicalizePort(const char16* spec,
370
+ const url_parse::Component& port,
371
+ int default_port_for_scheme,
372
+ CanonOutput* output,
373
+ url_parse::Component* out_port) {
374
+ return DoPort<char16, char16>(spec, port, default_port_for_scheme,
375
+ output, out_port);
376
+ }
377
+
378
+ void CanonicalizeRef(const char* spec,
379
+ const url_parse::Component& ref,
380
+ CanonOutput* output,
381
+ url_parse::Component* out_ref) {
382
+ DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
383
+ }
384
+
385
+ void CanonicalizeRef(const char16* spec,
386
+ const url_parse::Component& ref,
387
+ CanonOutput* output,
388
+ url_parse::Component* out_ref) {
389
+ DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
390
+ }
391
+
392
+ } // namespace url_canon
@@ -0,0 +1,215 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Functions for canonicalizing "file:" URLs.
31
+
32
+ #include "url_canon.h"
33
+ #include "url_canon_internal.h"
34
+ #include "url_file.h"
35
+ #include "url_parse_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+ #ifdef WIN32
42
+
43
+ // Given a pointer into the spec, this copies and canonicalizes the drive
44
+ // letter and colon to the output, if one is found. If there is not a drive
45
+ // spec, it won't do anything. The index of the next character in the input
46
+ // spec is returned (after the colon when a drive spec is found, the begin
47
+ // offset if one is not).
48
+ template<typename CHAR>
49
+ int FileDoDriveSpec(const CHAR* spec, int begin, int end,
50
+ CanonOutput* output) {
51
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
52
+ // (with backslashes instead of slashes as well).
53
+ int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end);
54
+ int after_slashes = begin + num_slashes;
55
+
56
+ if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end))
57
+ return begin; // Haven't consumed any characters
58
+
59
+ // A drive spec is the start of a path, so we need to add a slash for the
60
+ // authority terminator (typically the third slash).
61
+ output->push_back('/');
62
+
63
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
64
+ // and that it is followed by a colon/pipe.
65
+
66
+ // Normalize Windows drive letters to uppercase
67
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
68
+ output->push_back(spec[after_slashes] - 'a' + 'A');
69
+ else
70
+ output->push_back(static_cast<char>(spec[after_slashes]));
71
+
72
+ // Normalize the character following it to a colon rather than pipe.
73
+ output->push_back(':');
74
+ return after_slashes + 2;
75
+ }
76
+
77
+ #endif // WIN32
78
+
79
+ template<typename CHAR, typename UCHAR>
80
+ bool DoFileCanonicalizePath(const CHAR* spec,
81
+ const url_parse::Component& path,
82
+ CanonOutput* output,
83
+ url_parse::Component* out_path) {
84
+ // Copies and normalizes the "c:" at the beginning, if present.
85
+ out_path->begin = output->length();
86
+ int after_drive;
87
+ #ifdef WIN32
88
+ after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
89
+ #else
90
+ after_drive = path.begin;
91
+ #endif
92
+
93
+ // Copies the rest of the path, starting from the slash following the
94
+ // drive colon (if any, Windows only), or the first slash of the path.
95
+ bool success = true;
96
+ if (after_drive < path.end()) {
97
+ // Use the regular path canonicalizer to canonicalize the rest of the
98
+ // path. Give it a fake output component to write into. DoCanonicalizeFile
99
+ // will compute the full path component.
100
+ url_parse::Component sub_path =
101
+ url_parse::MakeRange(after_drive, path.end());
102
+ url_parse::Component fake_output_path;
103
+ success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
104
+ } else {
105
+ // No input path, canonicalize to a slash.
106
+ output->push_back('/');
107
+ }
108
+
109
+ out_path->len = output->length() - out_path->begin;
110
+ return success;
111
+ }
112
+
113
+ template<typename CHAR, typename UCHAR>
114
+ bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
115
+ const url_parse::Parsed& parsed,
116
+ CharsetConverter* query_converter,
117
+ CanonOutput* output,
118
+ url_parse::Parsed* new_parsed) {
119
+ // Things we don't set in file: URLs.
120
+ new_parsed->username = url_parse::Component();
121
+ new_parsed->password = url_parse::Component();
122
+ new_parsed->port = url_parse::Component();
123
+
124
+ // Scheme (known, so we don't bother running it through the more
125
+ // complicated scheme canonicalizer).
126
+ new_parsed->scheme.begin = output->length();
127
+ output->Append("file://", 7);
128
+ new_parsed->scheme.len = 4;
129
+
130
+ // Append the host. For many file URLs, this will be empty. For UNC, this
131
+ // will be present.
132
+ // TODO(brettw) This doesn't do any checking for host name validity. We
133
+ // should probably handle validity checking of UNC hosts differently than
134
+ // for regular IP hosts.
135
+ bool success = CanonicalizeHost(source.host, parsed.host,
136
+ output, &new_parsed->host);
137
+ success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
138
+ output, &new_parsed->path);
139
+ CanonicalizeQuery(source.query, parsed.query, query_converter,
140
+ output, &new_parsed->query);
141
+
142
+ // Ignore failure for refs since the URL can probably still be loaded.
143
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
144
+
145
+ return success;
146
+ }
147
+
148
+ } // namespace
149
+
150
+ bool CanonicalizeFileURL(const char* spec,
151
+ int spec_len,
152
+ const url_parse::Parsed& parsed,
153
+ CharsetConverter* query_converter,
154
+ CanonOutput* output,
155
+ url_parse::Parsed* new_parsed) {
156
+ return DoCanonicalizeFileURL<char, unsigned char>(
157
+ URLComponentSource<char>(spec), parsed, query_converter,
158
+ output, new_parsed);
159
+ }
160
+
161
+ bool CanonicalizeFileURL(const char16* spec,
162
+ int spec_len,
163
+ const url_parse::Parsed& parsed,
164
+ CharsetConverter* query_converter,
165
+ CanonOutput* output,
166
+ url_parse::Parsed* new_parsed) {
167
+ return DoCanonicalizeFileURL<char16, char16>(
168
+ URLComponentSource<char16>(spec), parsed, query_converter,
169
+ output, new_parsed);
170
+ }
171
+
172
+ bool FileCanonicalizePath(const char* spec,
173
+ const url_parse::Component& path,
174
+ CanonOutput* output,
175
+ url_parse::Component* out_path) {
176
+ return DoFileCanonicalizePath<char, unsigned char>(spec, path,
177
+ output, out_path);
178
+ }
179
+
180
+ bool FileCanonicalizePath(const char16* spec,
181
+ const url_parse::Component& path,
182
+ CanonOutput* output,
183
+ url_parse::Component* out_path) {
184
+ return DoFileCanonicalizePath<char16, char16>(spec, path,
185
+ output, out_path);
186
+ }
187
+
188
+ bool ReplaceFileURL(const char* base,
189
+ const url_parse::Parsed& base_parsed,
190
+ const Replacements<char>& replacements,
191
+ CharsetConverter* query_converter,
192
+ CanonOutput* output,
193
+ url_parse::Parsed* new_parsed) {
194
+ URLComponentSource<char> source(base);
195
+ url_parse::Parsed parsed(base_parsed);
196
+ SetupOverrideComponents(base, replacements, &source, &parsed);
197
+ return DoCanonicalizeFileURL<char, unsigned char>(
198
+ source, parsed, query_converter, output, new_parsed);
199
+ }
200
+
201
+ bool ReplaceFileURL(const char* base,
202
+ const url_parse::Parsed& base_parsed,
203
+ const Replacements<char16>& replacements,
204
+ CharsetConverter* query_converter,
205
+ CanonOutput* output,
206
+ url_parse::Parsed* new_parsed) {
207
+ RawCanonOutput<1024> utf8;
208
+ URLComponentSource<char> source(base);
209
+ url_parse::Parsed parsed(base_parsed);
210
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
211
+ return DoCanonicalizeFileURL<char, unsigned char>(
212
+ source, parsed, query_converter, output, new_parsed);
213
+ }
214
+
215
+ } // namespace url_canon