uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,392 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Canonicalizers for random bits that aren't big enough for their own files.
31
+
32
+ #include <string.h>
33
+
34
+ #include "url_canon.h"
35
+ #include "url_canon_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+ // Returns true if the given character should be removed from the middle of a
42
+ // URL.
43
+ inline bool IsRemovableURLWhitespace(int ch) {
44
+ return ch == '\r' || ch == '\n' || ch == '\t';
45
+ }
46
+
47
+ // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
48
+ // It sucks that we have to do this, since this takes about 13% of the total URL
49
+ // canonicalization time.
50
+ template<typename CHAR>
51
+ const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
52
+ CanonOutputT<CHAR>* buffer,
53
+ int* output_len) {
54
+ // Fast verification that there's nothing that needs removal. This is the 99%
55
+ // case, so we want it to be fast and don't care about impacting the speed
56
+ // when we do find whitespace.
57
+ int found_whitespace = false;
58
+ for (int i = 0; i < input_len; i++) {
59
+ if (!IsRemovableURLWhitespace(input[i]))
60
+ continue;
61
+ found_whitespace = true;
62
+ break;
63
+ }
64
+
65
+ if (!found_whitespace) {
66
+ // Didn't find any whitespace, we don't need to do anything. We can just
67
+ // return the input as the output.
68
+ *output_len = input_len;
69
+ return input;
70
+ }
71
+
72
+ // Remove the whitespace into the new buffer and return it.
73
+ for (int i = 0; i < input_len; i++) {
74
+ if (!IsRemovableURLWhitespace(input[i]))
75
+ buffer->push_back(input[i]);
76
+ }
77
+ *output_len = buffer->length();
78
+ return buffer->data();
79
+ }
80
+
81
+ // Contains the canonical version of each possible input letter in the scheme
82
+ // (basically, lower-cased). The corresponding entry will be 0 if the letter
83
+ // is not allowed in a scheme.
84
+ const char kSchemeCanonical[0x80] = {
85
+ // 00-1f: all are invalid
86
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88
+ // ' ' ! " # $ % & ' ( ) * + , - . /
89
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
90
+ // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
91
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 ,
92
+ // @ A B C D E F G H I J K L M N O
93
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
94
+ // P Q R S T U V W X Y Z [ \ ] ^ _
95
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0,
96
+ // ` a b c d e f g h i j k l m n o
97
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
98
+ // p q r s t u v w x y z { | } ~
99
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 };
100
+
101
+ // This could be a table lookup as well by setting the high bit for each
102
+ // valid character, but it's only called once per URL, and it makes the lookup
103
+ // table easier to read not having extra stuff in it.
104
+ inline bool IsSchemeFirstChar(unsigned char c) {
105
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
106
+ }
107
+
108
+ template<typename CHAR, typename UCHAR>
109
+ bool DoScheme(const CHAR* spec,
110
+ const url_parse::Component& scheme,
111
+ CanonOutput* output,
112
+ url_parse::Component* out_scheme) {
113
+ if (scheme.len <= 0) {
114
+ // Scheme is unspecified or empty, convert to empty by appending a colon.
115
+ *out_scheme = url_parse::Component(output->length(), 0);
116
+ output->push_back(':');
117
+ return true;
118
+ }
119
+
120
+ // The output scheme starts from the current position.
121
+ out_scheme->begin = output->length();
122
+
123
+ // Danger: it's important that this code does not strip any characters: it
124
+ // only emits the canonical version (be it valid or escaped) of each of
125
+ // the input characters. Stripping would put it out of sync with
126
+ // url_util::FindAndCompareScheme, which could cause some security checks on
127
+ // schemes to be incorrect.
128
+ bool success = true;
129
+ int end = scheme.end();
130
+ for (int i = scheme.begin; i < end; i++) {
131
+ UCHAR ch = static_cast<UCHAR>(spec[i]);
132
+ char replacement = 0;
133
+ if (ch < 0x80) {
134
+ if (i == scheme.begin) {
135
+ // Need to do a special check for the first letter of the scheme.
136
+ if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
137
+ replacement = kSchemeCanonical[ch];
138
+ } else {
139
+ replacement = kSchemeCanonical[ch];
140
+ }
141
+ }
142
+
143
+ if (replacement) {
144
+ output->push_back(replacement);
145
+ } else if (ch == '%') {
146
+ // Canonicalizing the scheme multiple times should lead to the same
147
+ // result. Since invalid characters will be escaped, we need to preserve
148
+ // the percent to avoid multiple escaping. The scheme will be invalid.
149
+ success = false;
150
+ output->push_back('%');
151
+ } else {
152
+ // Invalid character, store it but mark this scheme as invalid.
153
+ success = false;
154
+
155
+ // This will escape the output and also handle encoding issues.
156
+ // Ignore the return value since we already failed.
157
+ AppendUTF8EscapedChar(spec, &i, end, output);
158
+ }
159
+ }
160
+
161
+ // The output scheme ends with the current position, before appending
162
+ // the colon.
163
+ out_scheme->len = output->length() - out_scheme->begin;
164
+ output->push_back(':');
165
+ return success;
166
+ }
167
+
168
+ // The username and password components reference ranges in the corresponding
169
+ // *_spec strings. Typically, these specs will be the same (we're
170
+ // canonicalizing a single source string), but may be different when
171
+ // replacing components.
172
+ template<typename CHAR, typename UCHAR>
173
+ bool DoUserInfo(const CHAR* username_spec,
174
+ const url_parse::Component& username,
175
+ const CHAR* password_spec,
176
+ const url_parse::Component& password,
177
+ CanonOutput* output,
178
+ url_parse::Component* out_username,
179
+ url_parse::Component* out_password) {
180
+ if (username.len <= 0 && password.len <= 0) {
181
+ // Common case: no user info. We strip empty username/passwords.
182
+ *out_username = url_parse::Component();
183
+ *out_password = url_parse::Component();
184
+ return true;
185
+ }
186
+
187
+ // Write the username.
188
+ out_username->begin = output->length();
189
+ if (username.len > 0) {
190
+ // This will escape characters not valid for the username.
191
+ AppendStringOfType(&username_spec[username.begin], username.len,
192
+ CHAR_USERINFO, output);
193
+ }
194
+ out_username->len = output->length() - out_username->begin;
195
+
196
+ // When there is a password, we need the separator. Note that we strip
197
+ // empty but specified passwords.
198
+ if (password.len > 0) {
199
+ output->push_back(':');
200
+ out_password->begin = output->length();
201
+ AppendStringOfType(&password_spec[password.begin], password.len,
202
+ CHAR_USERINFO, output);
203
+ out_password->len = output->length() - out_password->begin;
204
+ } else {
205
+ *out_password = url_parse::Component();
206
+ }
207
+
208
+ output->push_back('@');
209
+ return true;
210
+ }
211
+
212
+ // Helper functions for converting port integers to strings.
213
+ inline void WritePortInt(char* output, int output_len, int port) {
214
+ _itoa_s(port, output, output_len, 10);
215
+ }
216
+
217
+ // This function will prepend the colon if there will be a port.
218
+ template<typename CHAR, typename UCHAR>
219
+ bool DoPort(const CHAR* spec,
220
+ const url_parse::Component& port,
221
+ int default_port_for_scheme,
222
+ CanonOutput* output,
223
+ url_parse::Component* out_port) {
224
+ int port_num = url_parse::ParsePort(spec, port);
225
+ if (port_num == url_parse::PORT_UNSPECIFIED ||
226
+ port_num == default_port_for_scheme) {
227
+ *out_port = url_parse::Component();
228
+ return true; // Leave port empty.
229
+ }
230
+
231
+ if (port_num == url_parse::PORT_INVALID) {
232
+ // Invalid port: We'll copy the text from the input so the user can see
233
+ // what the error was, and mark the URL as invalid by returning false.
234
+ output->push_back(':');
235
+ out_port->begin = output->length();
236
+ AppendInvalidNarrowString(spec, port.begin, port.end(), output);
237
+ out_port->len = output->length() - out_port->begin;
238
+ return false;
239
+ }
240
+
241
+ // Convert the port number back to a string. Max port value is 5 digits, and
242
+ // the Parsed::ExtractPort will have made sure the integer is in range.
243
+ const int buf_size = 6;
244
+ char buf[buf_size];
245
+ WritePortInt(buf, buf_size, port_num);
246
+
247
+ // Append the port number to the output, preceded by a colon.
248
+ output->push_back(':');
249
+ out_port->begin = output->length();
250
+ for (int i = 0; i < buf_size && buf[i]; i++)
251
+ output->push_back(buf[i]);
252
+
253
+ out_port->len = output->length() - out_port->begin;
254
+ return true;
255
+ }
256
+
257
+ template<typename CHAR, typename UCHAR>
258
+ void DoCanonicalizeRef(const CHAR* spec,
259
+ const url_parse::Component& ref,
260
+ CanonOutput* output,
261
+ url_parse::Component* out_ref) {
262
+ if (ref.len < 0) {
263
+ // Common case of no ref.
264
+ *out_ref = url_parse::Component();
265
+ return;
266
+ }
267
+
268
+ // Append the ref separator. Note that we need to do this even when the ref
269
+ // is empty but present.
270
+ output->push_back('#');
271
+ out_ref->begin = output->length();
272
+
273
+ // Now iterate through all the characters, converting to UTF-8 and validating.
274
+ int end = ref.end();
275
+ for (int i = ref.begin; i < end; i++) {
276
+ if (spec[i] == 0) {
277
+ // IE just strips NULLs, so we do too.
278
+ continue;
279
+ } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
280
+ // Unlike what IE seems to do, we escape control characters. This will probably
281
+ // make the reference fragment unusable on a web page, but people
282
+ // shouldn't be using control characters in their anchor names.
283
+ AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
284
+ } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
285
+ // Normal ASCII characters are just appended.
286
+ output->push_back(static_cast<char>(spec[i]));
287
+ } else {
288
+ // Non-ASCII characters are appended unescaped, but only when they are
289
+ // valid. Invalid Unicode characters are replaced with the "invalid
290
+ // character" as IE seems to (ReadUTFChar puts the unicode replacement
291
+ // character in the output on failure for us).
292
+ unsigned code_point;
293
+ ReadUTFChar(spec, &i, end, &code_point);
294
+ AppendUTF8Value(code_point, output);
295
+ }
296
+ }
297
+
298
+ out_ref->len = output->length() - out_ref->begin;
299
+ }
300
+
301
+ } // namespace
302
+
303
+ const char* RemoveURLWhitespace(const char* input, int input_len,
304
+ CanonOutputT<char>* buffer,
305
+ int* output_len) {
306
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
307
+ }
308
+
309
+ const char16* RemoveURLWhitespace(const char16* input, int input_len,
310
+ CanonOutputT<char16>* buffer,
311
+ int* output_len) {
312
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
313
+ }
314
+
315
+ char CanonicalSchemeChar(char16 ch) {
316
+ if (ch >= 0x80)
317
+ return 0; // Non-ASCII is not supported by schemes.
318
+ return kSchemeCanonical[ch];
319
+ }
320
+
321
+ bool CanonicalizeScheme(const char* spec,
322
+ const url_parse::Component& scheme,
323
+ CanonOutput* output,
324
+ url_parse::Component* out_scheme) {
325
+ return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
326
+ }
327
+
328
+ bool CanonicalizeScheme(const char16* spec,
329
+ const url_parse::Component& scheme,
330
+ CanonOutput* output,
331
+ url_parse::Component* out_scheme) {
332
+ return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
333
+ }
334
+
335
+ bool CanonicalizeUserInfo(const char* username_source,
336
+ const url_parse::Component& username,
337
+ const char* password_source,
338
+ const url_parse::Component& password,
339
+ CanonOutput* output,
340
+ url_parse::Component* out_username,
341
+ url_parse::Component* out_password) {
342
+ return DoUserInfo<char, unsigned char>(
343
+ username_source, username, password_source, password,
344
+ output, out_username, out_password);
345
+ }
346
+
347
+ bool CanonicalizeUserInfo(const char16* username_source,
348
+ const url_parse::Component& username,
349
+ const char16* password_source,
350
+ const url_parse::Component& password,
351
+ CanonOutput* output,
352
+ url_parse::Component* out_username,
353
+ url_parse::Component* out_password) {
354
+ return DoUserInfo<char16, char16>(
355
+ username_source, username, password_source, password,
356
+ output, out_username, out_password);
357
+ }
358
+
359
+ bool CanonicalizePort(const char* spec,
360
+ const url_parse::Component& port,
361
+ int default_port_for_scheme,
362
+ CanonOutput* output,
363
+ url_parse::Component* out_port) {
364
+ return DoPort<char, unsigned char>(spec, port,
365
+ default_port_for_scheme,
366
+ output, out_port);
367
+ }
368
+
369
+ bool CanonicalizePort(const char16* spec,
370
+ const url_parse::Component& port,
371
+ int default_port_for_scheme,
372
+ CanonOutput* output,
373
+ url_parse::Component* out_port) {
374
+ return DoPort<char16, char16>(spec, port, default_port_for_scheme,
375
+ output, out_port);
376
+ }
377
+
378
+ void CanonicalizeRef(const char* spec,
379
+ const url_parse::Component& ref,
380
+ CanonOutput* output,
381
+ url_parse::Component* out_ref) {
382
+ DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
383
+ }
384
+
385
+ void CanonicalizeRef(const char16* spec,
386
+ const url_parse::Component& ref,
387
+ CanonOutput* output,
388
+ url_parse::Component* out_ref) {
389
+ DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
390
+ }
391
+
392
+ } // namespace url_canon
@@ -0,0 +1,215 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Functions for canonicalizing "file:" URLs.
31
+
32
+ #include "url_canon.h"
33
+ #include "url_canon_internal.h"
34
+ #include "url_file.h"
35
+ #include "url_parse_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+ #ifdef WIN32
42
+
43
+ // Given a pointer into the spec, this copies and canonicalizes the drive
44
+ // letter and colon to the output, if one is found. If there is not a drive
45
+ // spec, it won't do anything. The index of the next character in the input
46
+ // spec is returned (after the colon when a drive spec is found, the begin
47
+ // offset if one is not).
48
+ template<typename CHAR>
49
+ int FileDoDriveSpec(const CHAR* spec, int begin, int end,
50
+ CanonOutput* output) {
51
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
52
+ // (with backslashes instead of slashes as well).
53
+ int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end);
54
+ int after_slashes = begin + num_slashes;
55
+
56
+ if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end))
57
+ return begin; // Haven't consumed any characters
58
+
59
+ // A drive spec is the start of a path, so we need to add a slash for the
60
+ // authority terminator (typically the third slash).
61
+ output->push_back('/');
62
+
63
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
64
+ // and that it is followed by a colon/pipe.
65
+
66
+ // Normalize Windows drive letters to uppercase
67
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
68
+ output->push_back(spec[after_slashes] - 'a' + 'A');
69
+ else
70
+ output->push_back(static_cast<char>(spec[after_slashes]));
71
+
72
+ // Normalize the character following it to a colon rather than pipe.
73
+ output->push_back(':');
74
+ return after_slashes + 2;
75
+ }
76
+
77
+ #endif // WIN32
78
+
79
+ template<typename CHAR, typename UCHAR>
80
+ bool DoFileCanonicalizePath(const CHAR* spec,
81
+ const url_parse::Component& path,
82
+ CanonOutput* output,
83
+ url_parse::Component* out_path) {
84
+ // Copies and normalizes the "c:" at the beginning, if present.
85
+ out_path->begin = output->length();
86
+ int after_drive;
87
+ #ifdef WIN32
88
+ after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
89
+ #else
90
+ after_drive = path.begin;
91
+ #endif
92
+
93
+ // Copies the rest of the path, starting from the slash following the
94
+ // drive colon (if any, Windows only), or the first slash of the path.
95
+ bool success = true;
96
+ if (after_drive < path.end()) {
97
+ // Use the regular path canonicalizer to canonicalize the rest of the
98
+ // path. Give it a fake output component to write into; this function
99
+ // computes the full path component itself once the sub-path is written.
100
+ url_parse::Component sub_path =
101
+ url_parse::MakeRange(after_drive, path.end());
102
+ url_parse::Component fake_output_path;
103
+ success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
104
+ } else {
105
+ // No input path, canonicalize to a slash.
106
+ output->push_back('/');
107
+ }
108
+
109
+ out_path->len = output->length() - out_path->begin;
110
+ return success;
111
+ }
112
+
113
+ template<typename CHAR, typename UCHAR>
114
+ bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
115
+ const url_parse::Parsed& parsed,
116
+ CharsetConverter* query_converter,
117
+ CanonOutput* output,
118
+ url_parse::Parsed* new_parsed) {
119
+ // Things we don't set in file: URLs.
120
+ new_parsed->username = url_parse::Component();
121
+ new_parsed->password = url_parse::Component();
122
+ new_parsed->port = url_parse::Component();
123
+
124
+ // Scheme (known, so we don't bother running it through the more
125
+ // complicated scheme canonicalizer).
126
+ new_parsed->scheme.begin = output->length();
127
+ output->Append("file://", 7);
128
+ new_parsed->scheme.len = 4;
129
+
130
+ // Append the host. For many file URLs, this will be empty. For UNC, this
131
+ // will be present.
132
+ // TODO(brettw) This doesn't do any checking for host name validity. We
133
+ // should probably handle validity checking of UNC hosts differently than
134
+ // for regular IP hosts.
135
+ bool success = CanonicalizeHost(source.host, parsed.host,
136
+ output, &new_parsed->host);
137
+ success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
138
+ output, &new_parsed->path);
139
+ CanonicalizeQuery(source.query, parsed.query, query_converter,
140
+ output, &new_parsed->query);
141
+
142
+ // Ignore failure for refs since the URL can probably still be loaded.
143
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
144
+
145
+ return success;
146
+ }
147
+
148
+ } // namespace
149
+
150
+ bool CanonicalizeFileURL(const char* spec,
151
+ int spec_len,
152
+ const url_parse::Parsed& parsed,
153
+ CharsetConverter* query_converter,
154
+ CanonOutput* output,
155
+ url_parse::Parsed* new_parsed) {
156
+ return DoCanonicalizeFileURL<char, unsigned char>(
157
+ URLComponentSource<char>(spec), parsed, query_converter,
158
+ output, new_parsed);
159
+ }
160
+
161
+ bool CanonicalizeFileURL(const char16* spec,
162
+ int spec_len,
163
+ const url_parse::Parsed& parsed,
164
+ CharsetConverter* query_converter,
165
+ CanonOutput* output,
166
+ url_parse::Parsed* new_parsed) {
167
+ return DoCanonicalizeFileURL<char16, char16>(
168
+ URLComponentSource<char16>(spec), parsed, query_converter,
169
+ output, new_parsed);
170
+ }
171
+
172
+ bool FileCanonicalizePath(const char* spec,
173
+ const url_parse::Component& path,
174
+ CanonOutput* output,
175
+ url_parse::Component* out_path) {
176
+ return DoFileCanonicalizePath<char, unsigned char>(spec, path,
177
+ output, out_path);
178
+ }
179
+
180
+ bool FileCanonicalizePath(const char16* spec,
181
+ const url_parse::Component& path,
182
+ CanonOutput* output,
183
+ url_parse::Component* out_path) {
184
+ return DoFileCanonicalizePath<char16, char16>(spec, path,
185
+ output, out_path);
186
+ }
187
+
188
+ bool ReplaceFileURL(const char* base,
189
+ const url_parse::Parsed& base_parsed,
190
+ const Replacements<char>& replacements,
191
+ CharsetConverter* query_converter,
192
+ CanonOutput* output,
193
+ url_parse::Parsed* new_parsed) {
194
+ URLComponentSource<char> source(base);
195
+ url_parse::Parsed parsed(base_parsed);
196
+ SetupOverrideComponents(base, replacements, &source, &parsed);
197
+ return DoCanonicalizeFileURL<char, unsigned char>(
198
+ source, parsed, query_converter, output, new_parsed);
199
+ }
200
+
201
+ bool ReplaceFileURL(const char* base,
202
+ const url_parse::Parsed& base_parsed,
203
+ const Replacements<char16>& replacements,
204
+ CharsetConverter* query_converter,
205
+ CanonOutput* output,
206
+ url_parse::Parsed* new_parsed) {
207
+ RawCanonOutput<1024> utf8;
208
+ URLComponentSource<char> source(base);
209
+ url_parse::Parsed parsed(base_parsed);
210
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
211
+ return DoCanonicalizeFileURL<char, unsigned char>(
212
+ source, parsed, query_converter, output, new_parsed);
213
+ }
214
+
215
+ } // namespace url_canon