uri_parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.rvmrc +1 -0
- data/Gemfile +6 -0
- data/Rakefile +13 -0
- data/ext/uri_parser/basictypes.h +89 -0
- data/ext/uri_parser/extconf.h +6 -0
- data/ext/uri_parser/extconf.rb +50 -0
- data/ext/uri_parser/logging.h +5 -0
- data/ext/uri_parser/scoped_ptr.h +322 -0
- data/ext/uri_parser/string16.cc +95 -0
- data/ext/uri_parser/string16.h +194 -0
- data/ext/uri_parser/uri_parser.cc +87 -0
- data/ext/uri_parser/url_canon.h +872 -0
- data/ext/uri_parser/url_canon_etc.cc +392 -0
- data/ext/uri_parser/url_canon_fileurl.cc +215 -0
- data/ext/uri_parser/url_canon_host.cc +401 -0
- data/ext/uri_parser/url_canon_icu.cc +207 -0
- data/ext/uri_parser/url_canon_icu.h +63 -0
- data/ext/uri_parser/url_canon_internal.cc +427 -0
- data/ext/uri_parser/url_canon_internal.h +453 -0
- data/ext/uri_parser/url_canon_internal_file.h +157 -0
- data/ext/uri_parser/url_canon_ip.cc +737 -0
- data/ext/uri_parser/url_canon_ip.h +101 -0
- data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
- data/ext/uri_parser/url_canon_path.cc +380 -0
- data/ext/uri_parser/url_canon_pathurl.cc +128 -0
- data/ext/uri_parser/url_canon_query.cc +189 -0
- data/ext/uri_parser/url_canon_relative.cc +572 -0
- data/ext/uri_parser/url_canon_stdstring.h +134 -0
- data/ext/uri_parser/url_canon_stdurl.cc +211 -0
- data/ext/uri_parser/url_common.h +48 -0
- data/ext/uri_parser/url_file.h +108 -0
- data/ext/uri_parser/url_parse.cc +760 -0
- data/ext/uri_parser/url_parse.h +336 -0
- data/ext/uri_parser/url_parse_file.cc +243 -0
- data/ext/uri_parser/url_parse_internal.h +112 -0
- data/ext/uri_parser/url_util.cc +553 -0
- data/ext/uri_parser/url_util.h +222 -0
- data/lib/uri_parser.rb +28 -0
- data/lib/uri_parser/version.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/uri_parser_spec.rb +54 -0
- data/uri_parser.gemspec +26 -0
- metadata +117 -0
@@ -0,0 +1,401 @@
|
|
1
|
+
// Copyright 2007, Google Inc.
|
2
|
+
// All rights reserved.
|
3
|
+
//
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
5
|
+
// modification, are permitted provided that the following conditions are
|
6
|
+
// met:
|
7
|
+
//
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
12
|
+
// in the documentation and/or other materials provided with the
|
13
|
+
// distribution.
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
16
|
+
// this software without specific prior written permission.
|
17
|
+
//
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
|
30
|
+
#include "logging.h"
|
31
|
+
#include "url_canon.h"
|
32
|
+
#include "url_canon_internal.h"
|
33
|
+
|
34
|
+
namespace url_canon {
|
35
|
+
|
36
|
+
namespace {
|
37
|
+
|
38
|
+
// For reference, here's what IE supports:
|
39
|
+
// Key: 0 (disallowed: failure if present in the input)
|
40
|
+
// + (allowed either escaped or unescaped, and unmodified)
|
41
|
+
// U (allowed escaped or unescaped but always unescaped if present in
|
42
|
+
// escaped form)
|
43
|
+
// E (allowed escaped or unescaped but always escaped if present in
|
44
|
+
// unescaped form)
|
45
|
+
// % (only allowed escaped in the input, will be unmodified).
|
46
|
+
// I left blank alpha numeric characters.
|
47
|
+
//
|
48
|
+
// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
49
|
+
// -----------------------------------------------
|
50
|
+
// 0 0 E E E E E E E E E E E E E E E
|
51
|
+
// 1 E E E E E E E E E E E E E E E E
|
52
|
+
// 2 E + E E + E + + + + + + + U U 0
|
53
|
+
// 3 % % E + E 0 <-- Those are : ; < = > ?
|
54
|
+
// 4 %
|
55
|
+
// 5 U 0 U U U <-- Those are [ \ ] ^ _
|
56
|
+
// 6 E <-- That's `
|
57
|
+
// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE)
|
58
|
+
//
|
59
|
+
// NOTE: I didn't actually test all the control characters. Some may be
|
60
|
+
// disallowed in the input, but they are all accepted escaped except for 0.
|
61
|
+
// I also didn't test if characters affecting HTML parsing are allowed
|
62
|
+
// unescaped, eg. (") or (#), which would indicate the beginning of the path.
|
63
|
+
// Surprisingly, space is accepted in the input and always escaped.
|
64
|
+
|
65
|
+
// This table lists the canonical version of all characters we allow in the
|
66
|
+
// input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar
|
67
|
+
// value to indicate that this character should be escaped. We are a little more
|
68
|
+
// restrictive than IE, but less restrictive than Firefox.
|
69
|
+
//
|
70
|
+
// Note that we disallow the % character. We will allow it when part of an
|
71
|
+
// escape sequence, of course, but this disallows "%25". Even though IE allows
|
72
|
+
// it, allowing it would put us in a funny state. If there was an invalid
|
73
|
+
// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
|
74
|
+
// Allowing percents means we'll succeed a second time, so validity would change
|
75
|
+
// based on how many times you run the canonicalizer. We prefer to always report
|
76
|
+
// the same validity, so reject this.
|
77
|
+
// Marker value stored in kHostCharLookup for characters that are accepted in
// the input but must be written percent-escaped in the output.
const unsigned char kEsc = 0xff;

// Per-character canonicalization table for 7-bit input, indexed by the
// character's value. Each entry is one of:
//   0     - the character is not allowed in a host name,
//   kEsc  - the character is allowed but is emitted percent-escaped,
//   other - the canonical (lower-cased) character to emit.
const unsigned char kHostCharLookup[0x80] = {
    // 0x00 - 0x0f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10 - 0x1f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . /
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // 0x40 - 0x4f: @ A B C D E F G H I J K L M N O
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50 - 0x5f: P Q R S T U V W X Y Z [ (backslash) ] ^ _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // 0x60 - 0x6f: ` a b c d e f g h i j k l m n o
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70 - 0x7f: p q r s t u v w x y z { | } ~ (DEL)
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0};
|
94
|
+
|
95
|
+
const int kTempHostBufferLen = 1024;
|
96
|
+
typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
|
97
|
+
typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
|
98
|
+
|
99
|
+
// Scans a host name and fills in the output flags according to what we find.
|
100
|
+
// |has_non_ascii| will be true if there are any non-7-bit characters, and
|
101
|
+
// |has_escaped| will be true if there is a percent sign.
|
102
|
+
template<typename CHAR, typename UCHAR>
|
103
|
+
void ScanHostname(const CHAR* spec, const url_parse::Component& host,
|
104
|
+
bool* has_non_ascii, bool* has_escaped) {
|
105
|
+
int end = host.end();
|
106
|
+
*has_non_ascii = false;
|
107
|
+
*has_escaped = false;
|
108
|
+
for (int i = host.begin; i < end; i++) {
|
109
|
+
if (static_cast<UCHAR>(spec[i]) >= 0x80)
|
110
|
+
*has_non_ascii = true;
|
111
|
+
else if (spec[i] == '%')
|
112
|
+
*has_escaped = true;
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
// Canonicalizes a host name that is entirely 8-bit characters (even though
|
117
|
+
// the type holding them may be 16 bits. Escaped characters will be unescaped.
|
118
|
+
// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
|
119
|
+
//
|
120
|
+
// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
|
121
|
+
// the output.
|
122
|
+
//
|
123
|
+
// This function is used in two situations:
|
124
|
+
//
|
125
|
+
// * When the caller knows there is no non-ASCII or percent escaped
|
126
|
+
// characters. This is what DoHost does. The result will be a completely
|
127
|
+
// canonicalized host since we know nothing weird can happen (escaped
|
128
|
+
// characters could be unescaped to non-7-bit, so they have to be treated
|
129
|
+
// with suspicion at this point). It does not use the |has_non_ascii| flag.
|
130
|
+
//
|
131
|
+
// * When the caller has an 8-bit string that may need unescaping.
|
132
|
+
// DoComplexHost calls us this situation to do unescaping and validation.
|
133
|
+
// After this, it may do other IDN operations depending on the value of the
|
134
|
+
// |*has_non_ascii| flag.
|
135
|
+
//
|
136
|
+
// The return value indicates if the output is a potentially valid host name.
|
137
|
+
template<typename INCHAR, typename OUTCHAR>
|
138
|
+
bool DoSimpleHost(const INCHAR* host,
|
139
|
+
int host_len,
|
140
|
+
CanonOutputT<OUTCHAR>* output,
|
141
|
+
bool* has_non_ascii) {
|
142
|
+
*has_non_ascii = false;
|
143
|
+
|
144
|
+
bool success = true;
|
145
|
+
for (int i = 0; i < host_len; ++i) {
|
146
|
+
unsigned int source = host[i];
|
147
|
+
if (source == '%') {
|
148
|
+
// Unescape first, if possible.
|
149
|
+
// Source will be used only if decode operation was successful.
|
150
|
+
if (!DecodeEscaped(host, &i, host_len,
|
151
|
+
reinterpret_cast<unsigned char*>(&source))) {
|
152
|
+
// Invalid escaped character. There is nothing that can make this
|
153
|
+
// host valid. We append an escaped percent so the URL looks reasonable
|
154
|
+
// and mark as failed.
|
155
|
+
AppendEscapedChar('%', output);
|
156
|
+
success = false;
|
157
|
+
continue;
|
158
|
+
}
|
159
|
+
}
|
160
|
+
|
161
|
+
if (source < 0x80) {
|
162
|
+
// We have ASCII input, we can use our lookup table.
|
163
|
+
unsigned char replacement = kHostCharLookup[source];
|
164
|
+
if (!replacement) {
|
165
|
+
// Invalid character, add it as percent-escaped and mark as failed.
|
166
|
+
AppendEscapedChar(source, output);
|
167
|
+
success = false;
|
168
|
+
} else if (replacement == kEsc) {
|
169
|
+
// This character is valid but should be escaped.
|
170
|
+
AppendEscapedChar(source, output);
|
171
|
+
} else {
|
172
|
+
// Common case, the given character is valid in a hostname, the lookup
|
173
|
+
// table tells us the canonical representation of that character (lower
|
174
|
+
// cased).
|
175
|
+
output->push_back(replacement);
|
176
|
+
}
|
177
|
+
} else {
|
178
|
+
// It's a non-ascii char. Just push it to the output.
|
179
|
+
// In case where we have char16 input, and char output it's safe to
|
180
|
+
// cast char16->char only if input string was converted to ASCII.
|
181
|
+
output->push_back(static_cast<OUTCHAR>(source));
|
182
|
+
*has_non_ascii = true;
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
186
|
+
return success;
|
187
|
+
}
|
188
|
+
|
189
|
+
// Canonicalizes a host that requires IDN conversion. Returns true on success
|
190
|
+
bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
|
191
|
+
// We need to escape URL before doing IDN conversion, since punicode strings
|
192
|
+
// cannot be escaped after they are created.
|
193
|
+
RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
|
194
|
+
bool has_non_ascii;
|
195
|
+
DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
|
196
|
+
|
197
|
+
StackBufferW wide_output;
|
198
|
+
if (!IDNToASCII(url_escaped_host.data(),
|
199
|
+
url_escaped_host.length(),
|
200
|
+
&wide_output)) {
|
201
|
+
// Some error, give up. This will write some reasonable looking
|
202
|
+
// representation of the string to the output.
|
203
|
+
AppendInvalidNarrowString(src, 0, src_len, output);
|
204
|
+
return false;
|
205
|
+
}
|
206
|
+
|
207
|
+
// Now we check the ASCII output like a normal host. It will also handle
|
208
|
+
// unescaping. Although we unescaped everything before this function call, if
|
209
|
+
// somebody does %00 as fullwidth, ICU will convert this to ASCII.
|
210
|
+
bool success = DoSimpleHost(wide_output.data(),
|
211
|
+
wide_output.length(),
|
212
|
+
output, &has_non_ascii);
|
213
|
+
DCHECK(!has_non_ascii);
|
214
|
+
return success;
|
215
|
+
}
|
216
|
+
|
217
|
+
// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
|
218
|
+
// UTF-16. The has_escaped flag should be set if the input string requires
|
219
|
+
// unescaping.
|
220
|
+
bool DoComplexHost(const char* host, int host_len,
|
221
|
+
bool has_non_ascii, bool has_escaped, CanonOutput* output) {
|
222
|
+
// Save the current position in the output. We may write stuff and rewind it
|
223
|
+
// below, so we need to know where to rewind to.
|
224
|
+
int begin_length = output->length();
|
225
|
+
|
226
|
+
// Points to the UTF-8 data we want to convert. This will either be the
|
227
|
+
// input or the unescaped version written to |*output| if necessary.
|
228
|
+
const char* utf8_source;
|
229
|
+
int utf8_source_len;
|
230
|
+
if (has_escaped) {
|
231
|
+
// Unescape before converting to UTF-16 for IDN. We write this into the
|
232
|
+
// output because it most likely does not require IDNization, and we can
|
233
|
+
// save another huge stack buffer. It will be replaced below if it requires
|
234
|
+
// IDN. This will also update our non-ASCII flag so we know whether the
|
235
|
+
// unescaped input requires IDN.
|
236
|
+
if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
|
237
|
+
// Error with some escape sequence. We'll call the current output
|
238
|
+
// complete. DoSimpleHost will have written some "reasonable" output.
|
239
|
+
return false;
|
240
|
+
}
|
241
|
+
|
242
|
+
// Unescaping may have left us with ASCII input, in which case the
|
243
|
+
// unescaped version we wrote to output is complete.
|
244
|
+
if (!has_non_ascii) {
|
245
|
+
return true;
|
246
|
+
}
|
247
|
+
|
248
|
+
// Save the pointer into the data was just converted (it may be appended to
|
249
|
+
// other data in the output buffer).
|
250
|
+
utf8_source = &output->data()[begin_length];
|
251
|
+
utf8_source_len = output->length() - begin_length;
|
252
|
+
} else {
|
253
|
+
// We don't need to unescape, use input for IDNization later. (We know the
|
254
|
+
// input has non-ASCII, or the simple version would have been called
|
255
|
+
// instead of us.)
|
256
|
+
utf8_source = host;
|
257
|
+
utf8_source_len = host_len;
|
258
|
+
}
|
259
|
+
|
260
|
+
// Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
|
261
|
+
// Above, we may have used the output to write the unescaped values to, so
|
262
|
+
// we have to rewind it to where we started after we convert it to UTF-16.
|
263
|
+
StackBufferW utf16;
|
264
|
+
if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
|
265
|
+
// In this error case, the input may or may not be the output.
|
266
|
+
StackBuffer utf8;
|
267
|
+
for (int i = 0; i < utf8_source_len; i++)
|
268
|
+
utf8.push_back(utf8_source[i]);
|
269
|
+
output->set_length(begin_length);
|
270
|
+
AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
|
271
|
+
return false;
|
272
|
+
}
|
273
|
+
output->set_length(begin_length);
|
274
|
+
|
275
|
+
// This will call DoSimpleHost which will do normal ASCII canonicalization
|
276
|
+
// and also check for IP addresses in the outpt.
|
277
|
+
return DoIDNHost(utf16.data(), utf16.length(), output);
|
278
|
+
}
|
279
|
+
|
280
|
+
// UTF-16 convert host to its ASCII version. The set up is already ready for
|
281
|
+
// the backend, so we just pass through. The has_escaped flag should be set if
|
282
|
+
// the input string requires unescaping.
|
283
|
+
bool DoComplexHost(const char16* host, int host_len,
|
284
|
+
bool has_non_ascii, bool has_escaped, CanonOutput* output) {
|
285
|
+
if (has_escaped) {
|
286
|
+
// Yikes, we have escaped characters with wide input. The escaped
|
287
|
+
// characters should be interpreted as UTF-8. To solve this problem,
|
288
|
+
// we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
|
289
|
+
//
|
290
|
+
// We don't bother to optimize the conversion in the ASCII case (which
|
291
|
+
// *could* just be a copy) and use the UTF-8 path, because it should be
|
292
|
+
// very rare that host names have escaped characters, and it is relatively
|
293
|
+
// fast to do the conversion anyway.
|
294
|
+
StackBuffer utf8;
|
295
|
+
if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
|
296
|
+
AppendInvalidNarrowString(host, 0, host_len, output);
|
297
|
+
return false;
|
298
|
+
}
|
299
|
+
|
300
|
+
// Once we convert to UTF-8, we can use the 8-bit version of the complex
|
301
|
+
// host handling code above.
|
302
|
+
return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
|
303
|
+
has_escaped, output);
|
304
|
+
}
|
305
|
+
|
306
|
+
// No unescaping necessary, we can safely pass the input to ICU. This
|
307
|
+
// function will only get called if we either have escaped or non-ascii
|
308
|
+
// input, so it's safe to just use ICU now. Even if the input is ASCII,
|
309
|
+
// this function will do the right thing (just slower than we could).
|
310
|
+
return DoIDNHost(host, host_len, output);
|
311
|
+
}
|
312
|
+
|
313
|
+
template<typename CHAR, typename UCHAR>
|
314
|
+
void DoHost(const CHAR* spec,
|
315
|
+
const url_parse::Component& host,
|
316
|
+
CanonOutput* output,
|
317
|
+
CanonHostInfo* host_info) {
|
318
|
+
if (host.len <= 0) {
|
319
|
+
// Empty hosts don't need anything.
|
320
|
+
host_info->family = CanonHostInfo::NEUTRAL;
|
321
|
+
host_info->out_host = url_parse::Component();
|
322
|
+
return;
|
323
|
+
}
|
324
|
+
|
325
|
+
bool has_non_ascii, has_escaped;
|
326
|
+
ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
|
327
|
+
|
328
|
+
// Keep track of output's initial length, so we can rewind later.
|
329
|
+
const int output_begin = output->length();
|
330
|
+
|
331
|
+
bool success;
|
332
|
+
if (!has_non_ascii && !has_escaped) {
|
333
|
+
success = DoSimpleHost(&spec[host.begin], host.len,
|
334
|
+
output, &has_non_ascii);
|
335
|
+
DCHECK(!has_non_ascii);
|
336
|
+
} else {
|
337
|
+
success = DoComplexHost(&spec[host.begin], host.len,
|
338
|
+
has_non_ascii, has_escaped, output);
|
339
|
+
}
|
340
|
+
|
341
|
+
if (!success) {
|
342
|
+
// Canonicalization failed. Set BROKEN to notify the caller.
|
343
|
+
host_info->family = CanonHostInfo::BROKEN;
|
344
|
+
} else {
|
345
|
+
// After all the other canonicalization, check if we ended up with an IP
|
346
|
+
// address. IP addresses are small, so writing into this temporary buffer
|
347
|
+
// should not cause an allocation.
|
348
|
+
RawCanonOutput<64> canon_ip;
|
349
|
+
CanonicalizeIPAddress(output->data(),
|
350
|
+
url_parse::MakeRange(output_begin, output->length()),
|
351
|
+
&canon_ip, host_info);
|
352
|
+
|
353
|
+
// If we got an IPv4/IPv6 address, copy the canonical form back to the
|
354
|
+
// real buffer. Otherwise, it's a hostname or broken IP, in which case
|
355
|
+
// we just leave it in place.
|
356
|
+
if (host_info->IsIPAddress()) {
|
357
|
+
output->set_length(output_begin);
|
358
|
+
output->Append(canon_ip.data(), canon_ip.length());
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
host_info->out_host = url_parse::MakeRange(output_begin, output->length());
|
363
|
+
}
|
364
|
+
|
365
|
+
} // namespace
|
366
|
+
|
367
|
+
bool CanonicalizeHost(const char* spec,
|
368
|
+
const url_parse::Component& host,
|
369
|
+
CanonOutput* output,
|
370
|
+
url_parse::Component* out_host) {
|
371
|
+
CanonHostInfo host_info;
|
372
|
+
DoHost<char, unsigned char>(spec, host, output, &host_info);
|
373
|
+
*out_host = host_info.out_host;
|
374
|
+
return (host_info.family != CanonHostInfo::BROKEN);
|
375
|
+
}
|
376
|
+
|
377
|
+
bool CanonicalizeHost(const char16* spec,
|
378
|
+
const url_parse::Component& host,
|
379
|
+
CanonOutput* output,
|
380
|
+
url_parse::Component* out_host) {
|
381
|
+
CanonHostInfo host_info;
|
382
|
+
DoHost<char16, char16>(spec, host, output, &host_info);
|
383
|
+
*out_host = host_info.out_host;
|
384
|
+
return (host_info.family != CanonHostInfo::BROKEN);
|
385
|
+
}
|
386
|
+
|
387
|
+
void CanonicalizeHostVerbose(const char* spec,
|
388
|
+
const url_parse::Component& host,
|
389
|
+
CanonOutput* output,
|
390
|
+
CanonHostInfo *host_info) {
|
391
|
+
DoHost<char, unsigned char>(spec, host, output, host_info);
|
392
|
+
}
|
393
|
+
|
394
|
+
void CanonicalizeHostVerbose(const char16* spec,
|
395
|
+
const url_parse::Component& host,
|
396
|
+
CanonOutput* output,
|
397
|
+
CanonHostInfo *host_info) {
|
398
|
+
DoHost<char16, char16>(spec, host, output, host_info);
|
399
|
+
}
|
400
|
+
|
401
|
+
} // namespace url_canon
|
@@ -0,0 +1,207 @@
|
|
1
|
+
// Copyright 2007, Google Inc.
|
2
|
+
// All rights reserved.
|
3
|
+
//
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
5
|
+
// modification, are permitted provided that the following conditions are
|
6
|
+
// met:
|
7
|
+
//
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
12
|
+
// in the documentation and/or other materials provided with the
|
13
|
+
// distribution.
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
16
|
+
// this software without specific prior written permission.
|
17
|
+
//
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
|
30
|
+
// ICU integration functions.
|
31
|
+
|
32
|
+
#include <stdlib.h>
|
33
|
+
#include <string.h>
|
34
|
+
#include <unicode/ucnv.h>
|
35
|
+
#include <unicode/ucnv_cb.h>
|
36
|
+
#include <unicode/uidna.h>
|
37
|
+
|
38
|
+
#include "url_canon_icu.h"
|
39
|
+
#include "url_canon_internal.h" // for _itoa_s
|
40
|
+
|
41
|
+
#include "logging.h"
|
42
|
+
|
43
|
+
namespace url_canon {
|
44
|
+
|
45
|
+
namespace {
|
46
|
+
|
47
|
+
// Called when converting a character that can not be represented, this will
|
48
|
+
// append an escaped version of the numerical character reference for that code
|
49
|
+
// point. It is of the form "Ӓ" and we will escape the non-digits to
|
50
|
+
// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
|
51
|
+
void appendURLEscapedChar(const void* context,
|
52
|
+
UConverterFromUnicodeArgs* from_args,
|
53
|
+
const UChar* code_units,
|
54
|
+
int32_t length,
|
55
|
+
UChar32 code_point,
|
56
|
+
UConverterCallbackReason reason,
|
57
|
+
UErrorCode* err) {
|
58
|
+
if (reason == UCNV_UNASSIGNED) {
|
59
|
+
*err = U_ZERO_ERROR;
|
60
|
+
|
61
|
+
const static int prefix_len = 6;
|
62
|
+
const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped
|
63
|
+
ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
|
64
|
+
|
65
|
+
DCHECK(code_point < 0x110000);
|
66
|
+
char number[8]; // Max Unicode code point is 7 digits.
|
67
|
+
_itoa_s(code_point, number, 10);
|
68
|
+
int number_len = static_cast<int>(strlen(number));
|
69
|
+
ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
|
70
|
+
|
71
|
+
const static int postfix_len = 3;
|
72
|
+
const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped
|
73
|
+
ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
// A class for scoping the installation of the invalid character callback.
|
78
|
+
class AppendHandlerInstaller {
|
79
|
+
public:
|
80
|
+
// The owner of this object must ensure that the converter is alive for the
|
81
|
+
// duration of this object's lifetime.
|
82
|
+
AppendHandlerInstaller(UConverter* converter) : converter_(converter) {
|
83
|
+
UErrorCode err = U_ZERO_ERROR;
|
84
|
+
ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
|
85
|
+
&old_callback_, &old_context_, &err);
|
86
|
+
}
|
87
|
+
|
88
|
+
~AppendHandlerInstaller() {
|
89
|
+
UErrorCode err = U_ZERO_ERROR;
|
90
|
+
ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
|
91
|
+
}
|
92
|
+
|
93
|
+
private:
|
94
|
+
UConverter* converter_;
|
95
|
+
|
96
|
+
UConverterFromUCallback old_callback_;
|
97
|
+
const void* old_context_;
|
98
|
+
};
|
99
|
+
|
100
|
+
} // namespace
|
101
|
+
|
102
|
+
// Stores the converter pointer for later use by ConvertFromUTF16.
ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
    : converter_(converter) {}
|
105
|
+
|
106
|
+
void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
|
107
|
+
int input_len,
|
108
|
+
CanonOutput* output) {
|
109
|
+
// Install our error handler. It will be called for character that can not
|
110
|
+
// be represented in the destination character set.
|
111
|
+
AppendHandlerInstaller handler(converter_);
|
112
|
+
|
113
|
+
int begin_offset = output->length();
|
114
|
+
int dest_capacity = output->capacity() - begin_offset;
|
115
|
+
output->set_length(output->length());
|
116
|
+
|
117
|
+
do {
|
118
|
+
UErrorCode err = U_ZERO_ERROR;
|
119
|
+
char* dest = &output->data()[begin_offset];
|
120
|
+
int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
|
121
|
+
input, input_len, &err);
|
122
|
+
if (err != U_BUFFER_OVERFLOW_ERROR) {
|
123
|
+
output->set_length(begin_offset + required_capacity);
|
124
|
+
return;
|
125
|
+
}
|
126
|
+
|
127
|
+
// Output didn't fit, expand
|
128
|
+
dest_capacity = required_capacity;
|
129
|
+
output->Resize(begin_offset + dest_capacity);
|
130
|
+
} while (true);
|
131
|
+
}
|
132
|
+
|
133
|
+
// Converts the Unicode input representing a hostname to ASCII using IDN rules.
|
134
|
+
// The output must be ASCII, but is represented as wide characters.
|
135
|
+
//
|
136
|
+
// On success, the output will be filled with the ASCII host name and it will
|
137
|
+
// return true. Unlike most other canonicalization functions, this assumes that
|
138
|
+
// the output is empty. The beginning of the host will be at offset 0, and
|
139
|
+
// the length of the output will be set to the length of the new host name.
|
140
|
+
//
|
141
|
+
// On error, this will return false. The output in this case is undefined.
|
142
|
+
bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
|
143
|
+
DCHECK(output->length() == 0); // Output buffer is assumed empty.
|
144
|
+
while (true) {
|
145
|
+
// Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
|
146
|
+
// the spec (which do exist). This does not present any risk and is a
|
147
|
+
// little more future proof.
|
148
|
+
UErrorCode err = U_ZERO_ERROR;
|
149
|
+
int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
|
150
|
+
output->capacity(),
|
151
|
+
UIDNA_ALLOW_UNASSIGNED, NULL, &err);
|
152
|
+
if (err == U_ZERO_ERROR) {
|
153
|
+
output->set_length(num_converted);
|
154
|
+
return true;
|
155
|
+
}
|
156
|
+
if (err != U_BUFFER_OVERFLOW_ERROR)
|
157
|
+
return false; // Unknown error, give up.
|
158
|
+
|
159
|
+
// Not enough room in our buffer, expand.
|
160
|
+
output->Resize(output->capacity() * 2);
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
bool ReadUTFChar(const char* str, int* begin, int length,
|
165
|
+
unsigned* code_point_out) {
|
166
|
+
int code_point; // Avoids warning when U8_NEXT writes -1 to it.
|
167
|
+
U8_NEXT(str, *begin, length, code_point);
|
168
|
+
*code_point_out = static_cast<unsigned>(code_point);
|
169
|
+
|
170
|
+
// The ICU macro above moves to the next char, we want to point to the last
|
171
|
+
// char consumed.
|
172
|
+
(*begin)--;
|
173
|
+
|
174
|
+
// Validate the decoded value.
|
175
|
+
if (U_IS_UNICODE_CHAR(code_point))
|
176
|
+
return true;
|
177
|
+
*code_point_out = kUnicodeReplacementCharacter;
|
178
|
+
return false;
|
179
|
+
}
|
180
|
+
|
181
|
+
bool ReadUTFChar(const char16* str, int* begin, int length,
|
182
|
+
unsigned* code_point) {
|
183
|
+
if (U16_IS_SURROGATE(str[*begin])) {
|
184
|
+
if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
|
185
|
+
!U16_IS_TRAIL(str[*begin + 1])) {
|
186
|
+
// Invalid surrogate pair.
|
187
|
+
*code_point = kUnicodeReplacementCharacter;
|
188
|
+
return false;
|
189
|
+
} else {
|
190
|
+
// Valid surrogate pair.
|
191
|
+
*code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
|
192
|
+
(*begin)++;
|
193
|
+
}
|
194
|
+
} else {
|
195
|
+
// Not a surrogate, just one 16-bit word.
|
196
|
+
*code_point = str[*begin];
|
197
|
+
}
|
198
|
+
|
199
|
+
if (U_IS_UNICODE_CHAR(*code_point))
|
200
|
+
return true;
|
201
|
+
|
202
|
+
// Invalid code point.
|
203
|
+
*code_point = kUnicodeReplacementCharacter;
|
204
|
+
return false;
|
205
|
+
}
|
206
|
+
|
207
|
+
} // namespace url_canon
|