uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,401 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "logging.h"
31
+ #include "url_canon.h"
32
+ #include "url_canon_internal.h"
33
+
34
+ namespace url_canon {
35
+
36
+ namespace {
37
+
38
+ // For reference, here's what IE supports:
39
+ // Key: 0 (disallowed: failure if present in the input)
40
+ // + (allowed either escaped or unescaped, and unmodified)
41
+ // U (allowed escaped or unescaped but always unescaped if present in
42
+ // escaped form)
43
+ // E (allowed escaped or unescaped but always escaped if present in
44
+ // unescaped form)
45
+ // % (only allowed escaped in the input, will be unmodified).
46
+ // Alphanumeric characters are left blank.
47
+ //
48
+ // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
49
+ // -----------------------------------------------
50
+ // 0 0 E E E E E E E E E E E E E E E
51
+ // 1 E E E E E E E E E E E E E E E E
52
+ // 2 E + E E + E + + + + + + + U U 0
53
+ // 3 % % E + E 0 <-- Those are : ; < = > ?
54
+ // 4 %
55
+ // 5 U 0 U U U <-- Those are [ \ ] ^ _
56
+ // 6 E <-- That's `
57
+ // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE)
58
+ //
59
+ // NOTE: I didn't actually test all the control characters. Some may be
60
+ // disallowed in the input, but they are all accepted escaped except for 0.
61
+ // I also didn't test if characters affecting HTML parsing are allowed
62
+ // unescaped, eg. (") or (#), which would indicate the beginning of the path.
63
+ // Surprisingly, space is accepted in the input and always escaped.
64
+
65
+ // This table lists the canonical version of all characters we allow in the
66
+ // input, with 0 indicating it is disallowed. We use the magic kEsc value
67
+ // to indicate that this character should be escaped. We are a little more
68
+ // restrictive than IE, but less restrictive than Firefox.
69
+ //
70
+ // Note that we disallow the % character. We will allow it when part of an
71
+ // escape sequence, of course, but this disallows "%25". Even though IE allows
72
+ // it, allowing it would put us in a funny state. If there was an invalid
73
+ // escape sequence like "%zz", we'll add "%25zz" to the output and fail.
74
+ // Allowing percents means we'll succeed a second time, so validity would change
75
+ // based on how many times you run the canonicalizer. We prefer to always report
76
+ // the same validity, so reject this.
77
+ const unsigned char kEsc = 0xff;
78
+ const unsigned char kHostCharLookup[0x80] = {
79
+ // 00-1f: control characters, all are invalid
80
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
81
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82
+ // ' ' ! " # $ % & ' ( ) * + , - . /
83
+ kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,
84
+ // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
85
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,
86
+ // @ A B C D E F G H I J K L M N O (upper case maps to lower case)
87
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
88
+ // P Q R S T U V W X Y Z [ \ ] ^ _
89
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
90
+ // ` a b c d e f g h i j k l m n o
91
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
92
+ // p q r s t u v w x y z { | } ~ DEL
93
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };
94
+
95
+ const int kTempHostBufferLen = 1024;  // Size argument shared by the temporary host buffer types below.
96
+ typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;  // Scratch buffer of char.
97
+ typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;  // Scratch buffer of char16.
98
+
99
+ // Scans the |host| component of |spec| and reports what it finds through the
100
+ // two out-params: |*has_non_ascii| is set if any character is >= 0x80, and
101
+ // |*has_escaped| is set if there is a percent sign. Both start out false.
102
+ template<typename CHAR, typename UCHAR>
103
+ void ScanHostname(const CHAR* spec, const url_parse::Component& host,
104
+ bool* has_non_ascii, bool* has_escaped) {
105
+ int end = host.end();
106
+ *has_non_ascii = false;
107
+ *has_escaped = false;
108
+ for (int i = host.begin; i < end; i++) {
109
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)  // Compare as UCHAR (unsigned char for char input) so the 0x80 test works.
110
+ *has_non_ascii = true;
111
+ else if (spec[i] == '%')
112
+ *has_escaped = true;
113
+ }
114
+ }
115
+
116
+ // Canonicalizes a host name that is entirely 8-bit characters (even though
117
+ // the type holding them may be 16 bits). Escaped characters will be unescaped.
118
+ // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
119
+ //
120
+ // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
121
+ // the output.
122
+ //
123
+ // This function is used in two situations:
124
+ //
125
+ // * When the caller knows there is no non-ASCII or percent escaped
126
+ // characters. This is what DoHost does. The result will be a completely
127
+ // canonicalized host since we know nothing weird can happen (escaped
128
+ // characters could be unescaped to non-7-bit, so they have to be treated
129
+ // with suspicion at this point). It does not use the |has_non_ascii| flag.
130
+ //
131
+ // * When the caller has an 8-bit string that may need unescaping.
132
+ // DoComplexHost calls us in this situation to do unescaping and validation.
133
+ // After this, it may do other IDN operations depending on the value of the
134
+ // |*has_non_ascii| flag.
135
+ //
136
+ // The return value indicates if the output is a potentially valid host name.
137
+ template<typename INCHAR, typename OUTCHAR>
138
+ bool DoSimpleHost(const INCHAR* host,
139
+ int host_len,
140
+ CanonOutputT<OUTCHAR>* output,
141
+ bool* has_non_ascii) {
142
+ *has_non_ascii = false;
143
+
144
+ bool success = true;
145
+ for (int i = 0; i < host_len; ++i) {
146
+ unsigned int source = host[i];
147
+ if (source == '%') {
148
+ // Unescape first, if possible. Decode into a genuine byte: writing through
149
+ // a cast of &source would hit the wrong byte of the int on big-endian.
150
+ unsigned char unescaped = 0;
151
+ if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
152
+ // Invalid escaped character. Nothing can make this host valid: append
153
+ // an escaped percent so the URL looks reasonable and mark as failed.
154
+ AppendEscapedChar('%', output);
155
+ success = false;
156
+ continue;
157
+ }
158
+ source = unescaped;  // Only reached when the decode succeeded.
159
+ }
160
+
161
+ if (source < 0x80) {
162
+ // We have ASCII input, we can use our lookup table.
163
+ unsigned char replacement = kHostCharLookup[source];
164
+ if (!replacement) {
165
+ // Invalid character, add it as percent-escaped and mark as failed.
166
+ AppendEscapedChar(source, output);
167
+ success = false;
168
+ } else if (replacement == kEsc) {
169
+ // This character is valid but should be escaped.
170
+ AppendEscapedChar(source, output);
171
+ } else {
172
+ // Common case, the given character is valid in a hostname, the lookup
173
+ // table tells us the canonical representation of that character (lower
174
+ // cased).
175
+ output->push_back(replacement);
176
+ }
177
+ } else {
178
+ // It's a non-ascii char. Just push it to the output.
179
+ // In case where we have char16 input, and char output it's safe to
180
+ // cast char16->char only if input string was converted to ASCII.
181
+ output->push_back(static_cast<OUTCHAR>(source));
182
+ *has_non_ascii = true;
183
+ }
184
+ }
185
+
186
+ return success;
187
+ }
188
+
189
+ // Canonicalizes a host that requires IDN conversion, writing to |output|. Returns true on success.
190
+ bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
191
+ // We need to escape the host before doing IDN conversion, since punycode
192
+ // strings cannot be escaped after they are created.
193
+ RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
194
+ bool has_non_ascii;
195
+ DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);  // NOTE(review): return value is ignored here; confirm that is intended.
196
+
197
+ StackBufferW wide_output;
198
+ if (!IDNToASCII(url_escaped_host.data(),
199
+ url_escaped_host.length(),
200
+ &wide_output)) {
201
+ // Some error, give up. This will write some reasonable looking
202
+ // representation of the original |src| string to the output.
203
+ AppendInvalidNarrowString(src, 0, src_len, output);
204
+ return false;
205
+ }
206
+
207
+ // Now we check the ASCII output like a normal host. It will also handle
208
+ // unescaping. Although we unescaped everything before this function call, if
209
+ // somebody does %00 as fullwidth, ICU will convert this to ASCII.
210
+ bool success = DoSimpleHost(wide_output.data(),
211
+ wide_output.length(),
212
+ output, &has_non_ascii);
213
+ DCHECK(!has_non_ascii);
214
+ return success;
215
+ }
216
+
217
+ // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
218
+ // UTF-16. The has_escaped flag should be set if the input string requires
219
+ // unescaping. Returns false if unescaping, UTF-8 decoding or IDN fails.
220
+ bool DoComplexHost(const char* host, int host_len,
221
+ bool has_non_ascii, bool has_escaped, CanonOutput* output) {
222
+ // Save the current position in the output. We may write stuff and rewind it
223
+ // below, so we need to know where to rewind to.
224
+ int begin_length = output->length();
225
+
226
+ // Points to the UTF-8 data we want to convert. This will either be the
227
+ // input or the unescaped version written to |*output| if necessary.
228
+ const char* utf8_source;
229
+ int utf8_source_len;
230
+ if (has_escaped) {
231
+ // Unescape before converting to UTF-16 for IDN. We write this into the
232
+ // output because it most likely does not require IDNization, and we can
233
+ // save another huge stack buffer. It will be replaced below if it requires
234
+ // IDN. This will also update our non-ASCII flag so we know whether the
235
+ // unescaped input requires IDN.
236
+ if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
237
+ // Error with some escape sequence. We'll call the current output
238
+ // complete. DoSimpleHost will have written some "reasonable" output.
239
+ return false;
240
+ }
241
+
242
+ // Unescaping may have left us with ASCII input, in which case the
243
+ // unescaped version we wrote to output is complete.
244
+ if (!has_non_ascii) {
245
+ return true;
246
+ }
247
+
248
+ // Save a pointer to the data that was just converted (it may be appended
249
+ // to other data in the output buffer).
250
+ utf8_source = &output->data()[begin_length];
251
+ utf8_source_len = output->length() - begin_length;
252
+ } else {
253
+ // We don't need to unescape, use input for IDNization later. (We know the
254
+ // input has non-ASCII, or the simple version would have been called
255
+ // instead of us.)
256
+ utf8_source = host;
257
+ utf8_source_len = host_len;
258
+ }
259
+
260
+ // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
261
+ // Above, we may have used the output to write the unescaped values to, so
262
+ // we have to rewind it to where we started after we convert it to UTF-16.
263
+ StackBufferW utf16;
264
+ if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
265
+ // In this error case, the input may or may not be the output, so copy it out before rewinding.
266
+ StackBuffer utf8;
267
+ for (int i = 0; i < utf8_source_len; i++)
268
+ utf8.push_back(utf8_source[i]);
269
+ output->set_length(begin_length);
270
+ AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
271
+ return false;
272
+ }
273
+ output->set_length(begin_length);
274
+
275
+ // This will call DoSimpleHost which will do normal ASCII canonicalization
276
+ // and also check for IP addresses in the output.
277
+ return DoIDNHost(utf16.data(), utf16.length(), output);
278
+ }
279
+
280
+ // UTF-16 convert host to its ASCII version. The setup is already ready for
281
+ // the backend, so we just pass through. The has_escaped flag should be set if
282
+ // the input string requires unescaping. Returns false on conversion failure.
283
+ bool DoComplexHost(const char16* host, int host_len,
284
+ bool has_non_ascii, bool has_escaped, CanonOutput* output) {
285
+ if (has_escaped) {
286
+ // Yikes, we have escaped characters with wide input. The escaped
287
+ // characters should be interpreted as UTF-8. To solve this problem,
288
+ // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
289
+ //
290
+ // We don't bother to optimize the conversion in the ASCII case (which
291
+ // *could* just be a copy) and use the UTF-8 path, because it should be
292
+ // very rare that host names have escaped characters, and it is relatively
293
+ // fast to do the conversion anyway.
294
+ StackBuffer utf8;
295
+ if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
296
+ AppendInvalidNarrowString(host, 0, host_len, output);  // Emit the unconvertible input via the invalid-string helper.
297
+ return false;
298
+ }
299
+
300
+ // Once we convert to UTF-8, we can use the 8-bit version of the complex
301
+ // host handling code above.
302
+ return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
303
+ has_escaped, output);
304
+ }
305
+
306
+ // No unescaping necessary, we can safely pass the input to ICU. This
307
+ // function will only get called if we either have escaped or non-ascii
308
+ // input, so it's safe to just use ICU now. Even if the input is ASCII,
309
+ // this function will do the right thing (just slower than we could).
310
+ return DoIDNHost(host, host_len, output);
311
+ }
312
+
313
+ template<typename CHAR, typename UCHAR>  // Canonicalizes |host| of |spec| into |output| and fills in |*host_info|.
314
+ void DoHost(const CHAR* spec,
315
+ const url_parse::Component& host,
316
+ CanonOutput* output,
317
+ CanonHostInfo* host_info) {
318
+ if (host.len <= 0) {
319
+ // Empty hosts don't need anything: report NEUTRAL with a default Component.
320
+ host_info->family = CanonHostInfo::NEUTRAL;
321
+ host_info->out_host = url_parse::Component();
322
+ return;
323
+ }
324
+
325
+ bool has_non_ascii, has_escaped;
326
+ ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
327
+
328
+ // Keep track of output's initial length, so we can rewind later.
329
+ const int output_begin = output->length();
330
+
331
+ bool success;
332
+ if (!has_non_ascii && !has_escaped) {
333
+ success = DoSimpleHost(&spec[host.begin], host.len,
334
+ output, &has_non_ascii);
335
+ DCHECK(!has_non_ascii);
336
+ } else {
337
+ success = DoComplexHost(&spec[host.begin], host.len,
338
+ has_non_ascii, has_escaped, output);
339
+ }
340
+
341
+ if (!success) {
342
+ // Canonicalization failed. Set BROKEN to notify the caller.
343
+ host_info->family = CanonHostInfo::BROKEN;
344
+ } else {
345
+ // After all the other canonicalization, check if we ended up with an IP
346
+ // address. IP addresses are small, so writing into this temporary buffer
347
+ // should not cause an allocation.
348
+ RawCanonOutput<64> canon_ip;
349
+ CanonicalizeIPAddress(output->data(),
350
+ url_parse::MakeRange(output_begin, output->length()),
351
+ &canon_ip, host_info);
352
+
353
+ // If we got an IPv4/IPv6 address, copy the canonical form back to the
354
+ // real buffer. Otherwise, it's a hostname or broken IP, in which case
355
+ // we just leave it in place.
356
+ if (host_info->IsIPAddress()) {
357
+ output->set_length(output_begin);
358
+ output->Append(canon_ip.data(), canon_ip.length());
359
+ }
360
+ }
361
+
362
+ host_info->out_host = url_parse::MakeRange(output_begin, output->length());  // Range of what this call wrote to |output|.
363
+ }
364
+
365
+ } // namespace
366
+
367
+ bool CanonicalizeHost(const char* spec,
368
+ const url_parse::Component& host,
369
+ CanonOutput* output,
370
+ url_parse::Component* out_host) {  // 8-bit entry point: canonicalizes |host| via DoHost.
371
+ CanonHostInfo host_info;
372
+ DoHost<char, unsigned char>(spec, host, output, &host_info);
373
+ *out_host = host_info.out_host;
374
+ return (host_info.family != CanonHostInfo::BROKEN);  // Anything other than BROKEN is reported as success.
375
+ }
376
+
377
+ bool CanonicalizeHost(const char16* spec,
378
+ const url_parse::Component& host,
379
+ CanonOutput* output,
380
+ url_parse::Component* out_host) {  // char16 entry point: canonicalizes |host| via DoHost.
381
+ CanonHostInfo host_info;
382
+ DoHost<char16, char16>(spec, host, output, &host_info);
383
+ *out_host = host_info.out_host;
384
+ return (host_info.family != CanonHostInfo::BROKEN);  // Anything other than BROKEN is reported as success.
385
+ }
386
+
387
+ void CanonicalizeHostVerbose(const char* spec,
388
+ const url_parse::Component& host,
389
+ CanonOutput* output,
390
+ CanonHostInfo *host_info) {  // 8-bit variant that hands the full CanonHostInfo back to the caller.
391
+ DoHost<char, unsigned char>(spec, host, output, host_info);
392
+ }
393
+
394
+ void CanonicalizeHostVerbose(const char16* spec,
395
+ const url_parse::Component& host,
396
+ CanonOutput* output,
397
+ CanonHostInfo *host_info) {  // char16 variant that hands the full CanonHostInfo back to the caller.
398
+ DoHost<char16, char16>(spec, host, output, host_info);
399
+ }
400
+
401
+ } // namespace url_canon
@@ -0,0 +1,207 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // ICU integration functions.
31
+
32
+ #include <stdlib.h>
33
+ #include <string.h>
34
+ #include <unicode/ucnv.h>
35
+ #include <unicode/ucnv_cb.h>
36
+ #include <unicode/uidna.h>
37
+
38
+ #include "url_canon_icu.h"
39
+ #include "url_canon_internal.h" // for _itoa_s
40
+
41
+ #include "logging.h"
42
+
43
+ namespace url_canon {
44
+
45
+ namespace {
46
+
47
+ // Called when converting a character that can not be represented, this will
48
+ // append an escaped version of the numerical character reference for that code
49
+ // point. It is of the form "&#1234;" and we will escape the non-digits to
50
+ // "%26%231234%3B". Why? This is what Netscape did back in the olden days.
51
+ void appendURLEscapedChar(const void* context,
52
+ UConverterFromUnicodeArgs* from_args,
53
+ const UChar* code_units,
54
+ int32_t length,
55
+ UChar32 code_point,
56
+ UConverterCallbackReason reason,
57
+ UErrorCode* err) {
58
+ if (reason == UCNV_UNASSIGNED) {  // Any other reason is left untouched, including |*err|.
59
+ *err = U_ZERO_ERROR;
60
+
61
+ const static int prefix_len = 6;
62
+ const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped
63
+ ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
64
+
65
+ DCHECK(code_point < 0x110000);
66
+ char number[8]; // Max code point 0x10FFFF is 7 decimal digits, plus the NUL.
67
+ _itoa_s(code_point, number, 10);
68
+ int number_len = static_cast<int>(strlen(number));
69
+ ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
70
+
71
+ const static int postfix_len = 3;
72
+ const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped
73
+ ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
74
+ }
75
+ }
76
+
77
+ // Scopes the installation of the invalid character callback on a converter.
78
+ class AppendHandlerInstaller {
79
+ public:
80
+ // Installs appendURLEscapedChar on |converter|, saving the previous callback.
81
+ // The owner must keep the converter alive for this object's lifetime.
82
+ explicit AppendHandlerInstaller(UConverter* converter) : converter_(converter) {
83
+ UErrorCode err = U_ZERO_ERROR;
84
+ ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
85
+ &old_callback_, &old_context_, &err);
86
+ }
87
+
88
+ ~AppendHandlerInstaller() {  // Puts back the callback and context saved by the constructor.
89
+ UErrorCode err = U_ZERO_ERROR;
90
+ ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
91
+ }
92
+
93
+ private:
94
+ UConverter* converter_;
95
+
96
+ UConverterFromUCallback old_callback_;
97
+ const void* old_context_;
98
+ };
99
+
100
+ } // namespace
101
+
102
+ ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
103
+ : converter_(converter) {  // Only stores the pointer. NOTE(review): ownership is not visible here.
104
+ }
105
+
106
+ void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
107
+ int input_len,
108
+ CanonOutput* output) {
109
+ // Install our error handler for this call. It will be called for characters
110
+ // that can not be represented in the destination character set.
111
+ AppendHandlerInstaller handler(converter_);
112
+
113
+ int begin_offset = output->length();
114
+ int dest_capacity = output->capacity() - begin_offset;
115
+ output->set_length(output->length());  // NOTE(review): sets the length to its current value; looks like a no-op. Confirm.
116
+
117
+ do {
118
+ UErrorCode err = U_ZERO_ERROR;
119
+ char* dest = &output->data()[begin_offset];
120
+ int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
121
+ input, input_len, &err);
122
+ if (err != U_BUFFER_OVERFLOW_ERROR) {  // Success or any other error ends the loop.
123
+ output->set_length(begin_offset + required_capacity);
124
+ return;
125
+ }
126
+
127
+ // Output didn't fit: grow to the size the converter reported and retry.
128
+ dest_capacity = required_capacity;
129
+ output->Resize(begin_offset + dest_capacity);
130
+ } while (true);
131
+ }
132
+
133
+ // Converts the Unicode input representing a hostname to ASCII using IDN rules.
134
+ // The output must be ASCII, but is represented as wide characters.
135
+ //
136
+ // On success, the output will be filled with the ASCII host name and it will
137
+ // return true. Unlike most other canonicalization functions, this assumes that
138
+ // the output is empty. The beginning of the host will be at offset 0, and
139
+ // the length of the output will be set to the length of the new host name.
140
+ //
141
+ // On error, this will return false. The output in this case is undefined.
142
+ bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
143
+ DCHECK(output->length() == 0); // Output buffer is assumed empty.
144
+ while (true) {
145
+ // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
146
+ // the spec (which do exist). This does not present any risk and is a
147
+ // little more future proof.
148
+ UErrorCode err = U_ZERO_ERROR;
149
+ int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
150
+ output->capacity(),
151
+ UIDNA_ALLOW_UNASSIGNED, NULL, &err);
152
+ if (U_SUCCESS(err)) {  // Warnings count as success, e.g. a result that exactly fills the buffer.
153
+ output->set_length(num_converted);
154
+ return true;
155
+ }
156
+ if (err != U_BUFFER_OVERFLOW_ERROR)
157
+ return false; // Unknown error, give up.
158
+
159
+ // Not enough room in our buffer, double the capacity and try again.
160
+ output->Resize(output->capacity() * 2);
161
+ }
162
+ }
163
+
164
+ bool ReadUTFChar(const char* str, int* begin, int length,
165
+ unsigned* code_point_out) {  // Reads one UTF-8 code point starting at |*begin|.
166
+ int code_point; // Avoids warning when U8_NEXT writes -1 to it.
167
+ U8_NEXT(str, *begin, length, code_point);
168
+ *code_point_out = static_cast<unsigned>(code_point);
169
+
170
+ // The ICU macro above moves to the next char, we want to point to the last
171
+ // char consumed.
172
+ (*begin)--;
173
+
174
+ // Validate the decoded value; on failure hand back the replacement character.
175
+ if (U_IS_UNICODE_CHAR(code_point))
176
+ return true;
177
+ *code_point_out = kUnicodeReplacementCharacter;
178
+ return false;
179
+ }
180
+
181
+ bool ReadUTFChar(const char16* str, int* begin, int length,
182
+ unsigned* code_point) {  // Reads one UTF-16 code point starting at |*begin|.
183
+ if (U16_IS_SURROGATE(str[*begin])) {
184
+ if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
185
+ !U16_IS_TRAIL(str[*begin + 1])) {
186
+ // Invalid surrogate pair: not a lead, no room for a trail, or no trail follows.
187
+ *code_point = kUnicodeReplacementCharacter;
188
+ return false;
189
+ } else {
190
+ // Valid surrogate pair: combine it and step |*begin| onto the trail unit.
191
+ *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
192
+ (*begin)++;
193
+ }
194
+ } else {
195
+ // Not a surrogate, just one 16-bit word.
196
+ *code_point = str[*begin];
197
+ }
198
+
199
+ if (U_IS_UNICODE_CHAR(*code_point))
200
+ return true;
201
+
202
+ // Invalid code point: hand back the replacement character instead.
203
+ *code_point = kUnicodeReplacementCharacter;
204
+ return false;
205
+ }
206
+
207
+ } // namespace url_canon