RubyGems - uri_parser - Versions diffs - 0.0.1 - Mend

uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

data/.gitignore +6 -0
data/.rvmrc +1 -0
data/Gemfile +6 -0
data/Rakefile +13 -0
data/ext/uri_parser/basictypes.h +89 -0
data/ext/uri_parser/extconf.h +6 -0
data/ext/uri_parser/extconf.rb +50 -0
data/ext/uri_parser/logging.h +5 -0
data/ext/uri_parser/scoped_ptr.h +322 -0
data/ext/uri_parser/string16.cc +95 -0
data/ext/uri_parser/string16.h +194 -0
data/ext/uri_parser/uri_parser.cc +87 -0
data/ext/uri_parser/url_canon.h +872 -0
data/ext/uri_parser/url_canon_etc.cc +392 -0
data/ext/uri_parser/url_canon_fileurl.cc +215 -0
data/ext/uri_parser/url_canon_host.cc +401 -0
data/ext/uri_parser/url_canon_icu.cc +207 -0
data/ext/uri_parser/url_canon_icu.h +63 -0
data/ext/uri_parser/url_canon_internal.cc +427 -0
data/ext/uri_parser/url_canon_internal.h +453 -0
data/ext/uri_parser/url_canon_internal_file.h +157 -0
data/ext/uri_parser/url_canon_ip.cc +737 -0
data/ext/uri_parser/url_canon_ip.h +101 -0
data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
data/ext/uri_parser/url_canon_path.cc +380 -0
data/ext/uri_parser/url_canon_pathurl.cc +128 -0
data/ext/uri_parser/url_canon_query.cc +189 -0
data/ext/uri_parser/url_canon_relative.cc +572 -0
data/ext/uri_parser/url_canon_stdstring.h +134 -0
data/ext/uri_parser/url_canon_stdurl.cc +211 -0
data/ext/uri_parser/url_common.h +48 -0
data/ext/uri_parser/url_file.h +108 -0
data/ext/uri_parser/url_parse.cc +760 -0
data/ext/uri_parser/url_parse.h +336 -0
data/ext/uri_parser/url_parse_file.cc +243 -0
data/ext/uri_parser/url_parse_internal.h +112 -0
data/ext/uri_parser/url_util.cc +553 -0
data/ext/uri_parser/url_util.h +222 -0
data/lib/uri_parser.rb +28 -0
data/lib/uri_parser/version.rb +3 -0
data/spec/spec_helper.rb +16 -0
data/spec/uri_parser_spec.rb +54 -0
data/uri_parser.gemspec +26 -0
metadata +117 -0

data/ext/uri_parser/url_canon_host.cc ADDED Viewed

@@ -0,0 +1,401 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#include "logging.h"
+#include "url_canon.h"
+#include "url_canon_internal.h"
+namespace url_canon {
+namespace {
+// For reference, here's what IE supports:
+// Key: 0 (disallowed: failure if present in the input)
+//      + (allowed either escaped or unescaped, and unmodified)
+//      U (allowed escaped or unescaped but always unescaped if present in
+//         escaped form)
+//      E (allowed escaped or unescaped but always escaped if present in
+//         unescaped form)
+//      % (only allowed escaped in the input, will be unmodified).
+//      I left blank alpha numeric characters.
+//
+//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+//    -----------------------------------------------
+// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
+// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
+// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
+// 4   %
+// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
+// 6   E                                               <-- That's  `
+// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
+//
+// NOTE: I didn't actually test all the control characters. Some may be
+// disallowed in the input, but they are all accepted escaped except for 0.
+// I also didn't test if characters affecting HTML parsing are allowed
+// unescaped, eg. (") or (#), which would indicate the beginning of the path.
+// Surprisingly, space is accepted in the input and always escaped.
+// This table lists the canonical version of all characters we allow in the
+// input, with 0 indicating it is disallowed. We use the magic kEsc
+// value to indicate that this character should be escaped. We are a little more
+// restrictive than IE, but less restrictive than Firefox.
+//
+// Note that we disallow the % character. We will allow it when part of an
+// escape sequence, of course, but this disallows "%25". Even though IE allows
+// it, allowing it would put us in a funny state. If there was an invalid
+// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
+// Allowing percents means we'll succeed a second time, so validity would change
+// based on how many times you run the canonicalizer. We prefer to always report
+// the same validity, so reject this.
+const unsigned char kEsc = 0xff;  // Table marker: character is accepted but is written percent-escaped.
+const unsigned char kHostCharLookup[0x80] = {  // Indexed by 7-bit code: 0 = invalid, kEsc = escape, else canonical char.
+// 00-1f: all are invalid (control characters)
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
+//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
+//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O   (upper case maps to lower case)
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
+//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };
+const int kTempHostBufferLen = 1024;  // Size argument for the temporary host buffers declared below.
+typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;  // 8-bit temporary buffer.
+typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;  // 16-bit temporary buffer.
+// Walks the host component of |spec| once and reports two facts about it:
+// |*has_non_ascii| is set when any code unit lies outside the 7-bit range,
+// and |*has_escaped| is set when a percent sign is present. Both flags are
+// cleared first, so an empty component yields false/false.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec, const url_parse::Component& host,
+                  bool* has_non_ascii, bool* has_escaped) {
+  *has_escaped = false;
+  *has_non_ascii = false;
+  const int stop = host.end();
+  int pos = host.begin;
+  while (pos < stop) {
+    const CHAR unit = spec[pos++];
+    // Compare through UCHAR so that a signed 8-bit CHAR with the high bit set
+    // is seen as >= 0x80 rather than as a negative number.
+    if (static_cast<UCHAR>(unit) >= 0x80) {
+      *has_non_ascii = true;
+    } else if (unit == '%') {
+      *has_escaped = true;
+    }
+  }
+}
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
+//
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
+//
+// This function is used in two situations:
+//
+//  * When the caller knows there is no non-ASCII or percent escaped
+//    characters. This is what DoHost does. The result will be a completely
+//    canonicalized host since we know nothing weird can happen (escaped
+//    characters could be unescaped to non-7-bit, so they have to be treated
+//    with suspicion at this point). It does not use the |has_non_ascii| flag.
+//
+//  * When the caller has an 8-bit string that may need unescaping.
+//    DoComplexHost calls us in this situation to do unescaping and validation.
+//    After this, it may do other IDN operations depending on the value of the
+//    |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  // Parameters:
+  //   host, host_len - the host text to canonicalize (not NUL-terminated).
+  //   output         - receives the canonical form; data is appended.
+  //   has_non_ascii  - set to true if any value >= 0x80 was written to the
+  //                    output, false otherwise.
+  // Returns false if an invalid character or a malformed escape sequence was
+  // seen. Output is still produced in that case so the URL looks reasonable.
+  *has_non_ascii = false;
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible. The decoded value is received in a real
+      // byte and then widened. Writing it through a pointer to |source|
+      // reinterpreted as unsigned char* would only update the low-order byte
+      // on little-endian machines and would corrupt the value elsewhere.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escaped character. There is nothing that can make this
+        // host valid. We append an escaped percent so the URL looks reasonable
+        // and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      source = unescaped;
+    }
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+  return success;
+}
+// Runs a UTF-16 host through IDN conversion and then through the regular
+// ASCII host canonicalizer, appending the result to |output|. Returns true
+// when both steps accept the host.
+bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+  // Percent-escaping has to be applied before the IDN step, because punycode
+  // strings cannot be escaped once they have been created. The result of
+  // this first pass is deliberately not checked here; the second pass below
+  // decides validity.
+  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
+  bool saw_non_ascii;
+  DoSimpleHost(src, src_len, &url_escaped_host, &saw_non_ascii);
+
+  StackBufferW idn_ascii;
+  const bool converted = IDNToASCII(url_escaped_host.data(),
+                                    url_escaped_host.length(),
+                                    &idn_ascii);
+  if (!converted) {
+    // IDN conversion failed. Emit some reasonable looking representation of
+    // the original string and report the failure.
+    AppendInvalidNarrowString(src, 0, src_len, output);
+    return false;
+  }
+
+  // Treat the converted text like any ordinary host. This pass also handles
+  // unescaping: input was unescaped earlier, but something like a fullwidth
+  // "%00" is turned into plain ASCII by ICU and therefore shows up only now.
+  const bool host_ok = DoSimpleHost(idn_ascii.data(), idn_ascii.length(),
+                                    output, &saw_non_ascii);
+  DCHECK(!saw_non_ascii);
+  return host_ok;
+}
+// Host canonicalization for 8-bit (UTF-8) input that contains non-ASCII
+// bytes and/or percent escapes. |has_escaped| must be set when the input
+// needs unescaping. The text is unescaped if necessary, converted to UTF-16
+// and handed to DoIDNHost. Returns false if any step rejects the host.
+bool DoComplexHost(const char* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  // Remember where this host starts in |output|. Data written below may have
+  // to be discarded again, and this is the position to rewind to.
+  const int rewind_to = output->length();
+
+  // By default the UTF-8 to convert is the caller's buffer. (Without escapes
+  // the input is known to contain non-ASCII, otherwise the simple version
+  // would have been called instead of this function.)
+  const char* utf8_source = host;
+  int utf8_source_len = host_len;
+
+  if (has_escaped) {
+    // Unescape straight into |output|: most of the time the result needs no
+    // IDN processing, and doing so avoids yet another large stack buffer.
+    // The call also refreshes |has_non_ascii| for the unescaped text.
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+      // A bad escape sequence. DoSimpleHost already wrote "reasonable"
+      // output, which is treated as the final result.
+      return false;
+    }
+    if (!has_non_ascii) {
+      // Plain ASCII after unescaping; what was written is complete.
+      return true;
+    }
+    // The unescaped bytes sit at the tail of |output| (which may already
+    // hold earlier data), so read the UTF-8 from there.
+    utf8_source = output->data() + rewind_to;
+    utf8_source_len = output->length() - rewind_to;
+  }
+
+  // Non-ASCII input requires IDN: convert to UTF-16 first.
+  StackBufferW utf16;
+  if (ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
+    // Drop anything written during unescaping, then let DoIDNHost produce
+    // the final text. It calls DoSimpleHost for the normal ASCII
+    // canonicalization of the converted output.
+    output->set_length(rewind_to);
+    return DoIDNHost(utf16.data(), utf16.length(), output);
+  }
+
+  // Invalid UTF-8. The source may alias |output|, so copy it aside before
+  // rewinding and writing the escaped form.
+  StackBuffer rejected;
+  for (int copied = 0; copied < utf8_source_len; ++copied)
+    rejected.push_back(utf8_source[copied]);
+  output->set_length(rewind_to);
+  AppendInvalidNarrowString(rejected.data(), 0, rejected.length(), output);
+  return false;
+}
+// Host canonicalization for UTF-16 input that contains non-ASCII characters
+// and/or percent escapes. |has_escaped| must be set when the input needs
+// unescaping. Returns false if the host is rejected.
+bool DoComplexHost(const char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (!has_escaped) {
+    // Nothing to unescape, so the input is already in the form the IDN code
+    // wants. This function is only reached for escaped or non-ASCII input,
+    // and even pure ASCII would still be handled correctly (just slower
+    // than necessary).
+    return DoIDNHost(host, host_len, output);
+  }
+
+  // Escaped characters in wide input are to be interpreted as UTF-8. So:
+  // convert to UTF-8, unescape, then convert back to UTF-16 for IDN. The
+  // all-ASCII case is not special-cased (it could be a plain copy) because
+  // escaped host names should be very rare and the conversion is cheap.
+  StackBuffer narrow;
+  if (!ConvertUTF16ToUTF8(host, host_len, &narrow)) {
+    AppendInvalidNarrowString(host, 0, host_len, output);
+    return false;
+  }
+
+  // From here on the 8-bit overload does all the work.
+  return DoComplexHost(narrow.data(), narrow.length(), has_non_ascii,
+                       has_escaped, output);
+}
+// Canonicalizes the |host| component of |spec|, appending the result to
+// |output| and describing it in |host_info| (family and output range).
+template<typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec,
+            const url_parse::Component& host,
+            CanonOutput* output,
+            CanonHostInfo* host_info) {
+  if (host.len <= 0) {
+    // An empty host produces no output at all.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = url_parse::Component();
+    return;
+  }
+
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
+
+  // Where this host starts in the output; needed for rewinding and for the
+  // reported output range.
+  const int output_begin = output->length();
+
+  bool success;
+  if (has_non_ascii || has_escaped) {
+    success = DoComplexHost(&spec[host.begin], host.len,
+                            has_non_ascii, has_escaped, output);
+  } else {
+    success = DoSimpleHost(&spec[host.begin], host.len,
+                           output, &has_non_ascii);
+    DCHECK(!has_non_ascii);
+  }
+
+  if (success) {
+    // With everything else canonicalized, see whether the result is an IP
+    // address. IP addresses are small, so this temporary buffer should not
+    // cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    CanonicalizeIPAddress(output->data(),
+                          url_parse::MakeRange(output_begin, output->length()),
+                          &canon_ip, host_info);
+    // For an IPv4/IPv6 address, replace what was written with the canonical
+    // IP form. A hostname or broken IP is simply left in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(output_begin);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  } else {
+    // Canonicalization failed. BROKEN tells the caller about it.
+    host_info->family = CanonHostInfo::BROKEN;
+  }
+
+  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
+}
+}  // namespace
+// 8-bit entry point: canonicalizes |host| of |spec| into |output|, stores the
+// written range in |*out_host| and returns false only for a BROKEN host.
+bool CanonicalizeHost(const char* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char, unsigned char>(spec, host, output, &info);
+  *out_host = info.out_host;
+  const bool usable = (info.family != CanonHostInfo::BROKEN);
+  return usable;
+}
+// 16-bit entry point: canonicalizes |host| of |spec| into |output|, stores
+// the written range in |*out_host| and returns false only for a BROKEN host.
+bool CanonicalizeHost(const char16* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char16, char16>(spec, host, output, &info);
+  *out_host = info.out_host;
+  const bool usable = (info.family != CanonHostInfo::BROKEN);
+  return usable;
+}
+// 8-bit entry point that exposes the full CanonHostInfo filled in by DoHost
+// instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const char* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+// 16-bit entry point that exposes the full CanonHostInfo filled in by DoHost
+// instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const char16* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<char16, char16>(spec, host, output, host_info);
+}
+}  // namespace url_canon

data/ext/uri_parser/url_canon_icu.cc ADDED Viewed

@@ -0,0 +1,207 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// ICU integration functions.
+#include <stdlib.h>
+#include <string.h>
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/uidna.h>
+#include "url_canon_icu.h"
+#include "url_canon_internal.h"  // for _itoa_s
+#include "logging.h"
+namespace url_canon {
+namespace {
+// Converter callback for characters the destination charset cannot
+// represent. For an unassigned code point it writes the percent-escaped form
+// of the numeric character reference "&#1234;", i.e. "%26%231234%3B" (only
+// the non-digits are escaped). Why? This is what Netscape did back in the
+// olden days. Any other callback reason is left untouched.
+void appendURLEscapedChar(const void* context,
+                          UConverterFromUnicodeArgs* from_args,
+                          const UChar* code_units,
+                          int32_t length,
+                          UChar32 code_point,
+                          UConverterCallbackReason reason,
+                          UErrorCode* err) {
+  if (reason != UCNV_UNASSIGNED)
+    return;
+
+  // The situation is handled here, so clear the pending error.
+  *err = U_ZERO_ERROR;
+
+  static const char kEscapedPrefix[] = "%26%23";  // "&#" percent-escaped
+  ucnv_cbFromUWriteBytes(from_args, kEscapedPrefix,
+                         static_cast<int>(sizeof(kEscapedPrefix) - 1), 0, err);
+
+  DCHECK(code_point < 0x110000);
+  char digits[8];  // Max Unicode code point is 7 digits.
+  _itoa_s(code_point, digits, 10);
+  ucnv_cbFromUWriteBytes(from_args, digits,
+                         static_cast<int>(strlen(digits)), 0, err);
+
+  static const char kEscapedSuffix[] = "%3B";     // ";" percent-escaped
+  ucnv_cbFromUWriteBytes(from_args, kEscapedSuffix,
+                         static_cast<int>(sizeof(kEscapedSuffix) - 1), 0, err);
+}
+// A class for scoping the installation of the invalid character callback.
+// The constructor installs appendURLEscapedChar on the converter and saves
+// the previously installed callback/context; the destructor puts them back.
+class AppendHandlerInstaller {
+ public:
+  // The owner of this object must ensure that the converter is alive for the
+  // duration of this object's lifetime.
+  //
+  // Explicit so that a bare UConverter* is never silently turned into a
+  // temporary installer (which would install and immediately uninstall).
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+ private:
+  // Not copyable: a copy would restore the saved callback a second time when
+  // it is destroyed. Declared but intentionally never defined.
+  AppendHandlerInstaller(const AppendHandlerInstaller&);
+  AppendHandlerInstaller& operator=(const AppendHandlerInstaller&);
+
+  UConverter* converter_;                  // Not owned.
+  UConverterFromUCallback old_callback_;   // Callback to restore.
+  const void* old_context_;                // Context to restore.
+};
+}  // namespace
+// Stores |converter| for later use by ConvertFromUTF16.
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+    : converter_(converter) {}
+// Converts |input| (UTF-16, |input_len| code units) with the stored converter
+// and appends the bytes to |output|, growing |output| until the result fits.
+void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
+                                           int input_len,
+                                           CanonOutput* output) {
+  // While |handler| is alive, characters that can not be represented in the
+  // destination character set are routed to our escaping callback.
+  AppendHandlerInstaller handler(converter_);
+
+  const int start = output->length();
+  int room = output->capacity() - start;
+  output->set_length(output->length());
+
+  for (;;) {
+    UErrorCode status = U_ZERO_ERROR;
+    // Fetch the data pointer on every pass; Resize() below may change it.
+    const int needed = ucnv_fromUChars(converter_, output->data() + start,
+                                       room, input, input_len, &status);
+    if (status != U_BUFFER_OVERFLOW_ERROR) {
+      output->set_length(start + needed);
+      return;
+    }
+    // Output didn't fit: grow to the size ICU asked for and try again.
+    room = needed;
+    output->Resize(start + room);
+  }
+}
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
+  DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+  while (true) {
+    // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
+    // the spec (which do exist). This does not present any risk and is a
+    // little more future proof.
+    UErrorCode err = U_ZERO_ERROR;
+    int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
+                                         output->capacity(),
+                                         UIDNA_ALLOW_UNASSIGNED, NULL, &err);
+    // U_SUCCESS also accepts ICU warning codes. Comparing against
+    // U_ZERO_ERROR alone would report a failure for a conversion that did
+    // complete but carried a warning (for instance when the result exactly
+    // fills the buffer and ICU could not add a terminator).
+    if (U_SUCCESS(err)) {
+      output->set_length(num_converted);
+      return true;
+    }
+    if (err != U_BUFFER_OVERFLOW_ERROR)
+      return false;  // Unknown error, give up.
+    // Not enough room in our buffer, expand.
+    output->Resize(output->capacity() * 2);
+  }
+}
+// Decodes one UTF-8 sequence starting at str[*begin]. On return |*begin|
+// indexes the last byte that was consumed. Returns true and the code point
+// for a valid Unicode character; otherwise returns false and stores
+// kUnicodeReplacementCharacter in |*code_point_out|.
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  int decoded;  // Signed: avoids a warning when U8_NEXT writes -1 to it.
+  U8_NEXT(str, *begin, length, decoded);
+  // U8_NEXT leaves the index one past the sequence; step back onto its last
+  // byte.
+  --(*begin);
+
+  if (!U_IS_UNICODE_CHAR(decoded)) {
+    *code_point_out = kUnicodeReplacementCharacter;
+    return false;
+  }
+  *code_point_out = static_cast<unsigned>(decoded);
+  return true;
+}
+// Decodes one UTF-16 code point starting at str[*begin]. For a surrogate
+// pair |*begin| is advanced onto the trail unit; otherwise it is unchanged.
+// Returns true and the code point for a valid Unicode character; otherwise
+// returns false and stores kUnicodeReplacementCharacter in |*code_point|.
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point) {
+  if (!U16_IS_SURROGATE(str[*begin])) {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = str[*begin];
+  } else if (U16_IS_SURROGATE_LEAD(str[*begin]) && *begin + 1 < length &&
+             U16_IS_TRAIL(str[*begin + 1])) {
+    // Valid surrogate pair: combine both units and consume the second one.
+    *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
+    (*begin)++;
+  } else {
+    // Lone trail unit, or a lead unit without a following trail unit.
+    *code_point = kUnicodeReplacementCharacter;
+    return false;
+  }
+
+  if (!U_IS_UNICODE_CHAR(*code_point)) {
+    // Invalid code point.
+    *code_point = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+}  // namespace url_canon