RubyGems - uri_parser - Versions diffs - 0.0.1 - Mend

uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

data/.gitignore +6 -0
data/.rvmrc +1 -0
data/Gemfile +6 -0
data/Rakefile +13 -0
data/ext/uri_parser/basictypes.h +89 -0
data/ext/uri_parser/extconf.h +6 -0
data/ext/uri_parser/extconf.rb +50 -0
data/ext/uri_parser/logging.h +5 -0
data/ext/uri_parser/scoped_ptr.h +322 -0
data/ext/uri_parser/string16.cc +95 -0
data/ext/uri_parser/string16.h +194 -0
data/ext/uri_parser/uri_parser.cc +87 -0
data/ext/uri_parser/url_canon.h +872 -0
data/ext/uri_parser/url_canon_etc.cc +392 -0
data/ext/uri_parser/url_canon_fileurl.cc +215 -0
data/ext/uri_parser/url_canon_host.cc +401 -0
data/ext/uri_parser/url_canon_icu.cc +207 -0
data/ext/uri_parser/url_canon_icu.h +63 -0
data/ext/uri_parser/url_canon_internal.cc +427 -0
data/ext/uri_parser/url_canon_internal.h +453 -0
data/ext/uri_parser/url_canon_internal_file.h +157 -0
data/ext/uri_parser/url_canon_ip.cc +737 -0
data/ext/uri_parser/url_canon_ip.h +101 -0
data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
data/ext/uri_parser/url_canon_path.cc +380 -0
data/ext/uri_parser/url_canon_pathurl.cc +128 -0
data/ext/uri_parser/url_canon_query.cc +189 -0
data/ext/uri_parser/url_canon_relative.cc +572 -0
data/ext/uri_parser/url_canon_stdstring.h +134 -0
data/ext/uri_parser/url_canon_stdurl.cc +211 -0
data/ext/uri_parser/url_common.h +48 -0
data/ext/uri_parser/url_file.h +108 -0
data/ext/uri_parser/url_parse.cc +760 -0
data/ext/uri_parser/url_parse.h +336 -0
data/ext/uri_parser/url_parse_file.cc +243 -0
data/ext/uri_parser/url_parse_internal.h +112 -0
data/ext/uri_parser/url_util.cc +553 -0
data/ext/uri_parser/url_util.h +222 -0
data/lib/uri_parser.rb +28 -0
data/lib/uri_parser/version.rb +3 -0
data/spec/spec_helper.rb +16 -0
data/spec/uri_parser_spec.rb +54 -0
data/uri_parser.gemspec +26 -0
metadata +117 -0

data/ext/uri_parser/url_canon_stdstring.h ADDED Viewed

@@ -0,0 +1,134 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// This header file defines a canonicalizer output method class for STL
+// strings. Because the canonicalizer tries not to be dependent on the STL,
+// we have segregated it here.
+#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#include <string>
+#include "url_canon.h"
+namespace url_canon {
+// Write into a std::string given in the constructor. This object does not own
+// the string itself, and the user must ensure that the string stays alive
+// throughout the lifetime of this object.
+//
+// The given string will be appended to; any existing data in the string will
+// be preserved. The caller should reserve() the amount of data in the string
+// they expect to be written. We will resize if necessary, but that's slow.
+//
+// Note that when canonicalization is complete, the string will likely have
+// unused space at the end because we grow the string to its full capacity()
+// to start out with. This ends up being important because resize
+// operations are slow, and because the base class needs to write directly
+// into the buffer.
+//
+// Therefore, the user should call Complete() before using the string that
+// this class wrote into.
+class StdStringCanonOutput : public CanonOutput {
+ public:
+  // Binds this output to |str|, which is not owned and must outlive this
+  // object. The write position starts at the string's current size so new
+  // data is appended, and the string is grown to its full capacity so that
+  // already-allocated space can be written into directly.
+  StdStringCanonOutput(std::string* str)
+      : CanonOutput(),
+        str_(str) {
+    cur_len_ = static_cast<int>(str_->size());  // Append to existing data.
+    str_->resize(str_->capacity());
+    SyncBufferWithString();
+  }
+  virtual ~StdStringCanonOutput() {
+    // Nothing to do, we don't own the string.
+  }
+  // Must be called after writing has completed but before the string is used.
+  // Trims the string down to the number of characters actually written.
+  void Complete() {
+    str_->resize(cur_len_);
+    buffer_len_ = cur_len_;
+  }
+  // Resizes the string to |sz| characters and re-points the buffer at the
+  // string's storage, which the resize may have reallocated.
+  virtual void Resize(int sz) {
+    str_->resize(sz);
+    SyncBufferWithString();
+  }
+ protected:
+  std::string* str_;
+ private:
+  // Points buffer_ and buffer_len_ at the string's current storage. An empty
+  // string has no writable element to take the address of (indexing element 0
+  // of an empty non-const string is undefined before C++11), so in that case
+  // the buffer is null with zero length.
+  void SyncBufferWithString() {
+    char* storage = 0;
+    if (!str_->empty())
+      storage = &(*str_)[0];
+    buffer_ = storage;
+    buffer_len_ = static_cast<int>(str_->size());
+  }
+};
+// An extension of the Replacements class that allows the setters to use
+// standard strings.
+//
+// The strings passed as arguments are not copied and must remain valid until
+// this class goes out of scope.
+template<typename STR>
+class StdStringReplacements :
+    public url_canon::Replacements<typename STR::value_type> {
+ public:
+  // Each setter forwards the characters of |s|, together with a component
+  // covering the whole string, to the matching setter of the base class.
+  void SetSchemeStr(const STR& s) {
+    this->SetScheme(s.data(), WholeString(s));
+  }
+  void SetUsernameStr(const STR& s) {
+    this->SetUsername(s.data(), WholeString(s));
+  }
+  void SetPasswordStr(const STR& s) {
+    this->SetPassword(s.data(), WholeString(s));
+  }
+  void SetHostStr(const STR& s) {
+    this->SetHost(s.data(), WholeString(s));
+  }
+  void SetPortStr(const STR& s) {
+    this->SetPort(s.data(), WholeString(s));
+  }
+  void SetPathStr(const STR& s) {
+    this->SetPath(s.data(), WholeString(s));
+  }
+  void SetQueryStr(const STR& s) {
+    this->SetQuery(s.data(), WholeString(s));
+  }
+  void SetRefStr(const STR& s) {
+    this->SetRef(s.data(), WholeString(s));
+  }
+ private:
+  // Returns a component that starts at offset 0 and spans all of |s|.
+  static url_parse::Component WholeString(const STR& s) {
+    return url_parse::Component(0, static_cast<int>(s.length()));
+  }
+};
+}  // namespace url_canon
+#endif  // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__

data/ext/uri_parser/url_canon_stdurl.cc ADDED Viewed

@@ -0,0 +1,211 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Functions to canonicalize "standard" URLs, which are ones that have an
+// authority section including a host name.
+#include "url_canon.h"
+#include "url_canon_internal.h"
+namespace url_canon {
+namespace {
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
+                               const url_parse::Parsed& parsed,
+                               CharsetConverter* query_converter,
+                               CanonOutput* output,
+                               url_parse::Parsed* new_parsed) {
+  // Canonicalizes each component in URL order, appending to |output| and
+  // recording the resulting component ranges in |new_parsed|. Returns false
+  // if any checked step failed or a required part is missing; output is
+  // still produced in that case.
+  //
+  // Scheme: the scheme canonicalizer appends the colon.
+  bool ok = CanonicalizeScheme(source.scheme, parsed.scheme,
+                               output, &new_parsed->scheme);
+  // Authority (username, password, host, port).
+  const bool have_authority =
+      parsed.username.is_valid() || parsed.password.is_valid() ||
+      parsed.host.is_nonempty() || parsed.port.is_valid();
+  if (!have_authority) {
+    // No authority: clear the components. Standard URLs must have an
+    // authority, so this is always a failure.
+    new_parsed->host.reset();
+    new_parsed->username.reset();
+    new_parsed->password.reset();
+    new_parsed->port.reset();
+    ok = false;
+  } else {
+    // The "//" separator is only written when there is a scheme.
+    if (parsed.scheme.is_valid()) {
+      output->push_back('/');
+      output->push_back('/');
+    }
+    // User info: the canonicalizer handles the ':' and '@'.
+    const bool userinfo_ok =
+        CanonicalizeUserInfo(source.username, parsed.username,
+                             source.password, parsed.password,
+                             output,
+                             &new_parsed->username,
+                             &new_parsed->password);
+    const bool host_ok = CanonicalizeHost(source.host, parsed.host,
+                                          output, &new_parsed->host);
+    // An empty host is not allowed for standard URLs.
+    const bool host_present = parsed.host.is_nonempty();
+    // Port: the port canonicalizer handles the colon. The default port is
+    // looked up from the scheme text that was just written to the output.
+    const int default_port = DefaultPortForScheme(
+        &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
+    const bool port_ok = CanonicalizePort(source.port, parsed.port,
+                                          default_port,
+                                          output, &new_parsed->port);
+    ok = ok && userinfo_ok && host_ok && host_present && port_ok;
+  }
+  // Path.
+  if (parsed.path.is_valid()) {
+    const bool path_ok = CanonicalizePath(source.path, parsed.path,
+                                          output, &new_parsed->path);
+    ok = ok && path_ok;
+  } else if (have_authority ||
+             parsed.query.is_valid() || parsed.ref.is_valid()) {
+    // The input has no path, but there is an authority or something that
+    // follows the path, so synthesize "/". An empty output path is only
+    // allowed when there is nothing else.
+    const int slash_pos = output->length();
+    output->push_back('/');
+    new_parsed->path = url_parse::Component(slash_pos, 1);
+  } else {
+    // No path at all.
+    new_parsed->path.reset();
+  }
+  // Query: its outcome does not affect the result.
+  CanonicalizeQuery(source.query, parsed.query, query_converter,
+                    output, &new_parsed->query);
+  // Ref: failure is ignored, since the page can probably still be loaded.
+  CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+  return ok;
+}
+}  // namespace
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+int DefaultPortForScheme(const char* scheme, int scheme_len) {
+  // Schemes with a known default port. The length is compared first so that
+  // |scheme| is only read when |scheme_len| matches one of the entries.
+  static const struct {
+    const char* name;
+    int len;
+    int port;
+  } kKnownSchemes[] = {
+    { "http", 4, 80 },
+    { "https", 5, 443 },
+    { "ftp", 3, 21 },
+    { "wss", 3, 443 },
+    { "gopher", 6, 70 },
+    { "ws", 2, 80 },
+  };
+  const int known_count =
+      static_cast<int>(sizeof(kKnownSchemes) / sizeof(kKnownSchemes[0]));
+  for (int i = 0; i < known_count; ++i) {
+    if (kKnownSchemes[i].len == scheme_len &&
+        !strncmp(scheme, kKnownSchemes[i].name, scheme_len))
+      return kKnownSchemes[i].port;
+  }
+  // Anything else has no known default.
+  return url_parse::PORT_UNSPECIFIED;
+}
+bool CanonicalizeStandardURL(const char* spec,
+                             int spec_len,
+                             const url_parse::Parsed& parsed,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+  // 8-bit entry point: wraps |spec| in a component source and hands off to
+  // the shared template. |spec_len| is not used by this function.
+  const URLComponentSource<char> spec_source(spec);
+  return DoCanonicalizeStandardURL<char, unsigned char>(
+      spec_source, parsed, query_converter, output, new_parsed);
+}
+bool CanonicalizeStandardURL(const char16* spec,
+                             int spec_len,
+                             const url_parse::Parsed& parsed,
+                             CharsetConverter* query_converter,
+                             CanonOutput* output,
+                             url_parse::Parsed* new_parsed) {
+  // 16-bit entry point: wraps |spec| in a component source and hands off to
+  // the shared template. |spec_len| is not used by this function.
+  const URLComponentSource<char16> spec_source(spec);
+  return DoCanonicalizeStandardURL<char16, char16>(
+      spec_source, parsed, query_converter, output, new_parsed);
+}
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
+bool ReplaceStandardURL(const char* base,
+                        const url_parse::Parsed& base_parsed,
+                        const Replacements<char>& replacements,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* new_parsed) {
+  // Start from the base URL's component source and a private copy of its
+  // parse, let the replacements override both, then canonicalize the
+  // combined result from scratch.
+  URLComponentSource<char> merged_source(base);
+  url_parse::Parsed merged_parsed(base_parsed);
+  SetupOverrideComponents(base, replacements, &merged_source, &merged_parsed);
+  return DoCanonicalizeStandardURL<char, unsigned char>(
+      merged_source, merged_parsed, query_converter, output, new_parsed);
+}
+// For 16-bit replacements, we turn all the replacements into UTF-8 so the
+// regular codepath can be used.
+bool ReplaceStandardURL(const char* base,
+                        const url_parse::Parsed& base_parsed,
+                        const Replacements<char16>& replacements,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* new_parsed) {
+  // Scratch buffer handed to the UTF-16 override setup. It is declared
+  // first and stays alive until the canonicalization below has returned.
+  RawCanonOutput<1024> utf8_buffer;
+  URLComponentSource<char> merged_source(base);
+  url_parse::Parsed merged_parsed(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_buffer,
+                               &merged_source, &merged_parsed);
+  // From here on the regular 8-bit code path is used.
+  return DoCanonicalizeStandardURL<char, unsigned char>(
+      merged_source, merged_parsed, query_converter, output, new_parsed);
+}
+}  // namespace url_canon

data/ext/uri_parser/url_common.h ADDED Viewed

@@ -0,0 +1,48 @@
+// Copyright 2010, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#ifndef GOOGLEURL_SRC_URL_COMMON_H__
+#define GOOGLEURL_SRC_URL_COMMON_H__
+#if !defined(GURL_IMPLEMENTATION)  // Default to 0 when the build does not set it.
+#define GURL_IMPLEMENTATION 0
+#endif  // !defined(GURL_IMPLEMENTATION)
+#if defined(WIN32) && defined(GURL_DLL)  // Windows DLL build.
+#if GURL_IMPLEMENTATION  // Nonzero: mark symbols as exported.
+#define GURL_API __declspec(dllexport)
+#else  // Zero: mark symbols as imported.
+#define GURL_API __declspec(dllimport)
+#endif  // GURL_IMPLEMENTATION
+#else  // Not a Windows DLL build: GURL_API expands to nothing.
+#define GURL_API
+#endif  // defined(WIN32) && defined(GURL_DLL)
+#endif  // GOOGLEURL_SRC_URL_COMMON_H__

data/ext/uri_parser/url_file.h ADDED Viewed

@@ -0,0 +1,108 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Provides shared functions used by the internals of the parser and
+// canonicalizer for file URLs. Do not use outside of these modules.
+#ifndef GOOGLEURL_SRC_URL_FILE_H__
+#define GOOGLEURL_SRC_URL_FILE_H__
+#include "url_parse_internal.h"
+namespace url_parse {
+#ifdef WIN32
+// We allow both "c:" and "c|" as drive identifiers.
+inline bool IsWindowsDriveSeparator(char16 ch) {
+  // Both ':' and '|' are accepted after the drive letter.
+  switch (ch) {
+    case ':':
+    case '|':
+      return true;
+    default:
+      return false;
+  }
+}
+inline bool IsWindowsDriveLetter(char16 ch) {
+  // A drive letter is a single ASCII letter in either case.
+  const bool is_upper = ch >= 'A' && ch <= 'Z';
+  const bool is_lower = ch >= 'a' && ch <= 'z';
+  return is_upper || is_lower;
+}
+#endif  // WIN32
+// Returns the index of the next slash in the input after the given index, or
+// spec_len if the end of the input is reached.
+template<typename CHAR>
+inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) {
+  // Scan forward from |begin_index| and stop at the first slash or at
+  // |spec_len|. If |begin_index| is already at or past |spec_len|, nothing
+  // is read and |begin_index| is returned unchanged.
+  int pos = begin_index;
+  for (; pos < spec_len; ++pos) {
+    if (IsURLSlash(spec[pos]))
+      break;
+  }
+  return pos;
+}
+#ifdef WIN32
+// Returns true if the start_offset in the given spec looks like it begins a
+// drive spec, for example "c:". This function explicitly handles start_offset
+// values that are equal to or larger than the spec_len to simplify callers.
+//
+// If this returns true, the spec is guaranteed to have a valid drive letter
+// plus a drive separator (':' or '|') starting at |start_offset|.
+template<typename CHAR>
+inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                      int spec_len) {
+  // Two characters are needed: a drive letter followed by a drive
+  // separator. The length check comes first so that |spec| is not read
+  // when fewer than two characters remain.
+  const int chars_left = spec_len - start_offset;
+  return chars_left >= 2 &&
+         IsWindowsDriveLetter(spec[start_offset]) &&
+         IsWindowsDriveSeparator(spec[start_offset + 1]);
+}
+// Returns true if the start_offset in the given text looks like it begins a
+// UNC path, for example "\\". This function explicitly handles start_offset
+// values that are equal to or larger than the spec_len to simplify callers.
+//
+// When strict_slashes is set, this function will only accept backslashes as is
+// standard for Windows. Otherwise, it will accept forward slashes as well
+// which we use for a lot of URL handling.
+template<typename CHAR>
+inline bool DoesBeginUNCPath(const CHAR* text,
+                             int start_offset,
+                             int len,
+                             bool strict_slashes) {
+  // A UNC prefix needs two characters; bail out before reading |text| if
+  // fewer than two remain.
+  if (len - start_offset < 2)
+    return false;
+  if (!strict_slashes) {
+    // Lenient mode: anything IsURLSlash() accepts counts as a slash.
+    return IsURLSlash(text[start_offset]) &&
+           IsURLSlash(text[start_offset + 1]);
+  }
+  // Strict mode: only two backslashes qualify.
+  return text[start_offset] == '\\' && text[start_offset + 1] == '\\';
+}
+#endif  // WIN32
+}  // namespace url_parse
+#endif  // GOOGLEURL_SRC_URL_FILE_H__