uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,95 @@
1
+ // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2
+ //
3
+ // Redistribution and use in source and binary forms, with or without
4
+ // modification, are permitted provided that the following conditions are
5
+ // met:
6
+ //
7
+ // * Redistributions of source code must retain the above copyright
8
+ // notice, this list of conditions and the following disclaimer.
9
+ // * Redistributions in binary form must reproduce the above
10
+ // copyright notice, this list of conditions and the following disclaimer
11
+ // in the documentation and/or other materials provided with the
12
+ // distribution.
13
+ // * Neither the name of Google Inc. nor the names of its
14
+ // contributors may be used to endorse or promote products derived from
15
+ // this software without specific prior written permission.
16
+ //
17
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ #include "string16.h"
30
+
31
+ #include <string.h>
32
+ #ifdef WIN32
33
+
34
+ #error This file should not be used on 2-byte wchar_t systems
35
+ // If this winds up being needed on 2-byte wchar_t systems, either the
36
+ // definitions below can be used, or the host system's wide character
37
+ // functions like wmemcmp can be wrapped.
38
+
39
+ #else // !WIN32
40
+
41
+ namespace base {
42
+
43
+ int c16memcmp(const char16* s1, const char16* s2, size_t n) {
44
+ // We cannot call memcmp because that changes the semantics.
45
+ while (n-- > 0) {
46
+ if (*s1 != *s2) {
47
+ // We cannot use (*s1 - *s2) because char16 is unsigned.
48
+ return ((*s1 < *s2) ? -1 : 1);
49
+ }
50
+ ++s1;
51
+ ++s2;
52
+ }
53
+ return 0;
54
+ }
55
+
56
+ size_t c16len(const char16* s) {
57
+ const char16 *s_orig = s;
58
+ while (*s) {
59
+ ++s;
60
+ }
61
+ return s - s_orig;
62
+ }
63
+
64
+ const char16* c16memchr(const char16* s, char16 c, size_t n) {
65
+ while (n-- > 0) {
66
+ if (*s == c) {
67
+ return s;
68
+ }
69
+ ++s;
70
+ }
71
+ return 0;
72
+ }
73
+
74
+ char16* c16memmove(char16* s1, const char16* s2, size_t n) {
75
+ return reinterpret_cast<char16*>(memmove(s1, s2, n * sizeof(char16)));
76
+ }
77
+
78
+ char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
79
+ return reinterpret_cast<char16*>(memcpy(s1, s2, n * sizeof(char16)));
80
+ }
81
+
82
+ char16* c16memset(char16* s, char16 c, size_t n) {
83
+ char16 *s_orig = s;
84
+ while (n-- > 0) {
85
+ *s = c;
86
+ ++s;
87
+ }
88
+ return s_orig;
89
+ }
90
+
91
+ } // namespace base
92
+
93
+ template class std::basic_string<char16, base::string16_char_traits>;
94
+
95
+ #endif // WIN32
@@ -0,0 +1,194 @@
1
+ // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2
+ //
3
+ // Redistribution and use in source and binary forms, with or without
4
+ // modification, are permitted provided that the following conditions are
5
+ // met:
6
+ //
7
+ // * Redistributions of source code must retain the above copyright
8
+ // notice, this list of conditions and the following disclaimer.
9
+ // * Redistributions in binary form must reproduce the above
10
+ // copyright notice, this list of conditions and the following disclaimer
11
+ // in the documentation and/or other materials provided with the
12
+ // distribution.
13
+ // * Neither the name of Google Inc. nor the names of its
14
+ // contributors may be used to endorse or promote products derived from
15
+ // this software without specific prior written permission.
16
+ //
17
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ #ifndef BASE_STRING16_H_
30
+ #define BASE_STRING16_H_
31
+
32
+ // WHAT:
33
+ // A version of std::basic_string that provides 2-byte characters even when
34
+ // wchar_t is not implemented as a 2-byte type. You can access this class as
35
+ // string16. We also define char16, which string16 is based upon.
36
+ //
37
+ // WHY:
38
+ // On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
39
+ // data. Plenty of existing code operates on strings encoded as UTF-16.
40
+ //
41
+ // On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
42
+ // it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
43
+ // at run time, because it calls some functions (like wcslen) that come from
44
+ // the system's native C library -- which was built with a 4-byte wchar_t!
45
+ // It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
46
+ // entirely improper on those systems where the encoding of wchar_t is defined
47
+ // as UTF-32.
48
+ //
49
+ // Here, we define string16, which is similar to std::wstring but replaces all
50
+ // libc functions with custom, 2-byte-char compatible routines. It is capable
51
+ // of carrying UTF-16-encoded data.
52
+
53
+ #include <string>
54
+ #include <cstdio>
55
+
56
+
57
+ #include "basictypes.h"
58
+
59
+ #ifdef WIN32
60
+
61
+ typedef wchar_t char16;
62
+ typedef std::wstring string16;
63
+
64
+ #else // !WIN32
65
+
66
+ typedef uint16 char16;
67
+
68
+ namespace base {
69
+
70
+ // char16 versions of the functions required by string16_char_traits; these
71
+ // are based on the wide character functions of similar names ("w" or "wcs"
72
+ // instead of "c16").
73
+ int c16memcmp(const char16* s1, const char16* s2, size_t n);
74
+ size_t c16len(const char16* s);
75
+ const char16* c16memchr(const char16* s, char16 c, size_t n);
76
+ char16* c16memmove(char16* s1, const char16* s2, size_t n);
77
+ char16* c16memcpy(char16* s1, const char16* s2, size_t n);
78
+ char16* c16memset(char16* s, char16 c, size_t n);
79
+
80
+ struct string16_char_traits {
81
+ typedef char16 char_type;
82
+ typedef int int_type;
83
+
84
+ typedef std::streamoff off_type;
85
+ typedef mbstate_t state_type;
86
+ typedef std::fpos<state_type> pos_type;
87
+
88
+ static void assign(char_type& c1, const char_type& c2) {
89
+ c1 = c2;
90
+ }
91
+
92
+ static bool eq(const char_type& c1, const char_type& c2) {
93
+ return c1 == c2;
94
+ }
95
+ static bool lt(const char_type& c1, const char_type& c2) {
96
+ return c1 < c2;
97
+ }
98
+
99
+ static int compare(const char_type* s1, const char_type* s2, size_t n) {
100
+ return c16memcmp(s1, s2, n);
101
+ }
102
+
103
+ static size_t length(const char_type* s) {
104
+ return c16len(s);
105
+ }
106
+
107
+ static const char_type* find(const char_type* s, size_t n,
108
+ const char_type& a) {
109
+ return c16memchr(s, a, n);
110
+ }
111
+
112
+ static char_type* move(char_type* s1, const char_type* s2, int_type n) {
113
+ return c16memmove(s1, s2, n);
114
+ }
115
+
116
+ static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
117
+ return c16memcpy(s1, s2, n);
118
+ }
119
+
120
+ static char_type* assign(char_type* s, size_t n, char_type a) {
121
+ return c16memset(s, a, n);
122
+ }
123
+
124
+ static int_type not_eof(const int_type& c) {
125
+ return eq_int_type(c, eof()) ? 0 : c;
126
+ }
127
+
128
+ static char_type to_char_type(const int_type& c) {
129
+ return char_type(c);
130
+ }
131
+
132
+ static int_type to_int_type(const char_type& c) {
133
+ return int_type(c);
134
+ }
135
+
136
+ static bool eq_int_type(const int_type& c1, const int_type& c2) {
137
+ return c1 == c2;
138
+ }
139
+
140
+ static int_type eof() {
141
+ return static_cast<int_type>(EOF);
142
+ }
143
+ };
144
+
145
+ } // namespace base
146
+
147
+ // The string class will be explicitly instantiated only once, in string16.cc.
148
+ //
149
+ // std::basic_string<> in GNU libstdc++ contains a static data member,
150
+ // _S_empty_rep_storage, to represent empty strings. When an operation such
151
+ // as assignment or destruction is performed on a string, causing its existing
152
+ // data member to be invalidated, it must not be freed if this static data
153
+ // member is being used. Otherwise, it counts as an attempt to free static
154
+ // (and not allocated) data, which is a memory error.
155
+ //
156
+ // Generally, due to C++ template magic, _S_empty_rep_storage will be marked
157
+ // as a coalesced symbol, meaning that the linker will combine multiple
158
+ // instances into a single one when generating output.
159
+ //
160
+ // If a string class is used by multiple shared libraries, a problem occurs.
161
+ // Each library will get its own copy of _S_empty_rep_storage. When strings
162
+ // are passed across a library boundary for alteration or destruction, memory
163
+ // errors will result. GNU libstdc++ contains a configuration option,
164
+ // --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
165
+ // disables the static data member optimization, but it's a good optimization
166
+ // and non-STL code is generally at the mercy of the system's STL
167
+ // configuration. Fully-dynamic strings are not the default for GNU libstdc++
168
+ // itself or for the libstdc++ installations on the systems we care
169
+ // about, such as Mac OS X and relevant flavors of Linux.
170
+ //
171
+ // See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
172
+ //
173
+ // To avoid problems, string classes need to be explicitly instantiated only
174
+ // once, in exactly one library. All other string users see it via an "extern"
175
+ // declaration. This is precisely how GNU libstdc++ handles
176
+ // std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
177
+ //
178
+ // This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
179
+ // in which the linker does not fully coalesce symbols when dead code
180
+ // stripping is enabled. This bug causes the memory errors described above
181
+ // to occur even when a std::basic_string<> does not cross shared library
182
+ // boundaries, such as in statically-linked executables.
183
+ //
184
+ // TODO(mark): File this bug with Apple and update this note with a bug number.
185
+
186
+ extern template class std::basic_string<char16, base::string16_char_traits>;
187
+
188
+ typedef std::basic_string<char16, base::string16_char_traits> string16;
189
+
190
+ extern std::ostream& operator<<(std::ostream& out, const string16& str);
191
+
192
+ #endif // !WIN32
193
+
194
+ #endif // BASE_STRING16_H_
@@ -0,0 +1,87 @@
1
+ #include "ruby.h"
2
+ #include <string>
3
+
4
+ #include "url_canon_stdstring.h"
5
+ #include "url_util.h"
6
+
7
+ #define ATTR_PORT "port"
8
+ #define ATTR_SCHEME "scheme"
9
+ #define ATTR_HOST "host"
10
+ #define ATTR_PATH "path"
11
+ #define ATTR_QUERY "query"
12
+ #define ATTR_VALID "valid"
13
+ #define ATTR_URI "uri"
14
+
15
+ extern "C" {
16
+ // Defining a space for information and references about the module to be stored internally
17
+ static VALUE uri_parser = Qnil;
18
+
19
+ typedef VALUE (ruby_method_vararg)(...);
20
+
21
+ // Prototype for the initialization method - Ruby calls this, not you
22
+ void Init_uri_parser();
23
+
24
+ bool canonicalize(const std::string& input_spec,
25
+ std::string* canonical,
26
+ url_parse::Parsed* parsed)
27
+ {
28
+ // Reserve enough room in the output for the input, plus some extra so that
29
+ // we have room if we have to escape a few things without reallocating.
30
+ canonical->reserve(input_spec.size() + 32);
31
+ url_canon::StdStringCanonOutput output(canonical);
32
+ bool success = url_util::Canonicalize(
33
+ input_spec.data(), static_cast<int>(input_spec.length()),
34
+ NULL, &output, parsed);
35
+ output.Complete(); // Must be done before using string.
36
+ return success;
37
+ }
38
+
39
+
40
+ // Returns the substring of the input identified by the given component.
41
+ VALUE component_rb_str(std::string& url, const url_parse::Component& comp)
42
+ {
43
+ if (comp.len <= 0)
44
+ return rb_str_new2("");
45
+ else
46
+ return rb_str_new2(std::string(url, comp.begin, comp.len).c_str());
47
+ }
48
+
49
+ VALUE uri_parser_valid(VALUE self)
50
+ {
51
+ return rb_iv_get(self, "@"ATTR_VALID);
52
+ }
53
+
54
+ VALUE uri_parser_initialize(VALUE self, VALUE in)
55
+ {
56
+ std::string url(rb_string_value_ptr(&in));
57
+ std::string canonical;
58
+ url_parse::Parsed parsed;
59
+
60
+ bool valid = canonicalize(url, &canonical, &parsed);
61
+
62
+ rb_iv_set(self, "@"ATTR_PORT, component_rb_str(canonical, parsed.port));
63
+ rb_iv_set(self, "@"ATTR_HOST, component_rb_str(canonical, parsed.host));
64
+ rb_iv_set(self, "@"ATTR_PATH, component_rb_str(canonical, parsed.path));
65
+ rb_iv_set(self, "@"ATTR_QUERY, component_rb_str(canonical, parsed.query));
66
+ rb_iv_set(self, "@"ATTR_SCHEME, component_rb_str(canonical, parsed.scheme));
67
+ rb_iv_set(self, "@"ATTR_URI, rb_str_new2(canonical.c_str()));
68
+ rb_iv_set(self, "@"ATTR_VALID, valid ? Qtrue : Qfalse);
69
+
70
+ return Qtrue;
71
+ }
72
+
73
+ // The initialization method for this module
74
+ void Init_uri_parser() {
75
+ uri_parser= rb_define_class("URIParser", rb_cObject);
76
+ rb_define_method(uri_parser, "initialize", (ruby_method_vararg*)uri_parser_initialize, 1);
77
+ rb_define_attr(uri_parser, ATTR_PORT, 1, 0);
78
+ rb_define_attr(uri_parser, ATTR_HOST, 1, 0);
79
+ rb_define_attr(uri_parser, ATTR_PATH, 1, 0);
80
+ rb_define_attr(uri_parser, ATTR_QUERY, 1, 0);
81
+ rb_define_attr(uri_parser, ATTR_SCHEME, 1, 0);
82
+ rb_define_attr(uri_parser, ATTR_URI, 1, 0);
83
+ rb_define_attr(uri_parser, ATTR_VALID, 1, 0);
84
+ rb_define_method(uri_parser, ATTR_VALID"?", (ruby_method_vararg*)uri_parser_valid, 0);
85
+ }
86
+
87
+ }
@@ -0,0 +1,872 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ #ifndef GOOGLEURL_SRC_URL_CANON_H__
30
+ #define GOOGLEURL_SRC_URL_CANON_H__
31
+
32
+ #include <memory.h>
33
+ #include <stdlib.h>
34
+
35
+ #include "string16.h"
36
+ #include "url_common.h"
37
+ #include "url_parse.h"
38
+
39
+ namespace url_canon {
40
+
41
+ // Canonicalizer output -------------------------------------------------------
42
+
43
// Base class for the canonicalizer output. It tracks a buffer that the
// derived class provides and supports simple append and resize operations
// on it.
//
// It is VERY IMPORTANT that no virtual function calls be made on the common
// code path. The only virtual functions are the destructor and Resize(), and
// Resize() is called only when the existing buffer is not big enough. The
// derived class is in charge of setting up the buffer managed here.
template<typename T>
class CanonOutputT {
 public:
  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
  }
  virtual ~CanonOutputT() {
  }

  // Implemented by the derived class to resize the buffer. It must update
  // the buffer pointer and capacity to describe the new buffer, and any old
  // data up to |cur_len_| must be copied over.
  //
  // The new size |sz| must be larger than buffer_len_.
  virtual void Resize(int sz) = 0;

  // Returns the character at the given position. The offset must be in the
  // valid range. The result has the buffer's own character type, so wide
  // buffers are not truncated to a narrow char.
  inline T at(int offset) const {
    return buffer_[offset];
  }

  // Sets the character at the given position. The given position MUST be
  // less than the length().
  inline void set(int offset, int ch) {
    buffer_[offset] = ch;
  }

  // Returns the number of characters currently in the buffer.
  inline int length() const {
    return cur_len_;
  }

  // Returns the current capacity of the buffer. The length() is the number of
  // characters that have been declared to be written, but the capacity() is
  // the number that can be written without reallocation. If the caller must
  // write many characters at once, it can make sure there is enough capacity,
  // write the data, then use set_length() to declare the new length().
  int capacity() const {
    return buffer_len_;
  }

  // Called by the user of this class to get the output. The output will NOT
  // be NULL-terminated. Call length() to get the length.
  const T* data() const {
    return buffer_;
  }
  T* data() {
    return buffer_;
  }

  // Shortens the URL to the new length. Used for "backing up" when processing
  // relative paths. This can also be used if an external function writes a
  // lot of data to the buffer beyond the end, to declare the new length.
  //
  // This MUST NOT be used to expand the size of the buffer beyond capacity().
  void set_length(int new_len) {
    cur_len_ = new_len;
  }

  // Appends one character. This is the most performance critical function,
  // since it is called for every character.
  void push_back(T ch) {
    // In VC2005, putting this common case first speeds up execution
    // dramatically because this branch is predicted as taken.
    if (cur_len_ < buffer_len_) {
      buffer_[cur_len_] = ch;
      cur_len_++;
      return;
    }

    // Grow the buffer to hold at least one more item. Hopefully we won't have
    // to do this very often.
    if (!Grow(1))
      return;

    // Actually do the insertion.
    buffer_[cur_len_] = ch;
    cur_len_++;
  }

  // Appends the |str_len| characters starting at |str| to the output.
  // Nothing is appended when |str_len| is not positive or when the buffer
  // cannot be grown far enough.
  void Append(const T* str, int str_len) {
    // A negative count must not shrink the recorded length.
    if (str_len <= 0)
      return;

    // Compare against the free space instead of forming cur_len_ + str_len,
    // which can overflow int for very large inputs.
    const int available = buffer_len_ - cur_len_;
    if (str_len > available) {
      if (!Grow(str_len - available))
        return;
    }
    for (int i = 0; i < str_len; i++)
      buffer_[cur_len_ + i] = str[i];
    cur_len_ += str_len;
  }

 protected:
  // Grows the buffer so that it can fit at least |min_additional| more
  // characters than its current capacity. Returns true if Resize() was
  // called, false if the required size would be too large.
  bool Grow(int min_additional) {
    static const int kMinBufferLen = 16;
    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
    do {
      if (new_len >= (1 << 30))  // Prevent overflow when doubling below.
        return false;
      new_len *= 2;
      // new_len is at least buffer_len_ here, so this difference cannot
      // overflow, unlike the sum buffer_len_ + min_additional.
    } while (new_len - buffer_len_ < min_additional);
    Resize(new_len);
    return true;
  }

  // Start and capacity (in characters) of the buffer set up by the derived
  // class.
  T* buffer_;
  int buffer_len_;

  // Used characters in the buffer.
  int cur_len_;
};
165
+
166
+ // Simple implementation of the CanonOutput using new[]. This class
167
+ // also supports a static buffer so if it is allocated on the stack, most
168
+ // URLs can be canonicalized with no heap allocations.
169
+ template<typename T, int fixed_capacity = 1024>
170
+ class RawCanonOutputT : public CanonOutputT<T> {
171
+ public:
172
+ RawCanonOutputT() : CanonOutputT<T>() {
173
+ this->buffer_ = fixed_buffer_;
174
+ this->buffer_len_ = fixed_capacity;
175
+ }
176
+ virtual ~RawCanonOutputT() {
177
+ if (this->buffer_ != fixed_buffer_)
178
+ delete[] this->buffer_;
179
+ }
180
+
181
+ virtual void Resize(int sz) {
182
+ T* new_buf = new T[sz];
183
+ memcpy(new_buf, this->buffer_,
184
+ sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));
185
+ if (this->buffer_ != fixed_buffer_)
186
+ delete[] this->buffer_;
187
+ this->buffer_ = new_buf;
188
+ this->buffer_len_ = sz;
189
+ }
190
+
191
+ protected:
192
+ T fixed_buffer_[fixed_capacity];
193
+ };
194
+
195
+ // Normally, all canonicalization output is in narrow characters. We support
196
+ // the templates so it can also be used internally if a wide buffer is
197
+ // required.
198
+ typedef CanonOutputT<char> CanonOutput;
199
+ typedef CanonOutputT<char16> CanonOutputW;
200
+
201
+ template<int fixed_capacity>
202
+ class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
203
+ template<int fixed_capacity>
204
+ class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
205
+
206
+ // Character set converter ----------------------------------------------------
207
+ //
208
+ // Converts query strings into a custom encoding. The embedder can supply an
209
+ // implementation of this class to interface with their own character set
210
+ // conversion libraries.
211
+ //
212
+ // Embedders will want to see the unit test for the ICU version.
213
+
214
+ class CharsetConverter {
215
+ public:
216
+ CharsetConverter() {}
217
+ virtual ~CharsetConverter() {}
218
+
219
+ // Converts the given input string from UTF-16 to whatever output format the
220
+ // converter supports. This is used only for the query encoding conversion,
221
+ // which does not fail. Instead, the converter should insert "invalid
222
+ // character" characters in the output for invalid sequences, and do the
223
+ // best it can.
224
+ //
225
+ // If the input contains a character not representable in the output
226
+ // character set, the converter should append the HTML entity sequence in
227
+ // decimal, (such as "&#20320;") with escaping of the ampersand, number
228
+ // sign, and semicolon (in the previous example it would be
229
+ // "%26%2320320%3B"). This rule is based on what IE does in this situation.
230
+ virtual void ConvertFromUTF16(const char16* input,
231
+ int input_len,
232
+ CanonOutput* output) = 0;
233
+ };
234
+
235
+ // Whitespace -----------------------------------------------------------------
236
+
237
+ // Searches for whitespace that should be removed from the middle of URLs, and
238
+ // removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
239
+ // are preserved, which is what most browsers do. A pointer to the output will
240
+ // be returned, and the length of that output will be in |output_len|.
241
+ //
242
+ // This should be called before parsing if whitespace removal is desired (which
243
+ // it normally is when you are canonicalizing).
244
+ //
245
+ // If no whitespace is removed, this function will not use the buffer and will
246
+ // return a pointer to the input, to avoid the extra copy. If modification is
247
+ // required, the given |buffer| will be used and the returned pointer will
248
+ // point to the beginning of the buffer.
249
+ //
250
+ // Therefore, callers should not use the buffer, since it may actually be empty,
251
+ // use the computed pointer and |*output_len| instead.
252
+ GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
253
+ CanonOutputT<char>* buffer,
254
+ int* output_len);
255
+ GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
256
+ CanonOutputT<char16>* buffer,
257
+ int* output_len);
258
+
259
+ // IDN ------------------------------------------------------------------------
260
+
261
+ // Converts the Unicode input representing a hostname to ASCII using IDN rules.
262
+ // The output must fall in the ASCII range, but will be encoded in UTF-16.
263
+ //
264
+ // On success, the output will be filled with the ASCII host name and it will
265
+ // return true. Unlike most other canonicalization functions, this assumes that
266
+ // the output is empty. The beginning of the host will be at offset 0, and
267
+ // the length of the output will be set to the length of the new host name.
268
+ //
269
+ // On error, returns false. The output in this case is undefined.
270
+ GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
271
+
272
+ // Piece-by-piece canonicalizers ----------------------------------------------
273
+ //
274
+ // These individual canonicalizers append the canonicalized versions of the
275
+ // corresponding URL component to the given std::string. The spec and the
276
+ // previously-identified range of that component are the input. The range of
277
+ // the canonicalized component will be written to the output component.
278
+ //
279
+ // These functions all append to the output so they can be chained. Make sure
280
+ // the output is empty when you start.
281
+ //
282
+ // These functions return boolean values indicating success. On failure, they
283
+ // will attempt to write something reasonable to the output so that, if
284
+ // displayed to the user, they will recognise it as something that's messed up.
285
+ // Nothing more should ever be done with these invalid URLs, however.
286
+
287
+ // Scheme: Appends the scheme and colon to the URL. The output component will
288
+ // indicate the range of characters up to but not including the colon.
289
+ //
290
+ // Canonical URLs always have a scheme. If the scheme is not present in the
291
+ // input, this will just write the colon to indicate an empty scheme. Does not
292
+ // append slashes which will be needed before any authority components for most
293
+ // URLs.
294
+ //
295
+ // The 8-bit version requires UTF-8 encoding.
296
+ GURL_API bool CanonicalizeScheme(const char* spec,
297
+ const url_parse::Component& scheme,
298
+ CanonOutput* output,
299
+ url_parse::Component* out_scheme);
300
+ GURL_API bool CanonicalizeScheme(const char16* spec,
301
+ const url_parse::Component& scheme,
302
+ CanonOutput* output,
303
+ url_parse::Component* out_scheme);
304
+
305
+ // User info: username/password. If present, this will add the delimiters so
306
+ // the output will be "<username>:<password>@" or "<username>@". Empty
307
+ // username/password pairs, or empty passwords, will get converted to
308
+ // nonexistent in the canonical version.
309
+ //
310
+ // The components for the username and password refer to ranges in the
311
+ // respective source strings. Usually, these will be the same string, which
312
+ // is legal as long as the two components don't overlap.
313
+ //
314
+ // The 8-bit version requires UTF-8 encoding.
315
+ GURL_API bool CanonicalizeUserInfo(const char* username_source,
316
+ const url_parse::Component& username,
317
+ const char* password_source,
318
+ const url_parse::Component& password,
319
+ CanonOutput* output,
320
+ url_parse::Component* out_username,
321
+ url_parse::Component* out_password);
322
+ GURL_API bool CanonicalizeUserInfo(const char16* username_source,
323
+ const url_parse::Component& username,
324
+ const char16* password_source,
325
+ const url_parse::Component& password,
326
+ CanonOutput* output,
327
+ url_parse::Component* out_username,
328
+ url_parse::Component* out_password);
329
+
330
+
331
+ // This structure holds detailed state exported from the IP/Host canonicalizers.
332
+ // Additional fields may be added as callers require them.
333
+ struct CanonHostInfo {
334
+ CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
335
+
336
+ // Convenience function to test if family is an IP address.
337
+ bool IsIPAddress() const { return family == IPV4 || family == IPV6; }
338
+
339
+ // This field summarizes how the input was classified by the canonicalizer.
340
+ enum Family {
341
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
342
+ // canonicalizer is concerned, it should be treated as a
343
+ // hostname.
344
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
345
+ // IPv4 address where truncation occurred, or something
346
+ // containing the special characters :[] which did not parse
347
+ // as an IPv6 address. Never attempt to connect to this
348
+ // address, because it might actually succeed!
349
+ IPV4, // - Successfully canonicalized as an IPv4 address.
350
+ IPV6, // - Successfully canonicalized as an IPv6 address.
351
+ };
352
+ Family family;
353
+
354
+ // If |family| is IPV4, then this is the number of nonempty dot-separated
355
+ // components in the input text, from 1 to 4. If |family| is not IPV4,
356
+ // this value is undefined.
357
+ int num_ipv4_components;
358
+
359
+ // Location of host within the canonicalized output.
360
+ // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
361
+ // CanonicalizeHostVerbose() always sets it.
362
+ url_parse::Component out_host;
363
+ };
364
+
365
+
366
+ // Host.
367
+ //
368
+ // The 8-bit version requires UTF-8 encoding. Use this version when you only
369
+ // need to know whether canonicalization succeeded.
370
+ GURL_API bool CanonicalizeHost(const char* spec,
371
+ const url_parse::Component& host,
372
+ CanonOutput* output,
373
+ url_parse::Component* out_host);
374
+ GURL_API bool CanonicalizeHost(const char16* spec,
375
+ const url_parse::Component& host,
376
+ CanonOutput* output,
377
+ url_parse::Component* out_host);
378
+
379
+ // Extended version of CanonicalizeHost, which returns additional information.
380
+ // Use this when you need to know whether the hostname was an IP address.
381
+ // A successful return is indicated by host_info->family != BROKEN. See the
382
+ // definition of CanonHostInfo above for details.
383
+ GURL_API void CanonicalizeHostVerbose(const char* spec,
384
+ const url_parse::Component& host,
385
+ CanonOutput* output,
386
+ CanonHostInfo* host_info);
387
+ GURL_API void CanonicalizeHostVerbose(const char16* spec,
388
+ const url_parse::Component& host,
389
+ CanonOutput* output,
390
+ CanonHostInfo* host_info);
391
+
392
+
393
+ // IP addresses.
394
+ //
395
+ // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
396
+ // an IP address, it will canonicalize it as such, appending it to |output|.
397
+ // Additional status information is returned via the |*host_info| parameter.
398
+ // See the definition of CanonHostInfo above for details.
399
+ //
400
+ // This is called AUTOMATICALLY from the host canonicalizer, which ensures that
401
+ // the input is unescaped and name-prepped, etc. It should not normally be
402
+ // necessary or wise to call this directly.
403
+ GURL_API void CanonicalizeIPAddress(const char* spec,
404
+ const url_parse::Component& host,
405
+ CanonOutput* output,
406
+ CanonHostInfo* host_info);
407
+ GURL_API void CanonicalizeIPAddress(const char16* spec,
408
+ const url_parse::Component& host,
409
+ CanonOutput* output,
410
+ CanonHostInfo* host_info);
411
+
412
+ // Port: this function will add the colon for the port if a port is present.
413
+ // The caller can pass url_parse::PORT_UNSPECIFIED as the
414
+ // default_port_for_scheme argument if there is no default port.
415
+ //
416
+ // The 8-bit version requires UTF-8 encoding.
417
+ GURL_API bool CanonicalizePort(const char* spec,
418
+ const url_parse::Component& port,
419
+ int default_port_for_scheme,
420
+ CanonOutput* output,
421
+ url_parse::Component* out_port);
422
+ GURL_API bool CanonicalizePort(const char16* spec,
423
+ const url_parse::Component& port,
424
+ int default_port_for_scheme,
425
+ CanonOutput* output,
426
+ url_parse::Component* out_port);
427
+
428
+ // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
429
+ // if the scheme is unknown.
430
+ GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
431
+
432
+ // Path. If the input does not begin in a slash (including if the input is
433
+ // empty), we'll prepend a slash to the path to make it canonical.
434
+ //
435
+ // The 8-bit version assumes UTF-8 encoding, but does not verify the validity
436
+ // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
437
+ // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
438
+ // an issue. Somebody giving us an 8-bit path is responsible for generating
439
+ // the path that the server expects (we'll escape high-bit characters), so
440
+ // if something is invalid, it's their problem.
441
+ GURL_API bool CanonicalizePath(const char* spec,
442
+ const url_parse::Component& path,
443
+ CanonOutput* output,
444
+ url_parse::Component* out_path);
445
+ GURL_API bool CanonicalizePath(const char16* spec,
446
+ const url_parse::Component& path,
447
+ CanonOutput* output,
448
+ url_parse::Component* out_path);
449
+
450
+ // Canonicalizes the input as a file path. This is like CanonicalizePath except
451
+ // that it also handles Windows drive specs. For example, the path can begin
452
+ // with "c|\" and it will get properly canonicalized to "C:/".
453
+ // The string will be appended to |*output| and |*out_path| will be updated.
454
+ //
455
+ // The 8-bit version requires UTF-8 encoding.
456
+ GURL_API bool FileCanonicalizePath(const char* spec,
457
+ const url_parse::Component& path,
458
+ CanonOutput* output,
459
+ url_parse::Component* out_path);
460
+ GURL_API bool FileCanonicalizePath(const char16* spec,
461
+ const url_parse::Component& path,
462
+ CanonOutput* output,
463
+ url_parse::Component* out_path);
464
+
465
+ // Query: Prepends the ? if needed.
466
+ //
467
+ // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
468
+ // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
469
+ // "invalid character." This function cannot fail; we always just try to do
470
+ // our best for crazy input here since web pages can set it themselves.
471
+ //
472
+ // This will convert the given input into the output encoding that the given
473
+ // character set converter object provides. The converter will only be called
474
+ // if necessary; for ASCII input, no conversions are necessary.
475
+ //
476
+ // The converter can be NULL. In this case, the output encoding will be UTF-8.
477
+ GURL_API void CanonicalizeQuery(const char* spec,
478
+ const url_parse::Component& query,
479
+ CharsetConverter* converter,
480
+ CanonOutput* output,
481
+ url_parse::Component* out_query);
482
+ GURL_API void CanonicalizeQuery(const char16* spec,
483
+ const url_parse::Component& query,
484
+ CharsetConverter* converter,
485
+ CanonOutput* output,
486
+ url_parse::Component* out_query);
487
+
488
+ // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
489
+ // canonicalizer that does not produce ASCII output). The output is
490
+ // guaranteed to be valid UTF-8.
491
+ //
492
+ // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
493
+ // the "Unicode replacement character" for the confusing bits and copy the rest.
494
+ GURL_API void CanonicalizeRef(const char* spec,
495
+ const url_parse::Component& path,
496
+ CanonOutput* output,
497
+ url_parse::Component* out_path);
498
+ GURL_API void CanonicalizeRef(const char16* spec,
499
+ const url_parse::Component& path,
500
+ CanonOutput* output,
501
+ url_parse::Component* out_path);
502
+
503
+ // Full canonicalizer ---------------------------------------------------------
504
+ //
505
+ // These functions replace any string contents, rather than append as above.
506
+ // See the above piece-by-piece functions for information specific to
507
+ // canonicalizing individual components.
508
+ //
509
+ // The output will be ASCII except the reference fragment, which may be UTF-8.
510
+ //
511
+ // The 8-bit versions require UTF-8 encoding.
512
+
513
+ // Use for standard URLs with authorities and paths.
514
+ GURL_API bool CanonicalizeStandardURL(const char* spec,
515
+ int spec_len,
516
+ const url_parse::Parsed& parsed,
517
+ CharsetConverter* query_converter,
518
+ CanonOutput* output,
519
+ url_parse::Parsed* new_parsed);
520
+ GURL_API bool CanonicalizeStandardURL(const char16* spec,
521
+ int spec_len,
522
+ const url_parse::Parsed& parsed,
523
+ CharsetConverter* query_converter,
524
+ CanonOutput* output,
525
+ url_parse::Parsed* new_parsed);
526
+
527
+ // Use for file URLs.
528
+ GURL_API bool CanonicalizeFileURL(const char* spec,
529
+ int spec_len,
530
+ const url_parse::Parsed& parsed,
531
+ CharsetConverter* query_converter,
532
+ CanonOutput* output,
533
+ url_parse::Parsed* new_parsed);
534
+ GURL_API bool CanonicalizeFileURL(const char16* spec,
535
+ int spec_len,
536
+ const url_parse::Parsed& parsed,
537
+ CharsetConverter* query_converter,
538
+ CanonOutput* output,
539
+ url_parse::Parsed* new_parsed);
540
+
541
+ // Use for path URLs such as javascript. This does not modify the path in any
542
+ // way, for example, by escaping it.
543
+ GURL_API bool CanonicalizePathURL(const char* spec,
544
+ int spec_len,
545
+ const url_parse::Parsed& parsed,
546
+ CanonOutput* output,
547
+ url_parse::Parsed* new_parsed);
548
+ GURL_API bool CanonicalizePathURL(const char16* spec,
549
+ int spec_len,
550
+ const url_parse::Parsed& parsed,
551
+ CanonOutput* output,
552
+ url_parse::Parsed* new_parsed);
553
+
554
+ // Use for mailto URLs. This "canonicalizes" the url into a path and query
555
+ // component. It does not attempt to merge "to" fields. It uses UTF-8 for
556
+ // the query encoding if there is a query. This is because a mailto URL is
557
+ // really intended for an external mail program, and the encoding of a page,
558
+ // etc. which would influence a query encoding normally are irrelevant.
559
+ GURL_API bool CanonicalizeMailtoURL(const char* spec,
560
+ int spec_len,
561
+ const url_parse::Parsed& parsed,
562
+ CanonOutput* output,
563
+ url_parse::Parsed* new_parsed);
564
+ GURL_API bool CanonicalizeMailtoURL(const char16* spec,
565
+ int spec_len,
566
+ const url_parse::Parsed& parsed,
567
+ CanonOutput* output,
568
+ url_parse::Parsed* new_parsed);
569
+
570
+ // Part replacer --------------------------------------------------------------
571
+
572
// Internal structure that stores a separate source string for each URL
// component. The basic canonicalization functions use it internally so that
// component replacement (different strings for different components) runs on
// the same code path as regular canonicalization (one string shared by every
// component).
//
// A url_parse::Parsed structure usually accompanies this one. Its components
// identify offsets within these strings, so the pieces may all live in the
// same string or be spread arbitrarily across different ones.
//
// This structure does not own any data. The caller must ensure that whatever
// the pointers refer to stays in scope and is not modified.
template<typename CHAR>
struct URLComponentSource {
  // Used by callers that want to replace components: every pointer starts out
  // NULL, meaning "no replacement". The caller then overrides the components
  // it wants to replace.
  URLComponentSource()
      : scheme(NULL), username(NULL), password(NULL), host(NULL),
        port(NULL), path(NULL), query(NULL), ref(NULL) {}

  // Used internally to make every component point at the same spec.
  explicit URLComponentSource(const CHAR* default_value)
      : scheme(default_value), username(default_value),
        password(default_value), host(default_value),
        port(default_value), path(default_value),
        query(default_value), ref(default_value) {}

  // One non-owning source pointer per component.
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};
623
+
624
+ // This structure encapsulates information on modifying a URL. Each component
625
+ // may either be left unchanged, replaced, or deleted.
626
+ //
627
+ // By default, each component is unchanged. For those components that should be
628
+ // modified, call either Set* or Clear* to modify it.
629
+ //
630
+ // The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
631
+ // IN SCOPE BY THE CALLER for as long as this object exists!
632
+ //
633
+ // Prefer the 8-bit replacement version if possible since it is more efficient.
634
+ template<typename CHAR>
635
+ class Replacements {
636
+ public:
637
+ Replacements() {
638
+ }
639
+
640
+ // Scheme
641
+ void SetScheme(const CHAR* s, const url_parse::Component& comp) {
642
+ sources_.scheme = s;
643
+ components_.scheme = comp;
644
+ }
645
+ // Note: we don't have a ClearScheme since this doesn't make any sense.
646
+ bool IsSchemeOverridden() const { return sources_.scheme != NULL; }
647
+
648
+ // Username
649
+ void SetUsername(const CHAR* s, const url_parse::Component& comp) {
650
+ sources_.username = s;
651
+ components_.username = comp;
652
+ }
653
+ void ClearUsername() {
654
+ sources_.username = Placeholder();
655
+ components_.username = url_parse::Component();
656
+ }
657
+ bool IsUsernameOverridden() const { return sources_.username != NULL; }
658
+
659
+ // Password
660
+ void SetPassword(const CHAR* s, const url_parse::Component& comp) {
661
+ sources_.password = s;
662
+ components_.password = comp;
663
+ }
664
+ void ClearPassword() {
665
+ sources_.password = Placeholder();
666
+ components_.password = url_parse::Component();
667
+ }
668
+ bool IsPasswordOverridden() const { return sources_.password != NULL; }
669
+
670
+ // Host
671
+ void SetHost(const CHAR* s, const url_parse::Component& comp) {
672
+ sources_.host = s;
673
+ components_.host = comp;
674
+ }
675
+ void ClearHost() {
676
+ sources_.host = Placeholder();
677
+ components_.host = url_parse::Component();
678
+ }
679
+ bool IsHostOverridden() const { return sources_.host != NULL; }
680
+
681
+ // Port
682
+ void SetPort(const CHAR* s, const url_parse::Component& comp) {
683
+ sources_.port = s;
684
+ components_.port = comp;
685
+ }
686
+ void ClearPort() {
687
+ sources_.port = Placeholder();
688
+ components_.port = url_parse::Component();
689
+ }
690
+ bool IsPortOverridden() const { return sources_.port != NULL; }
691
+
692
+ // Path
693
+ void SetPath(const CHAR* s, const url_parse::Component& comp) {
694
+ sources_.path = s;
695
+ components_.path = comp;
696
+ }
697
+ void ClearPath() {
698
+ sources_.path = Placeholder();
699
+ components_.path = url_parse::Component();
700
+ }
701
+ bool IsPathOverridden() const { return sources_.path != NULL; }
702
+
703
+ // Query
704
+ void SetQuery(const CHAR* s, const url_parse::Component& comp) {
705
+ sources_.query = s;
706
+ components_.query = comp;
707
+ }
708
+ void ClearQuery() {
709
+ sources_.query = Placeholder();
710
+ components_.query = url_parse::Component();
711
+ }
712
+ bool IsQueryOverridden() const { return sources_.query != NULL; }
713
+
714
+ // Ref
715
+ void SetRef(const CHAR* s, const url_parse::Component& comp) {
716
+ sources_.ref = s;
717
+ components_.ref = comp;
718
+ }
719
+ void ClearRef() {
720
+ sources_.ref = Placeholder();
721
+ components_.ref = url_parse::Component();
722
+ }
723
+ bool IsRefOverridden() const { return sources_.ref != NULL; }
724
+
725
+ // Getters for the itnernal data. See the variables below for how the
726
+ // information is encoded.
727
+ const URLComponentSource<CHAR>& sources() const { return sources_; }
728
+ const url_parse::Parsed& components() const { return components_; }
729
+
730
+ private:
731
+ // Returns a pointer to a static empty string that is used as a placeholder
732
+ // to indicate a component should be deleted (see below).
733
+ const CHAR* Placeholder() {
734
+ static const CHAR empty_string = 0;
735
+ return &empty_string;
736
+ }
737
+
738
+ // We support three states:
739
+ //
740
+ // Action | Source Component
741
+ // -----------------------+--------------------------------------------------
742
+ // Don't change component | NULL (unused)
743
+ // Replace component | (replacement string) (replacement component)
744
+ // Delete component | (non-NULL) (invalid component: (0,-1))
745
+ //
746
+ // We use a pointer to the empty string for the source when the component
747
+ // should be deleted.
748
+ URLComponentSource<CHAR> sources_;
749
+ url_parse::Parsed components_;
750
+ };
751
+
752
+ // The base must be an 8-bit canonical URL.
753
+ GURL_API bool ReplaceStandardURL(const char* base,
754
+ const url_parse::Parsed& base_parsed,
755
+ const Replacements<char>& replacements,
756
+ CharsetConverter* query_converter,
757
+ CanonOutput* output,
758
+ url_parse::Parsed* new_parsed);
759
+ GURL_API bool ReplaceStandardURL(const char* base,
760
+ const url_parse::Parsed& base_parsed,
761
+ const Replacements<char16>& replacements,
762
+ CharsetConverter* query_converter,
763
+ CanonOutput* output,
764
+ url_parse::Parsed* new_parsed);
765
+
766
+ // Replacing some parts of a file URL is not permitted. Everything except
767
+ // the host, path, query, and ref will be ignored.
768
+ GURL_API bool ReplaceFileURL(const char* base,
769
+ const url_parse::Parsed& base_parsed,
770
+ const Replacements<char>& replacements,
771
+ CharsetConverter* query_converter,
772
+ CanonOutput* output,
773
+ url_parse::Parsed* new_parsed);
774
+ GURL_API bool ReplaceFileURL(const char* base,
775
+ const url_parse::Parsed& base_parsed,
776
+ const Replacements<char16>& replacements,
777
+ CharsetConverter* query_converter,
778
+ CanonOutput* output,
779
+ url_parse::Parsed* new_parsed);
780
+
781
+ // Path URLs can only have the scheme and path replaced. All other components
782
+ // will be ignored.
783
+ GURL_API bool ReplacePathURL(const char* base,
784
+ const url_parse::Parsed& base_parsed,
785
+ const Replacements<char>& replacements,
786
+ CanonOutput* output,
787
+ url_parse::Parsed* new_parsed);
788
+ GURL_API bool ReplacePathURL(const char* base,
789
+ const url_parse::Parsed& base_parsed,
790
+ const Replacements<char16>& replacements,
791
+ CanonOutput* output,
792
+ url_parse::Parsed* new_parsed);
793
+
794
+ // Mailto URLs can only have the scheme, path, and query replaced.
795
+ // All other components will be ignored.
796
+ GURL_API bool ReplaceMailtoURL(const char* base,
797
+ const url_parse::Parsed& base_parsed,
798
+ const Replacements<char>& replacements,
799
+ CanonOutput* output,
800
+ url_parse::Parsed* new_parsed);
801
+ GURL_API bool ReplaceMailtoURL(const char* base,
802
+ const url_parse::Parsed& base_parsed,
803
+ const Replacements<char16>& replacements,
804
+ CanonOutput* output,
805
+ url_parse::Parsed* new_parsed);
806
+
807
+ // Relative URL ---------------------------------------------------------------
808
+
809
+ // Given an input URL or URL fragment |fragment|, determines if it is a
810
+ // relative or absolute URL and places the result into |*is_relative|. If it is
811
+ // relative, the relevant portion of the URL will be placed into
812
+ // |*relative_component| (there may have been trimmed whitespace, for example).
813
+ // This value is passed to ResolveRelativeURL. If the input is not relative,
814
+ // this value is UNDEFINED (it may be changed by the function).
815
+ //
816
+ // Returns true on success (we successfully determined the URL is relative or
817
+ // not). Failure means that the combination of URLs doesn't make any sense.
818
+ //
819
+ // The base URL should always be canonical, therefore is ASCII.
820
+ GURL_API bool IsRelativeURL(const char* base,
821
+ const url_parse::Parsed& base_parsed,
822
+ const char* fragment,
823
+ int fragment_len,
824
+ bool is_base_hierarchical,
825
+ bool* is_relative,
826
+ url_parse::Component* relative_component);
827
+ GURL_API bool IsRelativeURL(const char* base,
828
+ const url_parse::Parsed& base_parsed,
829
+ const char16* fragment,
830
+ int fragment_len,
831
+ bool is_base_hierarchical,
832
+ bool* is_relative,
833
+ url_parse::Component* relative_component);
834
+
835
+ // Given a canonical parsed source URL, a URL fragment known to be relative,
836
+ // and the identified relevant portion of the relative URL (computed by
837
+ // IsRelativeURL), this produces a new parsed canonical URL in |output| and
838
+ // |out_parsed|.
839
+ //
840
+ // It also requires a flag indicating whether the base URL is a file: URL
841
+ // which triggers additional logic.
842
+ //
843
+ // The base URL should be canonical and have a host (may be empty for file
844
+ // URLs) and a path. If it doesn't have these, we can't resolve relative
845
+ // URLs off of it and will return the base as the output with an error flag.
846
+ // Because it is canonical, it should also be ASCII.
847
+ //
848
+ // The query charset converter follows the same rules as CanonicalizeQuery.
849
+ //
850
+ // Returns true on success. On failure, the output will be "something
851
+ // reasonable" that will be consistent and valid, just probably not what
852
+ // was intended by the web page author or caller.
853
+ GURL_API bool ResolveRelativeURL(const char* base_url,
854
+ const url_parse::Parsed& base_parsed,
855
+ bool base_is_file,
856
+ const char* relative_url,
857
+ const url_parse::Component& relative_component,
858
+ CharsetConverter* query_converter,
859
+ CanonOutput* output,
860
+ url_parse::Parsed* out_parsed);
861
+ GURL_API bool ResolveRelativeURL(const char* base_url,
862
+ const url_parse::Parsed& base_parsed,
863
+ bool base_is_file,
864
+ const char16* relative_url,
865
+ const url_parse::Component& relative_component,
866
+ CharsetConverter* query_converter,
867
+ CanonOutput* output,
868
+ url_parse::Parsed* out_parsed);
869
+
870
+ } // namespace url_canon
871
+
872
+ #endif // GOOGLEURL_SRC_URL_CANON_H__