uri_parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/.rvmrc +1 -0
- data/Gemfile +6 -0
- data/Rakefile +13 -0
- data/ext/uri_parser/basictypes.h +89 -0
- data/ext/uri_parser/extconf.h +6 -0
- data/ext/uri_parser/extconf.rb +50 -0
- data/ext/uri_parser/logging.h +5 -0
- data/ext/uri_parser/scoped_ptr.h +322 -0
- data/ext/uri_parser/string16.cc +95 -0
- data/ext/uri_parser/string16.h +194 -0
- data/ext/uri_parser/uri_parser.cc +87 -0
- data/ext/uri_parser/url_canon.h +872 -0
- data/ext/uri_parser/url_canon_etc.cc +392 -0
- data/ext/uri_parser/url_canon_fileurl.cc +215 -0
- data/ext/uri_parser/url_canon_host.cc +401 -0
- data/ext/uri_parser/url_canon_icu.cc +207 -0
- data/ext/uri_parser/url_canon_icu.h +63 -0
- data/ext/uri_parser/url_canon_internal.cc +427 -0
- data/ext/uri_parser/url_canon_internal.h +453 -0
- data/ext/uri_parser/url_canon_internal_file.h +157 -0
- data/ext/uri_parser/url_canon_ip.cc +737 -0
- data/ext/uri_parser/url_canon_ip.h +101 -0
- data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
- data/ext/uri_parser/url_canon_path.cc +380 -0
- data/ext/uri_parser/url_canon_pathurl.cc +128 -0
- data/ext/uri_parser/url_canon_query.cc +189 -0
- data/ext/uri_parser/url_canon_relative.cc +572 -0
- data/ext/uri_parser/url_canon_stdstring.h +134 -0
- data/ext/uri_parser/url_canon_stdurl.cc +211 -0
- data/ext/uri_parser/url_common.h +48 -0
- data/ext/uri_parser/url_file.h +108 -0
- data/ext/uri_parser/url_parse.cc +760 -0
- data/ext/uri_parser/url_parse.h +336 -0
- data/ext/uri_parser/url_parse_file.cc +243 -0
- data/ext/uri_parser/url_parse_internal.h +112 -0
- data/ext/uri_parser/url_util.cc +553 -0
- data/ext/uri_parser/url_util.h +222 -0
- data/lib/uri_parser.rb +28 -0
- data/lib/uri_parser/version.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/uri_parser_spec.rb +54 -0
- data/uri_parser.gemspec +26 -0
- metadata +117 -0
@@ -0,0 +1,95 @@
|
|
1
|
+
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are
|
5
|
+
// met:
|
6
|
+
//
|
7
|
+
// * Redistributions of source code must retain the above copyright
|
8
|
+
// notice, this list of conditions and the following disclaimer.
|
9
|
+
// * Redistributions in binary form must reproduce the above
|
10
|
+
// copyright notice, this list of conditions and the following disclaimer
|
11
|
+
// in the documentation and/or other materials provided with the
|
12
|
+
// distribution.
|
13
|
+
// * Neither the name of Google Inc. nor the names of its
|
14
|
+
// contributors may be used to endorse or promote products derived from
|
15
|
+
// this software without specific prior written permission.
|
16
|
+
//
|
17
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
18
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
19
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
20
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
21
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
22
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
23
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
24
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
25
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
26
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
27
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
#include "string16.h"
|
30
|
+
|
31
|
+
#include <string.h>
|
32
|
+
#ifdef WIN32
|
33
|
+
|
34
|
+
#error This file should not be used on 2-byte wchar_t systems
|
35
|
+
// If this winds up being needed on 2-byte wchar_t systems, either the
|
36
|
+
// definitions below can be used, or the host system's wide character
|
37
|
+
// functions like wmemcmp can be wrapped.
|
38
|
+
|
39
|
+
#else // !WIN32
|
40
|
+
|
41
|
+
namespace base {
|
42
|
+
|
43
|
+
int c16memcmp(const char16* s1, const char16* s2, size_t n) {
|
44
|
+
// We cannot call memcmp because that changes the semantics.
|
45
|
+
while (n-- > 0) {
|
46
|
+
if (*s1 != *s2) {
|
47
|
+
// We cannot use (*s1 - *s2) because char16 is unsigned.
|
48
|
+
return ((*s1 < *s2) ? -1 : 1);
|
49
|
+
}
|
50
|
+
++s1;
|
51
|
+
++s2;
|
52
|
+
}
|
53
|
+
return 0;
|
54
|
+
}
|
55
|
+
|
56
|
+
// Returns the number of char16 units in |s| that precede the first zero unit.
size_t c16len(const char16* s) {
  size_t count = 0;
  while (s[count] != 0)
    ++count;
  return count;
}
|
63
|
+
|
64
|
+
// Searches the first |n| char16 units of |s| for the value |c|.
// Returns a pointer to the first matching unit, or a null pointer when none
// of the |n| units matches.
const char16* c16memchr(const char16* s, char16 c, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    if (s[i] == c)
      return s + i;
  }
  return NULL;
}
|
73
|
+
|
74
|
+
// Moves |n| char16 units from |s2| to |s1| via memmove and returns the
// destination pointer that memmove reports.
char16* c16memmove(char16* s1, const char16* s2, size_t n) {
  const size_t byte_count = n * sizeof(char16);
  void* const destination = memmove(s1, s2, byte_count);
  return static_cast<char16*>(destination);
}
|
77
|
+
|
78
|
+
// Copies |n| char16 units from |s2| to |s1| via memcpy and returns the
// destination pointer that memcpy reports.
char16* c16memcpy(char16* s1, const char16* s2, size_t n) {
  const size_t byte_count = n * sizeof(char16);
  void* const destination = memcpy(s1, s2, byte_count);
  return static_cast<char16*>(destination);
}
|
81
|
+
|
82
|
+
// Stores the value |c| into each of the first |n| char16 units of |s| and
// returns |s|.
char16* c16memset(char16* s, char16 c, size_t n) {
  for (size_t i = 0; i < n; ++i)
    s[i] = c;
  return s;
}
|
90
|
+
|
91
|
+
} // namespace base
|
92
|
+
|
93
|
+
template class std::basic_string<char16, base::string16_char_traits>;
|
94
|
+
|
95
|
+
#endif // WIN32
|
@@ -0,0 +1,194 @@
|
|
1
|
+
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
|
2
|
+
//
|
3
|
+
// Redistribution and use in source and binary forms, with or without
|
4
|
+
// modification, are permitted provided that the following conditions are
|
5
|
+
// met:
|
6
|
+
//
|
7
|
+
// * Redistributions of source code must retain the above copyright
|
8
|
+
// notice, this list of conditions and the following disclaimer.
|
9
|
+
// * Redistributions in binary form must reproduce the above
|
10
|
+
// copyright notice, this list of conditions and the following disclaimer
|
11
|
+
// in the documentation and/or other materials provided with the
|
12
|
+
// distribution.
|
13
|
+
// * Neither the name of Google Inc. nor the names of its
|
14
|
+
// contributors may be used to endorse or promote products derived from
|
15
|
+
// this software without specific prior written permission.
|
16
|
+
//
|
17
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
18
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
19
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
20
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
21
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
22
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
23
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
24
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
25
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
26
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
27
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
28
|
+
|
29
|
+
#ifndef BASE_STRING16_H_
|
30
|
+
#define BASE_STRING16_H_
|
31
|
+
|
32
|
+
// WHAT:
|
33
|
+
// A version of std::basic_string that provides 2-byte characters even when
|
34
|
+
// wchar_t is not implemented as a 2-byte type. You can access this class as
|
35
|
+
// string16. We also define char16, which string16 is based upon.
|
36
|
+
//
|
37
|
+
// WHY:
|
38
|
+
// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2
|
39
|
+
// data. Plenty of existing code operates on strings encoded as UTF-16.
|
40
|
+
//
|
41
|
+
// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make
|
42
|
+
// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails
|
43
|
+
// at run time, because it calls some functions (like wcslen) that come from
|
44
|
+
// the system's native C library -- which was built with a 4-byte wchar_t!
|
45
|
+
// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's
|
46
|
+
// entirely improper on those systems where the encoding of wchar_t is defined
|
47
|
+
// as UTF-32.
|
48
|
+
//
|
49
|
+
// Here, we define string16, which is similar to std::wstring but replaces all
|
50
|
+
// libc functions with custom, 2-byte-char compatible routines. It is capable
|
51
|
+
// of carrying UTF-16-encoded data.
|
52
|
+
|
53
|
+
#include <string>
|
54
|
+
#include <cstdio>
|
55
|
+
|
56
|
+
|
57
|
+
#include "basictypes.h"
|
58
|
+
|
59
|
+
#ifdef WIN32
|
60
|
+
|
61
|
+
typedef wchar_t char16;
|
62
|
+
typedef std::wstring string16;
|
63
|
+
|
64
|
+
#else // !WIN32
|
65
|
+
|
66
|
+
typedef uint16 char16;
|
67
|
+
|
68
|
+
namespace base {
|
69
|
+
|
70
|
+
// char16 versions of the functions required by string16_char_traits; these
|
71
|
+
// are based on the wide character functions of similar names ("w" or "wcs"
|
72
|
+
// instead of "c16").
|
73
|
+
int c16memcmp(const char16* s1, const char16* s2, size_t n);
|
74
|
+
size_t c16len(const char16* s);
|
75
|
+
const char16* c16memchr(const char16* s, char16 c, size_t n);
|
76
|
+
char16* c16memmove(char16* s1, const char16* s2, size_t n);
|
77
|
+
char16* c16memcpy(char16* s1, const char16* s2, size_t n);
|
78
|
+
char16* c16memset(char16* s, char16 c, size_t n);
|
79
|
+
|
80
|
+
struct string16_char_traits {
|
81
|
+
typedef char16 char_type;
|
82
|
+
typedef int int_type;
|
83
|
+
|
84
|
+
typedef std::streamoff off_type;
|
85
|
+
typedef mbstate_t state_type;
|
86
|
+
typedef std::fpos<state_type> pos_type;
|
87
|
+
|
88
|
+
static void assign(char_type& c1, const char_type& c2) {
|
89
|
+
c1 = c2;
|
90
|
+
}
|
91
|
+
|
92
|
+
static bool eq(const char_type& c1, const char_type& c2) {
|
93
|
+
return c1 == c2;
|
94
|
+
}
|
95
|
+
static bool lt(const char_type& c1, const char_type& c2) {
|
96
|
+
return c1 < c2;
|
97
|
+
}
|
98
|
+
|
99
|
+
static int compare(const char_type* s1, const char_type* s2, size_t n) {
|
100
|
+
return c16memcmp(s1, s2, n);
|
101
|
+
}
|
102
|
+
|
103
|
+
static size_t length(const char_type* s) {
|
104
|
+
return c16len(s);
|
105
|
+
}
|
106
|
+
|
107
|
+
static const char_type* find(const char_type* s, size_t n,
|
108
|
+
const char_type& a) {
|
109
|
+
return c16memchr(s, a, n);
|
110
|
+
}
|
111
|
+
|
112
|
+
static char_type* move(char_type* s1, const char_type* s2, int_type n) {
|
113
|
+
return c16memmove(s1, s2, n);
|
114
|
+
}
|
115
|
+
|
116
|
+
static char_type* copy(char_type* s1, const char_type* s2, size_t n) {
|
117
|
+
return c16memcpy(s1, s2, n);
|
118
|
+
}
|
119
|
+
|
120
|
+
static char_type* assign(char_type* s, size_t n, char_type a) {
|
121
|
+
return c16memset(s, a, n);
|
122
|
+
}
|
123
|
+
|
124
|
+
static int_type not_eof(const int_type& c) {
|
125
|
+
return eq_int_type(c, eof()) ? 0 : c;
|
126
|
+
}
|
127
|
+
|
128
|
+
static char_type to_char_type(const int_type& c) {
|
129
|
+
return char_type(c);
|
130
|
+
}
|
131
|
+
|
132
|
+
static int_type to_int_type(const char_type& c) {
|
133
|
+
return int_type(c);
|
134
|
+
}
|
135
|
+
|
136
|
+
static bool eq_int_type(const int_type& c1, const int_type& c2) {
|
137
|
+
return c1 == c2;
|
138
|
+
}
|
139
|
+
|
140
|
+
static int_type eof() {
|
141
|
+
return static_cast<int_type>(EOF);
|
142
|
+
}
|
143
|
+
};
|
144
|
+
|
145
|
+
} // namespace base
|
146
|
+
|
147
|
+
// The string class will be explicitly instantiated only once, in string16.cc.
|
148
|
+
//
|
149
|
+
// std::basic_string<> in GNU libstdc++ contains a static data member,
|
150
|
+
// _S_empty_rep_storage, to represent empty strings. When an operation such
|
151
|
+
// as assignment or destruction is performed on a string, causing its existing
|
152
|
+
// data member to be invalidated, it must not be freed if this static data
|
153
|
+
// member is being used. Otherwise, it counts as an attempt to free static
|
154
|
+
// (and not allocated) data, which is a memory error.
|
155
|
+
//
|
156
|
+
// Generally, due to C++ template magic, _S_empty_rep_storage will be marked
|
157
|
+
// as a coalesced symbol, meaning that the linker will combine multiple
|
158
|
+
// instances into a single one when generating output.
|
159
|
+
//
|
160
|
+
// If a string class is used by multiple shared libraries, a problem occurs.
|
161
|
+
// Each library will get its own copy of _S_empty_rep_storage. When strings
|
162
|
+
// are passed across a library boundary for alteration or destruction, memory
|
163
|
+
// errors will result. GNU libstdc++ contains a configuration option,
|
164
|
+
// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which
|
165
|
+
// disables the static data member optimization, but it's a good optimization
|
166
|
+
// and non-STL code is generally at the mercy of the system's STL
|
167
|
+
// configuration. Fully-dynamic strings are not the default for GNU libstdc++
|
168
|
+
// itself or for the libstdc++ installations on the systems we care
|
169
|
+
// about, such as Mac OS X and relevant flavors of Linux.
|
170
|
+
//
|
171
|
+
// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 .
|
172
|
+
//
|
173
|
+
// To avoid problems, string classes need to be explicitly instantiated only
|
174
|
+
// once, in exactly one library. All other string users see it via an "extern"
|
175
|
+
// declaration. This is precisely how GNU libstdc++ handles
|
176
|
+
// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring).
|
177
|
+
//
|
178
|
+
// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2),
|
179
|
+
// in which the linker does not fully coalesce symbols when dead code
|
180
|
+
// stripping is enabled. This bug causes the memory errors described above
|
181
|
+
// to occur even when a std::basic_string<> does not cross shared library
|
182
|
+
// boundaries, such as in statically-linked executables.
|
183
|
+
//
|
184
|
+
// TODO(mark): File this bug with Apple and update this note with a bug number.
|
185
|
+
|
186
|
+
extern template class std::basic_string<char16, base::string16_char_traits>;
|
187
|
+
|
188
|
+
typedef std::basic_string<char16, base::string16_char_traits> string16;
|
189
|
+
|
190
|
+
extern std::ostream& operator<<(std::ostream& out, const string16& str);
|
191
|
+
|
192
|
+
#endif // !WIN32
|
193
|
+
|
194
|
+
#endif // BASE_STRING16_H_
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include <string>
|
3
|
+
|
4
|
+
#include "url_canon_stdstring.h"
|
5
|
+
#include "url_util.h"
|
6
|
+
|
7
|
+
#define ATTR_PORT "port"
|
8
|
+
#define ATTR_SCHEME "scheme"
|
9
|
+
#define ATTR_HOST "host"
|
10
|
+
#define ATTR_PATH "path"
|
11
|
+
#define ATTR_QUERY "query"
|
12
|
+
#define ATTR_VALID "valid"
|
13
|
+
#define ATTR_URI "uri"
|
14
|
+
|
15
|
+
extern "C" {
|
16
|
+
// Defining a space for information and references about the module to be stored internally
|
17
|
+
static VALUE uri_parser = Qnil;
|
18
|
+
|
19
|
+
typedef VALUE (ruby_method_vararg)(...);
|
20
|
+
|
21
|
+
// Prototype for the initialization method - Ruby calls this, not you
|
22
|
+
void Init_uri_parser();
|
23
|
+
|
24
|
+
bool canonicalize(const std::string& input_spec,
|
25
|
+
std::string* canonical,
|
26
|
+
url_parse::Parsed* parsed)
|
27
|
+
{
|
28
|
+
// Reserve enough room in the output for the input, plus some extra so that
|
29
|
+
// we have room if we have to escape a few things without reallocating.
|
30
|
+
canonical->reserve(input_spec.size() + 32);
|
31
|
+
url_canon::StdStringCanonOutput output(canonical);
|
32
|
+
bool success = url_util::Canonicalize(
|
33
|
+
input_spec.data(), static_cast<int>(input_spec.length()),
|
34
|
+
NULL, &output, parsed);
|
35
|
+
output.Complete(); // Must be done before using string.
|
36
|
+
return success;
|
37
|
+
}
|
38
|
+
|
39
|
+
|
40
|
+
// Returns the substring of the input identified by the given component.
|
41
|
+
VALUE component_rb_str(std::string& url, const url_parse::Component& comp)
|
42
|
+
{
|
43
|
+
if (comp.len <= 0)
|
44
|
+
return rb_str_new2("");
|
45
|
+
else
|
46
|
+
return rb_str_new2(std::string(url, comp.begin, comp.len).c_str());
|
47
|
+
}
|
48
|
+
|
49
|
+
// Implementation of the Ruby method registered as ATTR_VALID "?" in
// Init_uri_parser: returns the receiver's "@valid" instance variable.
VALUE uri_parser_valid(VALUE self)
{
  // The space between "@" and the macro is required. C++11 and later lex
  // "@"ATTR_VALID as a string literal with a user-defined suffix, which
  // compilers reject or warn about.
  return rb_iv_get(self, "@" ATTR_VALID);
}
|
53
|
+
|
54
|
+
// Implementation of URIParser#initialize. Canonicalizes the string |in| and
// stores the results on |self| as the instance variables @port, @host, @path,
// @query, @scheme, @uri (the whole canonical string) and @valid (true/false).
// Always returns Qtrue.
VALUE uri_parser_initialize(VALUE self, VALUE in)
{
  // Coerce |in| to a Ruby String, then copy it using its real length. Going
  // through a NUL-terminated C string would silently drop everything after an
  // embedded NUL byte.
  StringValue(in);
  std::string url(RSTRING_PTR(in), static_cast<size_t>(RSTRING_LEN(in)));
  std::string canonical;
  url_parse::Parsed parsed;

  const bool valid = canonicalize(url, &canonical, &parsed);

  // Note the space in "@" ATTR_*: without it, C++11 and later treat the macro
  // name as a user-defined literal suffix.
  rb_iv_set(self, "@" ATTR_PORT,   component_rb_str(canonical, parsed.port));
  rb_iv_set(self, "@" ATTR_HOST,   component_rb_str(canonical, parsed.host));
  rb_iv_set(self, "@" ATTR_PATH,   component_rb_str(canonical, parsed.path));
  rb_iv_set(self, "@" ATTR_QUERY,  component_rb_str(canonical, parsed.query));
  rb_iv_set(self, "@" ATTR_SCHEME, component_rb_str(canonical, parsed.scheme));
  rb_iv_set(self, "@" ATTR_URI,
            rb_str_new(canonical.data(), static_cast<long>(canonical.size())));
  rb_iv_set(self, "@" ATTR_VALID,  valid ? Qtrue : Qfalse);

  return Qtrue;
}
|
72
|
+
|
73
|
+
// The initialization method for this module
|
74
|
+
void Init_uri_parser() {
|
75
|
+
uri_parser= rb_define_class("URIParser", rb_cObject);
|
76
|
+
rb_define_method(uri_parser, "initialize", (ruby_method_vararg*)uri_parser_initialize, 1);
|
77
|
+
rb_define_attr(uri_parser, ATTR_PORT, 1, 0);
|
78
|
+
rb_define_attr(uri_parser, ATTR_HOST, 1, 0);
|
79
|
+
rb_define_attr(uri_parser, ATTR_PATH, 1, 0);
|
80
|
+
rb_define_attr(uri_parser, ATTR_QUERY, 1, 0);
|
81
|
+
rb_define_attr(uri_parser, ATTR_SCHEME, 1, 0);
|
82
|
+
rb_define_attr(uri_parser, ATTR_URI, 1, 0);
|
83
|
+
rb_define_attr(uri_parser, ATTR_VALID, 1, 0);
|
84
|
+
rb_define_method(uri_parser, ATTR_VALID"?", (ruby_method_vararg*)uri_parser_valid, 0);
|
85
|
+
}
|
86
|
+
|
87
|
+
}
|
@@ -0,0 +1,872 @@
|
|
1
|
+
// Copyright 2007, Google Inc.
|
2
|
+
// All rights reserved.
|
3
|
+
//
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
5
|
+
// modification, are permitted provided that the following conditions are
|
6
|
+
// met:
|
7
|
+
//
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
12
|
+
// in the documentation and/or other materials provided with the
|
13
|
+
// distribution.
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
16
|
+
// this software without specific prior written permission.
|
17
|
+
//
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
#ifndef GOOGLEURL_SRC_URL_CANON_H__
|
30
|
+
#define GOOGLEURL_SRC_URL_CANON_H__
|
31
|
+
|
32
|
+
#include <memory.h>
|
33
|
+
#include <stdlib.h>
|
34
|
+
|
35
|
+
#include "string16.h"
|
36
|
+
#include "url_common.h"
|
37
|
+
#include "url_parse.h"
|
38
|
+
|
39
|
+
namespace url_canon {
|
40
|
+
|
41
|
+
// Canonicalizer output -------------------------------------------------------
|
42
|
+
|
43
|
+
// Base class for the canonicalizer output. It maintains a buffer of T and
// supports simple resizing and append operations on it.
//
// It is VERY IMPORTANT that no virtual function calls be made on the common
// code path. There are only two virtual functions: the destructor and
// Resize(), which is called when the existing buffer is not big enough.
// The derived class is in charge of providing the buffer that this class
// then manages.
template<typename T>
class CanonOutputT {
 public:
  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
  }
  virtual ~CanonOutputT() {
  }

  // Implemented by the derived class to resize the buffer. It must point
  // |buffer_| at the new storage, update |buffer_len_|, and carry over any
  // existing data up to |cur_len_|.
  //
  // The new size |sz| must be larger than buffer_len_.
  virtual void Resize(int sz) = 0;

  // Returns the character at |offset|, which must be in the valid range.
  // The return type is T so that wide (char16) output is not truncated to a
  // single byte.
  inline T at(int offset) const {
    return buffer_[offset];
  }

  // Overwrites the character at |offset|, which MUST be less than length().
  inline void set(int offset, T ch) {
    buffer_[offset] = ch;
  }

  // Returns the number of characters currently in the buffer.
  inline int length() const {
    return cur_len_;
  }

  // Returns the current capacity of the buffer. length() is the number of
  // characters that have been declared written; capacity() is the number that
  // can be written without reallocation. A caller that must write many
  // characters at once can make sure there is enough capacity, write the
  // data, then call set_length() to declare the new length().
  int capacity() const {
    return buffer_len_;
  }

  // Gives access to the output. The output is NOT NULL-terminated; call
  // length() to get the number of valid characters.
  const T* data() const {
    return buffer_;
  }
  T* data() {
    return buffer_;
  }

  // Declares a new length. Used for "backing up" when processing relative
  // paths, and after an external function has written data directly into the
  // buffer beyond the current end.
  //
  // This MUST NOT be used to expand the length beyond capacity().
  void set_length(int new_len) {
    cur_len_ = new_len;
  }

  // Appends one character. This is the most performance critical function,
  // since it is called for every character.
  void push_back(T ch) {
    // In VC2005, putting this common case first speeds up execution
    // dramatically because this branch is predicted as taken.
    if (cur_len_ < buffer_len_) {
      buffer_[cur_len_] = ch;
      cur_len_++;
      return;
    }

    // Grow the buffer to hold at least one more item. The character is
    // dropped if the buffer cannot grow.
    if (!Grow(1))
      return;

    buffer_[cur_len_] = ch;
    cur_len_++;
  }

  // Appends the |str_len| characters starting at |str|. Nothing is appended
  // if the buffer cannot grow enough.
  void Append(const T* str, int str_len) {
    // Compare against the free space instead of computing
    // cur_len_ + str_len, which could overflow int.
    const int free_space = buffer_len_ - cur_len_;
    if (str_len > free_space) {
      if (!Grow(str_len - free_space))
        return;
    }
    for (int i = 0; i < str_len; i++)
      buffer_[cur_len_ + i] = str[i];
    cur_len_ += str_len;
  }

 protected:
  // Grows the buffer so that it can hold at least |min_additional| characters
  // beyond the current capacity. Returns true after calling Resize(), or
  // false without resizing when the required size would reach 1 << 30.
  bool Grow(int min_additional) {
    static const int kMinBufferLen = 16;
    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
    do {
      if (new_len >= (1 << 30))  // Prevent overflow below.
        return false;
      new_len *= 2;
      // new_len never drops below buffer_len_, so the subtraction cannot
      // overflow the way buffer_len_ + min_additional could.
    } while (new_len - buffer_len_ < min_additional);
    Resize(new_len);
    return true;
  }

  // Current storage and its capacity in characters.
  T* buffer_;
  int buffer_len_;

  // Used characters in the buffer.
  int cur_len_;
};
|
165
|
+
|
166
|
+
// Simple implementation of the CanonOutput using new[]. This class
|
167
|
+
// also supports a static buffer so if it is allocated on the stack, most
|
168
|
+
// URLs can be canonicalized with no heap allocations.
|
169
|
+
template<typename T, int fixed_capacity = 1024>
|
170
|
+
class RawCanonOutputT : public CanonOutputT<T> {
|
171
|
+
public:
|
172
|
+
RawCanonOutputT() : CanonOutputT<T>() {
|
173
|
+
this->buffer_ = fixed_buffer_;
|
174
|
+
this->buffer_len_ = fixed_capacity;
|
175
|
+
}
|
176
|
+
virtual ~RawCanonOutputT() {
|
177
|
+
if (this->buffer_ != fixed_buffer_)
|
178
|
+
delete[] this->buffer_;
|
179
|
+
}
|
180
|
+
|
181
|
+
virtual void Resize(int sz) {
|
182
|
+
T* new_buf = new T[sz];
|
183
|
+
memcpy(new_buf, this->buffer_,
|
184
|
+
sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));
|
185
|
+
if (this->buffer_ != fixed_buffer_)
|
186
|
+
delete[] this->buffer_;
|
187
|
+
this->buffer_ = new_buf;
|
188
|
+
this->buffer_len_ = sz;
|
189
|
+
}
|
190
|
+
|
191
|
+
protected:
|
192
|
+
T fixed_buffer_[fixed_capacity];
|
193
|
+
};
|
194
|
+
|
195
|
+
// Normally, all canonicalization output is in narrow characters. We support
|
196
|
+
// the templates so it can also be used internally if a wide buffer is
|
197
|
+
// required.
|
198
|
+
typedef CanonOutputT<char> CanonOutput;
|
199
|
+
typedef CanonOutputT<char16> CanonOutputW;
|
200
|
+
|
201
|
+
template<int fixed_capacity>
|
202
|
+
class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
|
203
|
+
template<int fixed_capacity>
|
204
|
+
class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
|
205
|
+
|
206
|
+
// Character set converter ----------------------------------------------------
|
207
|
+
//
|
208
|
+
// Converts query strings into a custom encoding. The embedder can supply an
|
209
|
+
// implementation of this class to interface with their own character set
|
210
|
+
// conversion libraries.
|
211
|
+
//
|
212
|
+
// Embedders will want to see the unit test for the ICU version.
|
213
|
+
|
214
|
+
class CharsetConverter {
|
215
|
+
public:
|
216
|
+
CharsetConverter() {}
|
217
|
+
virtual ~CharsetConverter() {}
|
218
|
+
|
219
|
+
// Converts the given input string from UTF-16 to whatever output format the
|
220
|
+
// converter supports. This is used only for the query encoding conversion,
|
221
|
+
// which does not fail. Instead, the converter should insert "invalid
|
222
|
+
// character" characters in the output for invalid sequences, and do the
|
223
|
+
// best it can.
|
224
|
+
//
|
225
|
+
// If the input contains a character not representable in the output
|
226
|
+
// character set, the converter should append the HTML entity sequence in
|
227
|
+
// decimal, (such as "你") with escaping of the ampersand, number
|
228
|
+
// sign, and semicolon (in the previous example it would be
|
229
|
+
// "%26%2320320%3B"). This rule is based on what IE does in this situation.
|
230
|
+
virtual void ConvertFromUTF16(const char16* input,
|
231
|
+
int input_len,
|
232
|
+
CanonOutput* output) = 0;
|
233
|
+
};
|
234
|
+
|
235
|
+
// Whitespace -----------------------------------------------------------------
|
236
|
+
|
237
|
+
// Searches for whitespace that should be removed from the middle of URLs, and
|
238
|
+
// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
|
239
|
+
// are preserved, which is what most browsers do. A pointer to the output will
|
240
|
+
// be returned, and the length of that output will be in |output_len|.
|
241
|
+
//
|
242
|
+
// This should be called before parsing if whitespace removal is desired (which
|
243
|
+
// it normally is when you are canonicalizing).
|
244
|
+
//
|
245
|
+
// If no whitespace is removed, this function will not use the buffer and will
|
246
|
+
// return a pointer to the input, to avoid the extra copy. If modification is
|
247
|
+
// required, the given |buffer| will be used and the returned pointer will
|
248
|
+
// point to the beginning of the buffer.
|
249
|
+
//
|
250
|
+
// Therefore, callers should not use the buffer, since it may actually be empty;
|
251
|
+
// use the computed pointer and |*output_len| instead.
|
252
|
+
GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
|
253
|
+
CanonOutputT<char>* buffer,
|
254
|
+
int* output_len);
|
255
|
+
GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
|
256
|
+
CanonOutputT<char16>* buffer,
|
257
|
+
int* output_len);
|
258
|
+
|
259
|
+
// IDN ------------------------------------------------------------------------
|
260
|
+
|
261
|
+
// Converts the Unicode input representing a hostname to ASCII using IDN rules.
|
262
|
+
// The output must fall in the ASCII range, but will be encoded in UTF-16.
|
263
|
+
//
|
264
|
+
// On success, the output will be filled with the ASCII host name and it will
|
265
|
+
// return true. Unlike most other canonicalization functions, this assumes that
|
266
|
+
// the output is empty. The beginning of the host will be at offset 0, and
|
267
|
+
// the length of the output will be set to the length of the new host name.
|
268
|
+
//
|
269
|
+
// On error, returns false. The output in this case is undefined.
|
270
|
+
GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
|
271
|
+
|
272
|
+
// Piece-by-piece canonicalizers ----------------------------------------------
|
273
|
+
//
|
274
|
+
// These individual canonicalizers append the canonicalized versions of the
|
275
|
+
// corresponding URL component to the given std::string. The spec and the
|
276
|
+
// previously-identified range of that component are the input. The range of
|
277
|
+
// the canonicalized component will be written to the output component.
|
278
|
+
//
|
279
|
+
// These functions all append to the output so they can be chained. Make sure
|
280
|
+
// the output is empty when you start.
|
281
|
+
//
|
282
|
+
// These functions return boolean values indicating success. On failure, they
|
283
|
+
// will attempt to write something reasonable to the output so that, if
|
284
|
+
// displayed to the user, they will recognise it as something that's messed up.
|
285
|
+
// Nothing more should ever be done with these invalid URLs, however.
|
286
|
+
|
287
|
+
// Scheme: Appends the scheme and colon to the URL. The output component will
|
288
|
+
// indicate the range of characters up to but not including the colon.
|
289
|
+
//
|
290
|
+
// Canonical URLs always have a scheme. If the scheme is not present in the
|
291
|
+
// input, this will just write the colon to indicate an empty scheme. Does not
|
292
|
+
// append slashes which will be needed before any authority components for most
|
293
|
+
// URLs.
|
294
|
+
//
|
295
|
+
// The 8-bit version requires UTF-8 encoding.
|
296
|
+
GURL_API bool CanonicalizeScheme(const char* spec,
|
297
|
+
const url_parse::Component& scheme,
|
298
|
+
CanonOutput* output,
|
299
|
+
url_parse::Component* out_scheme);
|
300
|
+
GURL_API bool CanonicalizeScheme(const char16* spec,
|
301
|
+
const url_parse::Component& scheme,
|
302
|
+
CanonOutput* output,
|
303
|
+
url_parse::Component* out_scheme);
|
304
|
+
|
305
|
+
// User info: username/password. If present, this will add the delimiters so
|
306
|
+
// the output will be "<username>:<password>@" or "<username>@". Empty
|
307
|
+
// username/password pairs, or empty passwords, will get converted to
|
308
|
+
// nonexistent in the canonical version.
|
309
|
+
//
|
310
|
+
// The components for the username and password refer to ranges in the
|
311
|
+
// respective source strings. Usually, these will be the same string, which
|
312
|
+
// is legal as long as the two components don't overlap.
|
313
|
+
//
|
314
|
+
// The 8-bit version requires UTF-8 encoding.
|
315
|
+
GURL_API bool CanonicalizeUserInfo(const char* username_source,
|
316
|
+
const url_parse::Component& username,
|
317
|
+
const char* password_source,
|
318
|
+
const url_parse::Component& password,
|
319
|
+
CanonOutput* output,
|
320
|
+
url_parse::Component* out_username,
|
321
|
+
url_parse::Component* out_password);
|
322
|
+
GURL_API bool CanonicalizeUserInfo(const char16* username_source,
|
323
|
+
const url_parse::Component& username,
|
324
|
+
const char16* password_source,
|
325
|
+
const url_parse::Component& password,
|
326
|
+
CanonOutput* output,
|
327
|
+
url_parse::Component* out_username,
|
328
|
+
url_parse::Component* out_password);
|
329
|
+
|
330
|
+
|
331
|
+
// This structure holds detailed state exported from the IP/Host canonicalizers.
|
332
|
+
// Additional fields may be added as callers require them.
|
333
|
+
struct CanonHostInfo {
|
334
|
+
CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
|
335
|
+
|
336
|
+
// Convenience function to test if family is an IP address.
|
337
|
+
bool IsIPAddress() const { return family == IPV4 || family == IPV6; }
|
338
|
+
|
339
|
+
// This field summarizes how the input was classified by the canonicalizer.
|
340
|
+
enum Family {
|
341
|
+
NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
|
342
|
+
// canonicalizer is concerned, it should be treated as a
|
343
|
+
// hostname.
|
344
|
+
BROKEN, // - Almost an IP, but was not canonicalized. This could be an
|
345
|
+
// IPv4 address where truncation occurred, or something
|
346
|
+
// containing the special characters :[] which did not parse
|
347
|
+
// as an IPv6 address. Never attempt to connect to this
|
348
|
+
// address, because it might actually succeed!
|
349
|
+
IPV4, // - Successfully canonicalized as an IPv4 address.
|
350
|
+
IPV6, // - Successfully canonicalized as an IPv6 address.
|
351
|
+
};
|
352
|
+
Family family;
|
353
|
+
|
354
|
+
// If |family| is IPV4, then this is the number of nonempty dot-separated
|
355
|
+
// components in the input text, from 1 to 4. If |family| is not IPV4,
|
356
|
+
// this value is undefined.
|
357
|
+
int num_ipv4_components;
|
358
|
+
|
359
|
+
// Location of host within the canonicalized output.
|
360
|
+
// CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
|
361
|
+
// CanonicalizeHostVerbose() always sets it.
|
362
|
+
url_parse::Component out_host;
|
363
|
+
};
|
364
|
+
|
365
|
+
|
366
|
+
// Host.
|
367
|
+
//
|
368
|
+
// The 8-bit version requires UTF-8 encoding. Use this version when you only
|
369
|
+
// need to know whether canonicalization succeeded.
|
370
|
+
GURL_API bool CanonicalizeHost(const char* spec,
|
371
|
+
const url_parse::Component& host,
|
372
|
+
CanonOutput* output,
|
373
|
+
url_parse::Component* out_host);
|
374
|
+
GURL_API bool CanonicalizeHost(const char16* spec,
|
375
|
+
const url_parse::Component& host,
|
376
|
+
CanonOutput* output,
|
377
|
+
url_parse::Component* out_host);
|
378
|
+
|
379
|
+
// Extended version of CanonicalizeHost, which returns additional information.
|
380
|
+
// Use this when you need to know whether the hostname was an IP address.
|
381
|
+
// A successful return is indicated by host_info->family != BROKEN. See the
|
382
|
+
// definition of CanonHostInfo above for details.
|
383
|
+
GURL_API void CanonicalizeHostVerbose(const char* spec,
|
384
|
+
const url_parse::Component& host,
|
385
|
+
CanonOutput* output,
|
386
|
+
CanonHostInfo* host_info);
|
387
|
+
GURL_API void CanonicalizeHostVerbose(const char16* spec,
|
388
|
+
const url_parse::Component& host,
|
389
|
+
CanonOutput* output,
|
390
|
+
CanonHostInfo* host_info);
|
391
|
+
|
392
|
+
|
393
|
+
// IP addresses.
|
394
|
+
//
|
395
|
+
// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
|
396
|
+
// an IP address, it will canonicalize it as such, appending it to |output|.
|
397
|
+
// Additional status information is returned via the |*host_info| parameter.
|
398
|
+
// See the definition of CanonHostInfo above for details.
|
399
|
+
//
|
400
|
+
// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
|
401
|
+
// the input is unescaped and name-prepped, etc. It should not normally be
|
402
|
+
// necessary or wise to call this directly.
|
403
|
+
GURL_API void CanonicalizeIPAddress(const char* spec,
|
404
|
+
const url_parse::Component& host,
|
405
|
+
CanonOutput* output,
|
406
|
+
CanonHostInfo* host_info);
|
407
|
+
GURL_API void CanonicalizeIPAddress(const char16* spec,
|
408
|
+
const url_parse::Component& host,
|
409
|
+
CanonOutput* output,
|
410
|
+
CanonHostInfo* host_info);
|
411
|
+
|
412
|
+
// Port: this function will add the colon for the port if a port is present.
|
413
|
+
// The caller can pass url_parse::PORT_UNSPECIFIED as the
|
414
|
+
// default_port_for_scheme argument if there is no default port.
|
415
|
+
//
|
416
|
+
// The 8-bit version requires UTF-8 encoding.
|
417
|
+
GURL_API bool CanonicalizePort(const char* spec,
|
418
|
+
const url_parse::Component& port,
|
419
|
+
int default_port_for_scheme,
|
420
|
+
CanonOutput* output,
|
421
|
+
url_parse::Component* out_port);
|
422
|
+
GURL_API bool CanonicalizePort(const char16* spec,
|
423
|
+
const url_parse::Component& port,
|
424
|
+
int default_port_for_scheme,
|
425
|
+
CanonOutput* output,
|
426
|
+
url_parse::Component* out_port);
|
427
|
+
|
428
|
+
// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
|
429
|
+
// if the scheme is unknown.
|
430
|
+
GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
|
431
|
+
|
432
|
+
// Path. If the input does not begin in a slash (including if the input is
|
433
|
+
// empty), we'll prepend a slash to the path to make it canonical.
|
434
|
+
//
|
435
|
+
// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
|
436
|
+
// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
|
437
|
+
// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
|
438
|
+
// an issue. Somebody giving us an 8-bit path is responsible for generating
|
439
|
+
// the path that the server expects (we'll escape high-bit characters), so
|
440
|
+
// if something is invalid, it's their problem.
|
441
|
+
GURL_API bool CanonicalizePath(const char* spec,
|
442
|
+
const url_parse::Component& path,
|
443
|
+
CanonOutput* output,
|
444
|
+
url_parse::Component* out_path);
|
445
|
+
GURL_API bool CanonicalizePath(const char16* spec,
|
446
|
+
const url_parse::Component& path,
|
447
|
+
CanonOutput* output,
|
448
|
+
url_parse::Component* out_path);
|
449
|
+
|
450
|
+
// Canonicalizes the input as a file path. This is like CanonicalizePath except
|
451
|
+
// that it also handles Windows drive specs. For example, the path can begin
|
452
|
+
// with "c|\" and it will get properly canonicalized to "C:/".
|
453
|
+
// The string will be appended to |*output| and |*out_path| will be updated.
|
454
|
+
//
|
455
|
+
// The 8-bit version requires UTF-8 encoding.
|
456
|
+
GURL_API bool FileCanonicalizePath(const char* spec,
|
457
|
+
const url_parse::Component& path,
|
458
|
+
CanonOutput* output,
|
459
|
+
url_parse::Component* out_path);
|
460
|
+
GURL_API bool FileCanonicalizePath(const char16* spec,
|
461
|
+
const url_parse::Component& path,
|
462
|
+
CanonOutput* output,
|
463
|
+
url_parse::Component* out_path);
|
464
|
+
|
465
|
+
// Query: Prepends the ? if needed.
|
466
|
+
//
|
467
|
+
// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
|
468
|
+
// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
|
469
|
+
// "invalid character." This function can not fail, we always just try to do
|
470
|
+
// our best for crazy input here since web pages can set it themselves.
|
471
|
+
//
|
472
|
+
// This will convert the given input into the output encoding that the given
|
473
|
+
// character set converter object provides. The converter will only be called
|
474
|
+
// if necessary, for ASCII input, no conversions are necessary.
|
475
|
+
//
|
476
|
+
// The converter can be NULL. In this case, the output encoding will be UTF-8.
|
477
|
+
GURL_API void CanonicalizeQuery(const char* spec,
|
478
|
+
const url_parse::Component& query,
|
479
|
+
CharsetConverter* converter,
|
480
|
+
CanonOutput* output,
|
481
|
+
url_parse::Component* out_query);
|
482
|
+
GURL_API void CanonicalizeQuery(const char16* spec,
|
483
|
+
const url_parse::Component& query,
|
484
|
+
CharsetConverter* converter,
|
485
|
+
CanonOutput* output,
|
486
|
+
url_parse::Component* out_query);
|
487
|
+
|
488
|
+
// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
|
489
|
+
// canonicalizer that does not produce ASCII output). The output is
|
490
|
+
// guaranteed to be valid UTF-8.
|
491
|
+
//
|
492
|
+
// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
|
493
|
+
// the "Unicode replacement character" for the confusing bits and copy the rest.
|
494
|
+
GURL_API void CanonicalizeRef(const char* spec,
|
495
|
+
const url_parse::Component& path,
|
496
|
+
CanonOutput* output,
|
497
|
+
url_parse::Component* out_path);
|
498
|
+
GURL_API void CanonicalizeRef(const char16* spec,
|
499
|
+
const url_parse::Component& path,
|
500
|
+
CanonOutput* output,
|
501
|
+
url_parse::Component* out_path);
|
502
|
+
|
503
|
+
// Full canonicalizer ---------------------------------------------------------
|
504
|
+
//
|
505
|
+
// These functions replace any string contents, rather than append as above.
|
506
|
+
// See the above piece-by-piece functions for information specific to
|
507
|
+
// canonicalizing individual components.
|
508
|
+
//
|
509
|
+
// The output will be ASCII except the reference fragment, which may be UTF-8.
|
510
|
+
//
|
511
|
+
// The 8-bit versions require UTF-8 encoding.
|
512
|
+
|
513
|
+
// Use for standard URLs with authorities and paths.
|
514
|
+
GURL_API bool CanonicalizeStandardURL(const char* spec,
|
515
|
+
int spec_len,
|
516
|
+
const url_parse::Parsed& parsed,
|
517
|
+
CharsetConverter* query_converter,
|
518
|
+
CanonOutput* output,
|
519
|
+
url_parse::Parsed* new_parsed);
|
520
|
+
GURL_API bool CanonicalizeStandardURL(const char16* spec,
|
521
|
+
int spec_len,
|
522
|
+
const url_parse::Parsed& parsed,
|
523
|
+
CharsetConverter* query_converter,
|
524
|
+
CanonOutput* output,
|
525
|
+
url_parse::Parsed* new_parsed);
|
526
|
+
|
527
|
+
// Use for file URLs.
|
528
|
+
GURL_API bool CanonicalizeFileURL(const char* spec,
|
529
|
+
int spec_len,
|
530
|
+
const url_parse::Parsed& parsed,
|
531
|
+
CharsetConverter* query_converter,
|
532
|
+
CanonOutput* output,
|
533
|
+
url_parse::Parsed* new_parsed);
|
534
|
+
GURL_API bool CanonicalizeFileURL(const char16* spec,
|
535
|
+
int spec_len,
|
536
|
+
const url_parse::Parsed& parsed,
|
537
|
+
CharsetConverter* query_converter,
|
538
|
+
CanonOutput* output,
|
539
|
+
url_parse::Parsed* new_parsed);
|
540
|
+
|
541
|
+
// Use for path URLs such as javascript. This does not modify the path in any
|
542
|
+
// way, for example, by escaping it.
|
543
|
+
GURL_API bool CanonicalizePathURL(const char* spec,
|
544
|
+
int spec_len,
|
545
|
+
const url_parse::Parsed& parsed,
|
546
|
+
CanonOutput* output,
|
547
|
+
url_parse::Parsed* new_parsed);
|
548
|
+
GURL_API bool CanonicalizePathURL(const char16* spec,
|
549
|
+
int spec_len,
|
550
|
+
const url_parse::Parsed& parsed,
|
551
|
+
CanonOutput* output,
|
552
|
+
url_parse::Parsed* new_parsed);
|
553
|
+
|
554
|
+
// Use for mailto URLs. This "canonicalizes" the url into a path and query
|
555
|
+
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
|
556
|
+
// the query encoding if there is a query. This is because a mailto URL is
|
557
|
+
// really intended for an external mail program, and the encoding of a page,
|
558
|
+
// etc. which would influence a query encoding normally are irrelevant.
|
559
|
+
GURL_API bool CanonicalizeMailtoURL(const char* spec,
|
560
|
+
int spec_len,
|
561
|
+
const url_parse::Parsed& parsed,
|
562
|
+
CanonOutput* output,
|
563
|
+
url_parse::Parsed* new_parsed);
|
564
|
+
GURL_API bool CanonicalizeMailtoURL(const char16* spec,
|
565
|
+
int spec_len,
|
566
|
+
const url_parse::Parsed& parsed,
|
567
|
+
CanonOutput* output,
|
568
|
+
url_parse::Parsed* new_parsed);
|
569
|
+
|
570
|
+
// Part replacer --------------------------------------------------------------
|
571
|
+
|
572
|
+
// Internal structure used for storing separate strings for each component.
// The basic canonicalization functions use this structure internally so that
// component replacement (different strings for different components) can be
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
// A url_parse::Parsed structure usually goes along with this. Those
// components identify offsets within these strings, so that they can all be
// in the same string, or spread arbitrarily across different ones.
//
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
template<typename CHAR>
struct URLComponentSource {
  // Constructor normally used by callers wishing to replace components. This
  // will make them all NULL, which is no replacement. The caller would then
  // override the components they want to replace.
  URLComponentSource()
      : scheme(NULL),
        username(NULL),
        password(NULL),
        host(NULL),
        port(NULL),
        path(NULL),
        query(NULL),
        ref(NULL) {
  }

  // Constructor normally used internally to initialize all the components to
  // point to the same spec. |default_value| is stored as-is (not copied) in
  // every one of the eight component pointers.
  explicit URLComponentSource(const CHAR* default_value)
      : scheme(default_value),
        username(default_value),
        password(default_value),
        host(default_value),
        port(default_value),
        path(default_value),
        query(default_value),
        ref(default_value) {
  }

  // Non-owning pointers to the string each component's range refers to.
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};
|
623
|
+
|
624
|
+
// This structure encapsulates information on modifying a URL. Each component
|
625
|
+
// may either be left unchanged, replaced, or deleted.
|
626
|
+
//
|
627
|
+
// By default, each component is unchanged. For those components that should be
|
628
|
+
// modified, call either Set* or Clear* to modify it.
|
629
|
+
//
|
630
|
+
// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
|
631
|
+
// IN SCOPE BY THE CALLER for as long as this object exists!
|
632
|
+
//
|
633
|
+
// Prefer the 8-bit replacement version if possible since it is more efficient.
|
634
|
+
template<typename CHAR>
|
635
|
+
class Replacements {
|
636
|
+
public:
|
637
|
+
Replacements() {
|
638
|
+
}
|
639
|
+
|
640
|
+
// Scheme
|
641
|
+
void SetScheme(const CHAR* s, const url_parse::Component& comp) {
|
642
|
+
sources_.scheme = s;
|
643
|
+
components_.scheme = comp;
|
644
|
+
}
|
645
|
+
// Note: we don't have a ClearScheme since this doesn't make any sense.
|
646
|
+
bool IsSchemeOverridden() const { return sources_.scheme != NULL; }
|
647
|
+
|
648
|
+
// Username
|
649
|
+
void SetUsername(const CHAR* s, const url_parse::Component& comp) {
|
650
|
+
sources_.username = s;
|
651
|
+
components_.username = comp;
|
652
|
+
}
|
653
|
+
void ClearUsername() {
|
654
|
+
sources_.username = Placeholder();
|
655
|
+
components_.username = url_parse::Component();
|
656
|
+
}
|
657
|
+
bool IsUsernameOverridden() const { return sources_.username != NULL; }
|
658
|
+
|
659
|
+
// Password
|
660
|
+
void SetPassword(const CHAR* s, const url_parse::Component& comp) {
|
661
|
+
sources_.password = s;
|
662
|
+
components_.password = comp;
|
663
|
+
}
|
664
|
+
void ClearPassword() {
|
665
|
+
sources_.password = Placeholder();
|
666
|
+
components_.password = url_parse::Component();
|
667
|
+
}
|
668
|
+
bool IsPasswordOverridden() const { return sources_.password != NULL; }
|
669
|
+
|
670
|
+
// Host
|
671
|
+
void SetHost(const CHAR* s, const url_parse::Component& comp) {
|
672
|
+
sources_.host = s;
|
673
|
+
components_.host = comp;
|
674
|
+
}
|
675
|
+
void ClearHost() {
|
676
|
+
sources_.host = Placeholder();
|
677
|
+
components_.host = url_parse::Component();
|
678
|
+
}
|
679
|
+
bool IsHostOverridden() const { return sources_.host != NULL; }
|
680
|
+
|
681
|
+
// Port
|
682
|
+
void SetPort(const CHAR* s, const url_parse::Component& comp) {
|
683
|
+
sources_.port = s;
|
684
|
+
components_.port = comp;
|
685
|
+
}
|
686
|
+
void ClearPort() {
|
687
|
+
sources_.port = Placeholder();
|
688
|
+
components_.port = url_parse::Component();
|
689
|
+
}
|
690
|
+
bool IsPortOverridden() const { return sources_.port != NULL; }
|
691
|
+
|
692
|
+
// Path
|
693
|
+
void SetPath(const CHAR* s, const url_parse::Component& comp) {
|
694
|
+
sources_.path = s;
|
695
|
+
components_.path = comp;
|
696
|
+
}
|
697
|
+
void ClearPath() {
|
698
|
+
sources_.path = Placeholder();
|
699
|
+
components_.path = url_parse::Component();
|
700
|
+
}
|
701
|
+
bool IsPathOverridden() const { return sources_.path != NULL; }
|
702
|
+
|
703
|
+
// Query
|
704
|
+
void SetQuery(const CHAR* s, const url_parse::Component& comp) {
|
705
|
+
sources_.query = s;
|
706
|
+
components_.query = comp;
|
707
|
+
}
|
708
|
+
void ClearQuery() {
|
709
|
+
sources_.query = Placeholder();
|
710
|
+
components_.query = url_parse::Component();
|
711
|
+
}
|
712
|
+
bool IsQueryOverridden() const { return sources_.query != NULL; }
|
713
|
+
|
714
|
+
// Ref
|
715
|
+
void SetRef(const CHAR* s, const url_parse::Component& comp) {
|
716
|
+
sources_.ref = s;
|
717
|
+
components_.ref = comp;
|
718
|
+
}
|
719
|
+
void ClearRef() {
|
720
|
+
sources_.ref = Placeholder();
|
721
|
+
components_.ref = url_parse::Component();
|
722
|
+
}
|
723
|
+
bool IsRefOverridden() const { return sources_.ref != NULL; }
|
724
|
+
|
725
|
+
// Getters for the itnernal data. See the variables below for how the
|
726
|
+
// information is encoded.
|
727
|
+
const URLComponentSource<CHAR>& sources() const { return sources_; }
|
728
|
+
const url_parse::Parsed& components() const { return components_; }
|
729
|
+
|
730
|
+
private:
|
731
|
+
// Returns a pointer to a static empty string that is used as a placeholder
|
732
|
+
// to indicate a component should be deleted (see below).
|
733
|
+
const CHAR* Placeholder() {
|
734
|
+
static const CHAR empty_string = 0;
|
735
|
+
return &empty_string;
|
736
|
+
}
|
737
|
+
|
738
|
+
// We support three states:
|
739
|
+
//
|
740
|
+
// Action | Source Component
|
741
|
+
// -----------------------+--------------------------------------------------
|
742
|
+
// Don't change component | NULL (unused)
|
743
|
+
// Replace component | (replacement string) (replacement component)
|
744
|
+
// Delete component | (non-NULL) (invalid component: (0,-1))
|
745
|
+
//
|
746
|
+
// We use a pointer to the empty string for the source when the component
|
747
|
+
// should be deleted.
|
748
|
+
URLComponentSource<CHAR> sources_;
|
749
|
+
url_parse::Parsed components_;
|
750
|
+
};
|
751
|
+
|
752
|
+
// The base must be an 8-bit canonical URL.
|
753
|
+
GURL_API bool ReplaceStandardURL(const char* base,
|
754
|
+
const url_parse::Parsed& base_parsed,
|
755
|
+
const Replacements<char>& replacements,
|
756
|
+
CharsetConverter* query_converter,
|
757
|
+
CanonOutput* output,
|
758
|
+
url_parse::Parsed* new_parsed);
|
759
|
+
GURL_API bool ReplaceStandardURL(const char* base,
|
760
|
+
const url_parse::Parsed& base_parsed,
|
761
|
+
const Replacements<char16>& replacements,
|
762
|
+
CharsetConverter* query_converter,
|
763
|
+
CanonOutput* output,
|
764
|
+
url_parse::Parsed* new_parsed);
|
765
|
+
|
766
|
+
// Replacing some parts of a file URL is not permitted. Everything except
|
767
|
+
// the host, path, query, and ref will be ignored.
|
768
|
+
GURL_API bool ReplaceFileURL(const char* base,
|
769
|
+
const url_parse::Parsed& base_parsed,
|
770
|
+
const Replacements<char>& replacements,
|
771
|
+
CharsetConverter* query_converter,
|
772
|
+
CanonOutput* output,
|
773
|
+
url_parse::Parsed* new_parsed);
|
774
|
+
GURL_API bool ReplaceFileURL(const char* base,
|
775
|
+
const url_parse::Parsed& base_parsed,
|
776
|
+
const Replacements<char16>& replacements,
|
777
|
+
CharsetConverter* query_converter,
|
778
|
+
CanonOutput* output,
|
779
|
+
url_parse::Parsed* new_parsed);
|
780
|
+
|
781
|
+
// Path URLs can only have the scheme and path replaced. All other components
|
782
|
+
// will be ignored.
|
783
|
+
GURL_API bool ReplacePathURL(const char* base,
|
784
|
+
const url_parse::Parsed& base_parsed,
|
785
|
+
const Replacements<char>& replacements,
|
786
|
+
CanonOutput* output,
|
787
|
+
url_parse::Parsed* new_parsed);
|
788
|
+
GURL_API bool ReplacePathURL(const char* base,
|
789
|
+
const url_parse::Parsed& base_parsed,
|
790
|
+
const Replacements<char16>& replacements,
|
791
|
+
CanonOutput* output,
|
792
|
+
url_parse::Parsed* new_parsed);
|
793
|
+
|
794
|
+
// Mailto URLs can only have the scheme, path, and query replaced.
|
795
|
+
// All other components will be ignored.
|
796
|
+
GURL_API bool ReplaceMailtoURL(const char* base,
|
797
|
+
const url_parse::Parsed& base_parsed,
|
798
|
+
const Replacements<char>& replacements,
|
799
|
+
CanonOutput* output,
|
800
|
+
url_parse::Parsed* new_parsed);
|
801
|
+
GURL_API bool ReplaceMailtoURL(const char* base,
|
802
|
+
const url_parse::Parsed& base_parsed,
|
803
|
+
const Replacements<char16>& replacements,
|
804
|
+
CanonOutput* output,
|
805
|
+
url_parse::Parsed* new_parsed);
|
806
|
+
|
807
|
+
// Relative URL ---------------------------------------------------------------
|
808
|
+
|
809
|
+
// Given an input URL or URL fragment |fragment|, determines if it is a
|
810
|
+
// relative or absolute URL and places the result into |*is_relative|. If it is
|
811
|
+
// relative, the relevant portion of the URL will be placed into
|
812
|
+
// |*relative_component| (there may have been trimmed whitespace, for example).
|
813
|
+
// This value is passed to ResolveRelativeURL. If the input is not relative,
|
814
|
+
// this value is UNDEFINED (it may be changed by the function).
|
815
|
+
//
|
816
|
+
// Returns true on success (we successfully determined the URL is relative or
|
817
|
+
// not). Failure means that the combination of URLs doesn't make any sense.
|
818
|
+
//
|
819
|
+
// The base URL should always be canonical, therefore is ASCII.
|
820
|
+
GURL_API bool IsRelativeURL(const char* base,
|
821
|
+
const url_parse::Parsed& base_parsed,
|
822
|
+
const char* fragment,
|
823
|
+
int fragment_len,
|
824
|
+
bool is_base_hierarchical,
|
825
|
+
bool* is_relative,
|
826
|
+
url_parse::Component* relative_component);
|
827
|
+
GURL_API bool IsRelativeURL(const char* base,
|
828
|
+
const url_parse::Parsed& base_parsed,
|
829
|
+
const char16* fragment,
|
830
|
+
int fragment_len,
|
831
|
+
bool is_base_hierarchical,
|
832
|
+
bool* is_relative,
|
833
|
+
url_parse::Component* relative_component);
|
834
|
+
|
835
|
+
// Given a canonical parsed source URL, a URL fragment known to be relative,
|
836
|
+
// and the identified relevant portion of the relative URL (computed by
|
837
|
+
// IsRelativeURL), this produces a new parsed canonical URL in |output| and
|
838
|
+
// |out_parsed|.
|
839
|
+
//
|
840
|
+
// It also requires a flag indicating whether the base URL is a file: URL
|
841
|
+
// which triggers additional logic.
|
842
|
+
//
|
843
|
+
// The base URL should be canonical and have a host (may be empty for file
|
844
|
+
// URLs) and a path. If it doesn't have these, we can't resolve relative
|
845
|
+
// URLs off of it and will return the base as the output with an error flag.
|
846
|
+
// Because it is canonical it should also be ASCII.
|
847
|
+
//
|
848
|
+
// The query charset converter follows the same rules as CanonicalizeQuery.
|
849
|
+
//
|
850
|
+
// Returns true on success. On failure, the output will be "something
|
851
|
+
// reasonable" that will be consistent and valid, just probably not what
|
852
|
+
// was intended by the web page author or caller.
|
853
|
+
GURL_API bool ResolveRelativeURL(const char* base_url,
|
854
|
+
const url_parse::Parsed& base_parsed,
|
855
|
+
bool base_is_file,
|
856
|
+
const char* relative_url,
|
857
|
+
const url_parse::Component& relative_component,
|
858
|
+
CharsetConverter* query_converter,
|
859
|
+
CanonOutput* output,
|
860
|
+
url_parse::Parsed* out_parsed);
|
861
|
+
GURL_API bool ResolveRelativeURL(const char* base_url,
|
862
|
+
const url_parse::Parsed& base_parsed,
|
863
|
+
bool base_is_file,
|
864
|
+
const char16* relative_url,
|
865
|
+
const url_parse::Component& relative_component,
|
866
|
+
CharsetConverter* query_converter,
|
867
|
+
CanonOutput* output,
|
868
|
+
url_parse::Parsed* out_parsed);
|
869
|
+
|
870
|
+
} // namespace url_canon
|
871
|
+
|
872
|
+
#endif // GOOGLEURL_SRC_URL_CANON_H__
|