uri_parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.rvmrc +1 -0
- data/Gemfile +6 -0
- data/Rakefile +13 -0
- data/ext/uri_parser/basictypes.h +89 -0
- data/ext/uri_parser/extconf.h +6 -0
- data/ext/uri_parser/extconf.rb +50 -0
- data/ext/uri_parser/logging.h +5 -0
- data/ext/uri_parser/scoped_ptr.h +322 -0
- data/ext/uri_parser/string16.cc +95 -0
- data/ext/uri_parser/string16.h +194 -0
- data/ext/uri_parser/uri_parser.cc +87 -0
- data/ext/uri_parser/url_canon.h +872 -0
- data/ext/uri_parser/url_canon_etc.cc +392 -0
- data/ext/uri_parser/url_canon_fileurl.cc +215 -0
- data/ext/uri_parser/url_canon_host.cc +401 -0
- data/ext/uri_parser/url_canon_icu.cc +207 -0
- data/ext/uri_parser/url_canon_icu.h +63 -0
- data/ext/uri_parser/url_canon_internal.cc +427 -0
- data/ext/uri_parser/url_canon_internal.h +453 -0
- data/ext/uri_parser/url_canon_internal_file.h +157 -0
- data/ext/uri_parser/url_canon_ip.cc +737 -0
- data/ext/uri_parser/url_canon_ip.h +101 -0
- data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
- data/ext/uri_parser/url_canon_path.cc +380 -0
- data/ext/uri_parser/url_canon_pathurl.cc +128 -0
- data/ext/uri_parser/url_canon_query.cc +189 -0
- data/ext/uri_parser/url_canon_relative.cc +572 -0
- data/ext/uri_parser/url_canon_stdstring.h +134 -0
- data/ext/uri_parser/url_canon_stdurl.cc +211 -0
- data/ext/uri_parser/url_common.h +48 -0
- data/ext/uri_parser/url_file.h +108 -0
- data/ext/uri_parser/url_parse.cc +760 -0
- data/ext/uri_parser/url_parse.h +336 -0
- data/ext/uri_parser/url_parse_file.cc +243 -0
- data/ext/uri_parser/url_parse_internal.h +112 -0
- data/ext/uri_parser/url_util.cc +553 -0
- data/ext/uri_parser/url_util.h +222 -0
- data/lib/uri_parser.rb +28 -0
- data/lib/uri_parser/version.rb +3 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/uri_parser_spec.rb +54 -0
- data/uri_parser.gemspec +26 -0
- metadata +117 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
// Copyright 2007, Google Inc.
|
2
|
+
// All rights reserved.
|
3
|
+
//
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
5
|
+
// modification, are permitted provided that the following conditions are
|
6
|
+
// met:
|
7
|
+
//
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
12
|
+
// in the documentation and/or other materials provided with the
|
13
|
+
// distribution.
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
16
|
+
// this software without specific prior written permission.
|
17
|
+
//
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
|
30
|
+
// Contains common inline helper functions used by the URL parsing routines.
|
31
|
+
|
32
|
+
#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
|
33
|
+
#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
|
34
|
+
|
35
|
+
#include "url_parse.h"
|
36
|
+
|
37
|
+
namespace url_parse {
|
38
|
+
|
39
|
+
// We treat slashes and backslashes the same for IE compatability.
|
40
|
+
inline bool IsURLSlash(char16 ch) {
|
41
|
+
return ch == '/' || ch == '\\';
|
42
|
+
}
|
43
|
+
|
44
|
+
// Returns true if we should trim this character from the URL because it is a
|
45
|
+
// space or a control character.
|
46
|
+
inline bool ShouldTrimFromURL(char16 ch) {
|
47
|
+
return ch <= ' ';
|
48
|
+
}
|
49
|
+
|
50
|
+
// Shrinks the range [*begin, *len) of |spec| so that it no longer starts or
// ends with "should-be-trimmed" characters (see ShouldTrimFromURL). Both
// values must already be initialized by the caller.
//
// Note that |*len| is *not* the number of untrimmed characters counted from
// |*begin|; it is an end position in |spec|. The trimmed text starts at
// character |*begin| and runs up to, but not including, character |*len|.
template<typename CHAR>
inline void TrimURL(const CHAR* spec, int* begin, int* len) {
  int first = *begin;
  int last = *len;

  // Skip leading whitespace and control characters.
  while (first < last && ShouldTrimFromURL(spec[first]))
    ++first;

  // Drop trailing whitespace and control characters. The |last > first| test
  // matters when the input is all blanks: it stops us from backing up past
  // the (already advanced) start of the range.
  while (last > first && ShouldTrimFromURL(spec[last - 1]))
    --last;

  *begin = first;
  *len = last;
}
|
66
|
+
|
67
|
+
// Returns how many slash characters (as defined by IsURLSlash) appear in a row
// in |str|, starting at |begin_offset| and never reading at or beyond
// |str_len|. Returns 0 if the character at |begin_offset| is not a slash or
// |begin_offset| is not inside the string.
template<typename CHAR>
inline int CountConsecutiveSlashes(const CHAR *str,
                                   int begin_offset, int str_len) {
  int cursor = begin_offset;
  while (cursor < str_len && IsURLSlash(str[cursor]))
    ++cursor;
  return cursor - begin_offset;
}
|
78
|
+
|
79
|
+
// Internal functions in url_parse.cc that parse the path, that is, everything
|
80
|
+
// following the authority section. The input is the range of everything
|
81
|
+
// following the authority section, and the output is the identified ranges.
|
82
|
+
//
|
83
|
+
// This is designed for the file URL parser or other consumers who may do
|
84
|
+
// special stuff at the beginning, but want regular path parsing; it just
|
85
|
+
// maps to the internal parsing function for paths.
|
86
|
+
void ParsePathInternal(const char* spec,
|
87
|
+
const Component& path,
|
88
|
+
Component* filepath,
|
89
|
+
Component* query,
|
90
|
+
Component* ref);
|
91
|
+
void ParsePathInternal(const char16* spec,
|
92
|
+
const Component& path,
|
93
|
+
Component* filepath,
|
94
|
+
Component* query,
|
95
|
+
Component* ref);
|
96
|
+
|
97
|
+
|
98
|
+
// Given a spec and a pointer to the character after the colon following the
|
99
|
+
// scheme, this parses it and fills in the structure. Every item in the parsed
|
100
|
+
// structure is filled EXCEPT for the scheme, which is untouched.
|
101
|
+
void ParseAfterScheme(const char* spec,
|
102
|
+
int spec_len,
|
103
|
+
int after_scheme,
|
104
|
+
Parsed* parsed);
|
105
|
+
void ParseAfterScheme(const char16* spec,
|
106
|
+
int spec_len,
|
107
|
+
int after_scheme,
|
108
|
+
Parsed* parsed);
|
109
|
+
|
110
|
+
} // namespace url_parse
|
111
|
+
|
112
|
+
#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
|
@@ -0,0 +1,553 @@
|
|
1
|
+
// Copyright 2007, Google Inc.
|
2
|
+
// All rights reserved.
|
3
|
+
//
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
5
|
+
// modification, are permitted provided that the following conditions are
|
6
|
+
// met:
|
7
|
+
//
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
12
|
+
// in the documentation and/or other materials provided with the
|
13
|
+
// distribution.
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
16
|
+
// this software without specific prior written permission.
|
17
|
+
//
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29
|
+
|
30
|
+
#include <string.h>
|
31
|
+
#include <vector>
|
32
|
+
|
33
|
+
#include "url_util.h"
|
34
|
+
|
35
|
+
#include "logging.h"
|
36
|
+
#include "url_canon_internal.h"
|
37
|
+
#include "url_file.h"
|
38
|
+
|
39
|
+
namespace url_util {
|
40
|
+
|
41
|
+
namespace {
|
42
|
+
|
43
|
+
// ASCII-only lower-casing. The standard library's tolower is locale
// sensitive, so it is deliberately not used here. Only 'A'..'Z' are mapped;
// every other value, including non-ASCII code units, is returned unchanged.
template <class Char> inline Char ToLowerASCII(Char c) {
  const bool is_ascii_upper = c >= 'A' && c <= 'Z';
  if (!is_ascii_upper)
    return c;
  // The arithmetic is done in int; convert back to the caller's type.
  return static_cast<Char>(c + ('a' - 'A'));
}
|
48
|
+
|
49
|
+
// Backend for LowerCaseEqualsASCII. Compares the range [a_begin, a_end),
// lower-cased with ToLowerASCII, against the NUL-terminated string |b|.
// |b| itself is compared as-is. Returns true only when every character
// matches and |b| ends exactly where the range ends.
template<typename Iter>
inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
  Iter cursor = a_begin;
  while (cursor != a_end) {
    if (*b == 0)
      return false;  // |b| is shorter than the range.
    if (ToLowerASCII(*cursor) != *b)
      return false;
    ++cursor;
    ++b;
  }
  // The range is exhausted; it is a match only if |b| is exhausted too.
  return !*b;
}
|
58
|
+
|
59
|
+
const char kFileScheme[] = "file";  // Used in a number of places.
const char kMailtoScheme[] = "mailto";

// Schemes that are "standard" before anything is added at run time. Both the
// strings and the pointers to them are const: this table is only ever read.
const int kNumStandardURLSchemes = 7;
const char* const kStandardURLSchemes[kNumStandardURLSchemes] = {
  "http",
  "https",
  kFileScheme,  // Yes, file urls can have a hostname!
  "ftp",
  "gopher",
  "ws",  // WebSocket.
  "wss",  // WebSocket secure.
};

// List of the currently installed standard schemes. This list is lazily
// initialized by InitStandardSchemes and is leaked on shutdown to prevent
// any destructors from being called that will slow us down or cause problems.
std::vector<const char*>* standard_schemes = NULL;

// See the LockStandardSchemes declaration in the header.
bool standard_schemes_locked = false;

// Ensures that the standard_schemes list exists. The first call allocates it
// and seeds it with the entries of kStandardURLSchemes, in order; later calls
// do nothing while the list is still allocated.
void InitStandardSchemes() {
  if (standard_schemes)
    return;
  standard_schemes = new std::vector<const char*>(
      kStandardURLSchemes, kStandardURLSchemes + kNumStandardURLSchemes);
}
|
90
|
+
|
91
|
+
// Given a string and a range inside the string, compares it to the given
|
92
|
+
// lower-case |compare_to| buffer.
|
93
|
+
template<typename CHAR>
|
94
|
+
inline bool CompareSchemeComponent(const CHAR* spec,
|
95
|
+
const url_parse::Component& component,
|
96
|
+
const char* compare_to) {
|
97
|
+
if (!component.is_nonempty())
|
98
|
+
return compare_to[0] == 0; // When component is empty, match empty scheme.
|
99
|
+
return LowerCaseEqualsASCII(&spec[component.begin],
|
100
|
+
&spec[component.end()],
|
101
|
+
compare_to);
|
102
|
+
}
|
103
|
+
|
104
|
+
// Returns true if the given scheme identified by |scheme| within |spec| is one
|
105
|
+
// of the registered "standard" schemes.
|
106
|
+
template<typename CHAR>
|
107
|
+
bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
|
108
|
+
if (!scheme.is_nonempty())
|
109
|
+
return false; // Empty or invalid schemes are non-standard.
|
110
|
+
|
111
|
+
InitStandardSchemes();
|
112
|
+
for (size_t i = 0; i < standard_schemes->size(); i++) {
|
113
|
+
if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
|
114
|
+
standard_schemes->at(i)))
|
115
|
+
return true;
|
116
|
+
}
|
117
|
+
return false;
|
118
|
+
}
|
119
|
+
|
120
|
+
template<typename CHAR>
|
121
|
+
bool DoFindAndCompareScheme(const CHAR* str,
|
122
|
+
int str_len,
|
123
|
+
const char* compare,
|
124
|
+
url_parse::Component* found_scheme) {
|
125
|
+
// Before extracting scheme, canonicalize the URL to remove any whitespace.
|
126
|
+
// This matches the canonicalization done in DoCanonicalize function.
|
127
|
+
url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
|
128
|
+
int spec_len;
|
129
|
+
const CHAR* spec = RemoveURLWhitespace(str, str_len,
|
130
|
+
&whitespace_buffer, &spec_len);
|
131
|
+
|
132
|
+
url_parse::Component our_scheme;
|
133
|
+
if (!url_parse::ExtractScheme(spec, spec_len, &our_scheme)) {
|
134
|
+
// No scheme.
|
135
|
+
if (found_scheme)
|
136
|
+
*found_scheme = url_parse::Component();
|
137
|
+
return false;
|
138
|
+
}
|
139
|
+
if (found_scheme)
|
140
|
+
*found_scheme = our_scheme;
|
141
|
+
return CompareSchemeComponent(spec, our_scheme, compare);
|
142
|
+
}
|
143
|
+
|
144
|
+
template<typename CHAR>
|
145
|
+
bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
|
146
|
+
url_canon::CharsetConverter* charset_converter,
|
147
|
+
url_canon::CanonOutput* output,
|
148
|
+
url_parse::Parsed* output_parsed) {
|
149
|
+
// Remove any whitespace from the middle of the relative URL, possibly
|
150
|
+
// copying to the new buffer.
|
151
|
+
url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
|
152
|
+
int spec_len;
|
153
|
+
const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
|
154
|
+
&whitespace_buffer, &spec_len);
|
155
|
+
|
156
|
+
url_parse::Parsed parsed_input;
|
157
|
+
#ifdef WIN32
|
158
|
+
// For Windows, we allow things that look like absolute Windows paths to be
|
159
|
+
// fixed up magically to file URLs. This is done for IE compatability. For
|
160
|
+
// example, this will change "c:/foo" into a file URL rather than treating
|
161
|
+
// it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
|
162
|
+
// There is similar logic in url_canon_relative.cc for
|
163
|
+
//
|
164
|
+
// For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
|
165
|
+
// has no meaning as an absolute path name. This is because browsers on Mac
|
166
|
+
// & Unix don't generally do this, so there is no compatibility reason for
|
167
|
+
// doing so.
|
168
|
+
if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
|
169
|
+
url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
|
170
|
+
url_parse::ParseFileURL(spec, spec_len, &parsed_input);
|
171
|
+
return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
|
172
|
+
charset_converter,
|
173
|
+
output, output_parsed);
|
174
|
+
}
|
175
|
+
#endif
|
176
|
+
|
177
|
+
url_parse::Component scheme;
|
178
|
+
if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
|
179
|
+
return false;
|
180
|
+
|
181
|
+
// This is the parsed version of the input URL, we have to canonicalize it
|
182
|
+
// before storing it in our object.
|
183
|
+
bool success;
|
184
|
+
if (CompareSchemeComponent(spec, scheme, kFileScheme)) {
|
185
|
+
// File URLs are special.
|
186
|
+
url_parse::ParseFileURL(spec, spec_len, &parsed_input);
|
187
|
+
success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
|
188
|
+
charset_converter,
|
189
|
+
output, output_parsed);
|
190
|
+
|
191
|
+
} else if (DoIsStandard(spec, scheme)) {
|
192
|
+
// All "normal" URLs.
|
193
|
+
url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
|
194
|
+
success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
|
195
|
+
charset_converter,
|
196
|
+
output, output_parsed);
|
197
|
+
|
198
|
+
} else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {
|
199
|
+
// Mailto are treated like a standard url with only a scheme, path, query
|
200
|
+
url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
|
201
|
+
success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
|
202
|
+
output, output_parsed);
|
203
|
+
|
204
|
+
} else {
|
205
|
+
// "Weird" URLs like data: and javascript:
|
206
|
+
url_parse::ParsePathURL(spec, spec_len, &parsed_input);
|
207
|
+
success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
|
208
|
+
output, output_parsed);
|
209
|
+
}
|
210
|
+
return success;
|
211
|
+
}
|
212
|
+
|
213
|
+
template<typename CHAR>
|
214
|
+
bool DoResolveRelative(const char* base_spec,
|
215
|
+
int base_spec_len,
|
216
|
+
const url_parse::Parsed& base_parsed,
|
217
|
+
const CHAR* in_relative,
|
218
|
+
int in_relative_length,
|
219
|
+
url_canon::CharsetConverter* charset_converter,
|
220
|
+
url_canon::CanonOutput* output,
|
221
|
+
url_parse::Parsed* output_parsed) {
|
222
|
+
// Remove any whitespace from the middle of the relative URL, possibly
|
223
|
+
// copying to the new buffer.
|
224
|
+
url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
|
225
|
+
int relative_length;
|
226
|
+
const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
|
227
|
+
&whitespace_buffer,
|
228
|
+
&relative_length);
|
229
|
+
|
230
|
+
// See if our base URL should be treated as "standard".
|
231
|
+
bool standard_base_scheme =
|
232
|
+
base_parsed.scheme.is_nonempty() &&
|
233
|
+
DoIsStandard(base_spec, base_parsed.scheme);
|
234
|
+
|
235
|
+
bool is_relative;
|
236
|
+
url_parse::Component relative_component;
|
237
|
+
if (!url_canon::IsRelativeURL(base_spec, base_parsed,
|
238
|
+
relative, relative_length,
|
239
|
+
standard_base_scheme,
|
240
|
+
&is_relative,
|
241
|
+
&relative_component)) {
|
242
|
+
// Error resolving.
|
243
|
+
return false;
|
244
|
+
}
|
245
|
+
|
246
|
+
if (is_relative) {
|
247
|
+
// Relative, resolve and canonicalize.
|
248
|
+
bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
|
249
|
+
CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
|
250
|
+
return url_canon::ResolveRelativeURL(base_spec, base_parsed,
|
251
|
+
file_base_scheme, relative,
|
252
|
+
relative_component, charset_converter,
|
253
|
+
output, output_parsed);
|
254
|
+
}
|
255
|
+
|
256
|
+
// Not relative, canonicalize the input.
|
257
|
+
return DoCanonicalize(relative, relative_length, charset_converter,
|
258
|
+
output, output_parsed);
|
259
|
+
}
|
260
|
+
|
261
|
+
template<typename CHAR>
|
262
|
+
bool DoReplaceComponents(const char* spec,
|
263
|
+
int spec_len,
|
264
|
+
const url_parse::Parsed& parsed,
|
265
|
+
const url_canon::Replacements<CHAR>& replacements,
|
266
|
+
url_canon::CharsetConverter* charset_converter,
|
267
|
+
url_canon::CanonOutput* output,
|
268
|
+
url_parse::Parsed* out_parsed) {
|
269
|
+
// If the scheme is overridden, just do a simple string substitution and
|
270
|
+
// reparse the whole thing. There are lots of edge cases that we really don't
|
271
|
+
// want to deal with. Like what happens if I replace "http://e:8080/foo"
|
272
|
+
// with a file. Does it become "file:///E:/8080/foo" where the port number
|
273
|
+
// becomes part of the path? Parsing that string as a file URL says "yes"
|
274
|
+
// but almost no sane rule for dealing with the components individually would
|
275
|
+
// come up with that.
|
276
|
+
//
|
277
|
+
// Why allow these crazy cases at all? Programatically, there is almost no
|
278
|
+
// case for replacing the scheme. The most common case for hitting this is
|
279
|
+
// in JS when building up a URL using the location object. In this case, the
|
280
|
+
// JS code expects the string substitution behavior:
|
281
|
+
// http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
|
282
|
+
if (replacements.IsSchemeOverridden()) {
|
283
|
+
// Canonicalize the new scheme so it is 8-bit and can be concatenated with
|
284
|
+
// the existing spec.
|
285
|
+
url_canon::RawCanonOutput<128> scheme_replaced;
|
286
|
+
url_parse::Component scheme_replaced_parsed;
|
287
|
+
url_canon::CanonicalizeScheme(
|
288
|
+
replacements.sources().scheme,
|
289
|
+
replacements.components().scheme,
|
290
|
+
&scheme_replaced, &scheme_replaced_parsed);
|
291
|
+
|
292
|
+
// We can assume that the input is canonicalized, which means it always has
|
293
|
+
// a colon after the scheme (or where the scheme would be).
|
294
|
+
int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
|
295
|
+
: 1;
|
296
|
+
if (spec_len - spec_after_colon > 0) {
|
297
|
+
scheme_replaced.Append(&spec[spec_after_colon],
|
298
|
+
spec_len - spec_after_colon);
|
299
|
+
}
|
300
|
+
|
301
|
+
// We now need to completely re-parse the resulting string since its meaning
|
302
|
+
// may have changed with the different scheme.
|
303
|
+
url_canon::RawCanonOutput<128> recanonicalized;
|
304
|
+
url_parse::Parsed recanonicalized_parsed;
|
305
|
+
DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
|
306
|
+
charset_converter,
|
307
|
+
&recanonicalized, &recanonicalized_parsed);
|
308
|
+
|
309
|
+
// Recurse using the version with the scheme already replaced. This will now
|
310
|
+
// use the replacement rules for the new scheme.
|
311
|
+
//
|
312
|
+
// Warning: this code assumes that ReplaceComponents will re-check all
|
313
|
+
// components for validity. This is because we can't fail if DoCanonicalize
|
314
|
+
// failed above since theoretically the thing making it fail could be
|
315
|
+
// getting replaced here. If ReplaceComponents didn't re-check everything,
|
316
|
+
// we wouldn't know if something *not* getting replaced is a problem.
|
317
|
+
// If the scheme-specific replacers are made more intelligent so they don't
|
318
|
+
// re-check everything, we should instead recanonicalize the whole thing
|
319
|
+
// after this call to check validity (this assumes replacing the scheme is
|
320
|
+
// much much less common than other types of replacements, like clearing the
|
321
|
+
// ref).
|
322
|
+
url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
|
323
|
+
replacements_no_scheme.SetScheme(NULL, url_parse::Component());
|
324
|
+
return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
|
325
|
+
recanonicalized_parsed, replacements_no_scheme,
|
326
|
+
charset_converter, output, out_parsed);
|
327
|
+
}
|
328
|
+
|
329
|
+
// If we get here, then we know the scheme doesn't need to be replaced, so can
|
330
|
+
// just key off the scheme in the spec to know how to do the replacements.
|
331
|
+
if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {
|
332
|
+
return url_canon::ReplaceFileURL(spec, parsed, replacements,
|
333
|
+
charset_converter, output, out_parsed);
|
334
|
+
}
|
335
|
+
if (DoIsStandard(spec, parsed.scheme)) {
|
336
|
+
return url_canon::ReplaceStandardURL(spec, parsed, replacements,
|
337
|
+
charset_converter, output, out_parsed);
|
338
|
+
}
|
339
|
+
if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {
|
340
|
+
return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
|
341
|
+
output, out_parsed);
|
342
|
+
}
|
343
|
+
|
344
|
+
// Default is a path URL.
|
345
|
+
return url_canon::ReplacePathURL(spec, parsed, replacements,
|
346
|
+
output, out_parsed);
|
347
|
+
}
|
348
|
+
|
349
|
+
} // namespace
|
350
|
+
|
351
|
+
// Makes sure the list of standard schemes exists. InitStandardSchemes() does
// nothing if the list has already been created.
void Initialize() {
  InitStandardSchemes();
}
|
354
|
+
|
355
|
+
void Shutdown() {
|
356
|
+
if (standard_schemes) {
|
357
|
+
delete standard_schemes;
|
358
|
+
standard_schemes = NULL;
|
359
|
+
}
|
360
|
+
}
|
361
|
+
|
362
|
+
// Adds |new_scheme| to the list of standard schemes. The string is copied, so
// the caller's buffer does not need to outlive the call. A NULL or empty
// |new_scheme| is ignored.
void AddStandardScheme(const char* new_scheme) {
  // If this assert triggers, it means you've called AddStandardScheme after
  // LockStandardSchemes has been called (see the header file for
  // LockStandardSchemes for more).
  //
  // This normally means you're trying to set up a new standard scheme too late
  // in your application's init process. Locate where your app does this
  // initialization and calls LockStandardSchemes, and add your new standard
  // scheme there.
  DCHECK(!standard_schemes_locked) <<
      "Trying to add a standard scheme after the list has been locked.";

  // strlen() on a null pointer is undefined behaviour, so treat NULL the same
  // way as an empty string: there is nothing to add.
  if (!new_scheme)
    return;

  size_t scheme_len = strlen(new_scheme);
  if (scheme_len == 0)
    return;

  // Duplicate the scheme (including its terminating NUL) into a new buffer and
  // add it to the list of standard schemes. This pointer will be leaked on
  // shutdown.
  char* dup_scheme = new char[scheme_len + 1];
  memcpy(dup_scheme, new_scheme, scheme_len + 1);

  InitStandardSchemes();
  standard_schemes->push_back(dup_scheme);
}
|
386
|
+
|
387
|
+
// Marks the list of standard schemes as locked. AddStandardScheme() DCHECKs
// that this flag has not been set.
void LockStandardSchemes() {
  standard_schemes_locked = true;
}
|
390
|
+
|
391
|
+
bool IsStandard(const char* spec, const url_parse::Component& scheme) {
|
392
|
+
return DoIsStandard(spec, scheme);
|
393
|
+
}
|
394
|
+
|
395
|
+
bool IsStandard(const char16* spec, const url_parse::Component& scheme) {
|
396
|
+
return DoIsStandard(spec, scheme);
|
397
|
+
}
|
398
|
+
|
399
|
+
bool FindAndCompareScheme(const char* str,
|
400
|
+
int str_len,
|
401
|
+
const char* compare,
|
402
|
+
url_parse::Component* found_scheme) {
|
403
|
+
return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
|
404
|
+
}
|
405
|
+
|
406
|
+
bool FindAndCompareScheme(const char16* str,
|
407
|
+
int str_len,
|
408
|
+
const char* compare,
|
409
|
+
url_parse::Component* found_scheme) {
|
410
|
+
return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
|
411
|
+
}
|
412
|
+
|
413
|
+
bool Canonicalize(const char* spec,
|
414
|
+
int spec_len,
|
415
|
+
url_canon::CharsetConverter* charset_converter,
|
416
|
+
url_canon::CanonOutput* output,
|
417
|
+
url_parse::Parsed* output_parsed) {
|
418
|
+
return DoCanonicalize(spec, spec_len, charset_converter,
|
419
|
+
output, output_parsed);
|
420
|
+
}
|
421
|
+
|
422
|
+
bool Canonicalize(const char16* spec,
|
423
|
+
int spec_len,
|
424
|
+
url_canon::CharsetConverter* charset_converter,
|
425
|
+
url_canon::CanonOutput* output,
|
426
|
+
url_parse::Parsed* output_parsed) {
|
427
|
+
return DoCanonicalize(spec, spec_len, charset_converter,
|
428
|
+
output, output_parsed);
|
429
|
+
}
|
430
|
+
|
431
|
+
bool ResolveRelative(const char* base_spec,
|
432
|
+
int base_spec_len,
|
433
|
+
const url_parse::Parsed& base_parsed,
|
434
|
+
const char* relative,
|
435
|
+
int relative_length,
|
436
|
+
url_canon::CharsetConverter* charset_converter,
|
437
|
+
url_canon::CanonOutput* output,
|
438
|
+
url_parse::Parsed* output_parsed) {
|
439
|
+
return DoResolveRelative(base_spec, base_spec_len, base_parsed,
|
440
|
+
relative, relative_length,
|
441
|
+
charset_converter, output, output_parsed);
|
442
|
+
}
|
443
|
+
|
444
|
+
bool ResolveRelative(const char* base_spec,
|
445
|
+
int base_spec_len,
|
446
|
+
const url_parse::Parsed& base_parsed,
|
447
|
+
const char16* relative,
|
448
|
+
int relative_length,
|
449
|
+
url_canon::CharsetConverter* charset_converter,
|
450
|
+
url_canon::CanonOutput* output,
|
451
|
+
url_parse::Parsed* output_parsed) {
|
452
|
+
return DoResolveRelative(base_spec, base_spec_len, base_parsed,
|
453
|
+
relative, relative_length,
|
454
|
+
charset_converter, output, output_parsed);
|
455
|
+
}
|
456
|
+
|
457
|
+
bool ReplaceComponents(const char* spec,
|
458
|
+
int spec_len,
|
459
|
+
const url_parse::Parsed& parsed,
|
460
|
+
const url_canon::Replacements<char>& replacements,
|
461
|
+
url_canon::CharsetConverter* charset_converter,
|
462
|
+
url_canon::CanonOutput* output,
|
463
|
+
url_parse::Parsed* out_parsed) {
|
464
|
+
return DoReplaceComponents(spec, spec_len, parsed, replacements,
|
465
|
+
charset_converter, output, out_parsed);
|
466
|
+
}
|
467
|
+
|
468
|
+
bool ReplaceComponents(const char* spec,
|
469
|
+
int spec_len,
|
470
|
+
const url_parse::Parsed& parsed,
|
471
|
+
const url_canon::Replacements<char16>& replacements,
|
472
|
+
url_canon::CharsetConverter* charset_converter,
|
473
|
+
url_canon::CanonOutput* output,
|
474
|
+
url_parse::Parsed* out_parsed) {
|
475
|
+
return DoReplaceComponents(spec, spec_len, parsed, replacements,
|
476
|
+
charset_converter, output, out_parsed);
|
477
|
+
}
|
478
|
+
|
479
|
+
// Front-ends for LowerCaseEqualsASCII.
|
480
|
+
bool LowerCaseEqualsASCII(const char* a_begin,
|
481
|
+
const char* a_end,
|
482
|
+
const char* b) {
|
483
|
+
return DoLowerCaseEqualsASCII(a_begin, a_end, b);
|
484
|
+
}
|
485
|
+
|
486
|
+
bool LowerCaseEqualsASCII(const char* a_begin,
|
487
|
+
const char* a_end,
|
488
|
+
const char* b_begin,
|
489
|
+
const char* b_end) {
|
490
|
+
while (a_begin != a_end && b_begin != b_end &&
|
491
|
+
ToLowerASCII(*a_begin) == *b_begin) {
|
492
|
+
a_begin++;
|
493
|
+
b_begin++;
|
494
|
+
}
|
495
|
+
return a_begin == a_end && b_begin == b_end;
|
496
|
+
}
|
497
|
+
|
498
|
+
bool LowerCaseEqualsASCII(const char16* a_begin,
|
499
|
+
const char16* a_end,
|
500
|
+
const char* b) {
|
501
|
+
return DoLowerCaseEqualsASCII(a_begin, a_end, b);
|
502
|
+
}
|
503
|
+
|
504
|
+
void DecodeURLEscapeSequences(const char* input, int length,
|
505
|
+
url_canon::CanonOutputW* output) {
|
506
|
+
url_canon::RawCanonOutputT<char> unescaped_chars;
|
507
|
+
for (int i = 0; i < length; i++) {
|
508
|
+
if (input[i] == '%') {
|
509
|
+
unsigned char ch;
|
510
|
+
if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
|
511
|
+
unescaped_chars.push_back(ch);
|
512
|
+
} else {
|
513
|
+
// Invalid escape sequence, copy the percent literal.
|
514
|
+
unescaped_chars.push_back('%');
|
515
|
+
}
|
516
|
+
} else {
|
517
|
+
// Regular non-escaped 8-bit character.
|
518
|
+
unescaped_chars.push_back(input[i]);
|
519
|
+
}
|
520
|
+
}
|
521
|
+
|
522
|
+
// Convert that 8-bit to UTF-16. It's not clear IE does this at all to
|
523
|
+
// JavaScript URLs, but Firefox and Safari do.
|
524
|
+
for (int i = 0; i < unescaped_chars.length(); i++) {
|
525
|
+
unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
|
526
|
+
if (uch < 0x80) {
|
527
|
+
// Non-UTF-8, just append directly
|
528
|
+
output->push_back(uch);
|
529
|
+
} else {
|
530
|
+
// next_ch will point to the last character of the decoded
|
531
|
+
// character.
|
532
|
+
int next_character = i;
|
533
|
+
unsigned code_point;
|
534
|
+
if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
|
535
|
+
unescaped_chars.length(), &code_point)) {
|
536
|
+
// Valid UTF-8 character, convert to UTF-16.
|
537
|
+
url_canon::AppendUTF16Value(code_point, output);
|
538
|
+
i = next_character;
|
539
|
+
} else {
|
540
|
+
// If there are any sequences that are not valid UTF-8, we keep
|
541
|
+
// invalid code points and promote to UTF-16. We copy all characters
|
542
|
+
// from the current position to the end of the identified sequence.
|
543
|
+
while (i < next_character) {
|
544
|
+
output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
|
545
|
+
i++;
|
546
|
+
}
|
547
|
+
output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
|
548
|
+
}
|
549
|
+
}
|
550
|
+
}
|
551
|
+
}
|
552
|
+
|
553
|
+
} // namespace url_util
|