uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,128 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Functions for canonicalizing "path" URLs. Not to be confused with the path
31
+ // of a URL, these are URLs that have no authority section, only a path. For
32
+ // example, "javascript:" and "data:".
33
+
34
+ #include "url_canon.h"
35
+ #include "url_canon_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+ template<typename CHAR, typename UCHAR>
42
+ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
43
+ const url_parse::Parsed& parsed,
44
+ CanonOutput* output,
45
+ url_parse::Parsed* new_parsed) {
46
+ // Scheme: this will append the colon.
47
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
48
+ output, &new_parsed->scheme);
49
+
50
+ // We assume there's no authority for path URLs. Note that hosts should never
51
+ // have -1 length.
52
+ new_parsed->username.reset();
53
+ new_parsed->password.reset();
54
+ new_parsed->host.reset();
55
+ new_parsed->port.reset();
56
+
57
+ if (parsed.path.is_valid()) {
58
+ // Copy the path using path URL's more lax escaping rules (think for
59
+ // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
60
+ // ASCII characters alone. This helps readability of JavaStript.
61
+ new_parsed->path.begin = output->length();
62
+ int end = parsed.path.end();
63
+ for (int i = parsed.path.begin; i < end; i++) {
64
+ UCHAR uch = static_cast<UCHAR>(source.path[i]);
65
+ if (uch < 0x20 || uch >= 0x80)
66
+ success &= AppendUTF8EscapedChar(source.path, &i, end, output);
67
+ else
68
+ output->push_back(static_cast<char>(uch));
69
+ }
70
+ new_parsed->path.len = output->length() - new_parsed->path.begin;
71
+ } else {
72
+ // Empty path.
73
+ new_parsed->path.reset();
74
+ }
75
+
76
+ // Assume there's no query or ref.
77
+ new_parsed->query.reset();
78
+ new_parsed->ref.reset();
79
+
80
+ return success;
81
+ }
82
+
83
+ } // namespace
84
+
85
+ bool CanonicalizePathURL(const char* spec,
86
+ int spec_len,
87
+ const url_parse::Parsed& parsed,
88
+ CanonOutput* output,
89
+ url_parse::Parsed* new_parsed) {
90
+ return DoCanonicalizePathURL<char, unsigned char>(
91
+ URLComponentSource<char>(spec), parsed, output, new_parsed);
92
+ }
93
+
94
+ bool CanonicalizePathURL(const char16* spec,
95
+ int spec_len,
96
+ const url_parse::Parsed& parsed,
97
+ CanonOutput* output,
98
+ url_parse::Parsed* new_parsed) {
99
+ return DoCanonicalizePathURL<char16, char16>(
100
+ URLComponentSource<char16>(spec), parsed, output, new_parsed);
101
+ }
102
+
103
+ bool ReplacePathURL(const char* base,
104
+ const url_parse::Parsed& base_parsed,
105
+ const Replacements<char>& replacements,
106
+ CanonOutput* output,
107
+ url_parse::Parsed* new_parsed) {
108
+ URLComponentSource<char> source(base);
109
+ url_parse::Parsed parsed(base_parsed);
110
+ SetupOverrideComponents(base, replacements, &source, &parsed);
111
+ return DoCanonicalizePathURL<char, unsigned char>(
112
+ source, parsed, output, new_parsed);
113
+ }
114
+
115
+ bool ReplacePathURL(const char* base,
116
+ const url_parse::Parsed& base_parsed,
117
+ const Replacements<char16>& replacements,
118
+ CanonOutput* output,
119
+ url_parse::Parsed* new_parsed) {
120
+ RawCanonOutput<1024> utf8;
121
+ URLComponentSource<char> source(base);
122
+ url_parse::Parsed parsed(base_parsed);
123
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
124
+ return DoCanonicalizePathURL<char, unsigned char>(
125
+ source, parsed, output, new_parsed);
126
+ }
127
+
128
+ } // namespace url_canon
@@ -0,0 +1,189 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "url_canon.h"
31
+ #include "url_canon_internal.h"
32
+
33
+ // Query canonicalization in IE
34
+ // ----------------------------
35
+ // IE is very permissive for query parameters specified in links on the page
36
+ // (in contrast to links that it constructs itself based on form data). It does
37
+ // not unescape any character. It does not reject any escape sequence (be they
38
+ // invalid like "%2y" or freaky like %00).
39
+ //
40
+ // IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
41
+ // LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
42
+ // layer since they are removed from all portions of the URL). All other
43
+ // characters are passed unmodified. Invalid UTF-16 sequences are preserved as
44
+ // well, with each character in the input being converted to UTF-8. It is the
45
+ // server's job to make sense of this invalid query.
46
+ //
47
+ // Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
48
+ // are converted to the invalid character and sent as unescaped UTF-8 (0xef,
49
+ // 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
50
+ // strings before the URL handler ever sees them.
51
+ //
52
+ // Our query canonicalization
53
+ // --------------------------
54
+ // We escape all non-ASCII characters and control characters, like Firefox.
55
+ // This is more conformant to the URL spec, and there do not seem to be many
56
+ // problems relating to Firefox's behavior.
57
+ //
58
+ // Like IE, we will never unescape (although the application may want to try
59
+ // unescaping to present the user with a more understandable URL). We will
60
+ // replace all invalid sequences (including invalid UTF-16 sequences, which IE
61
+ // doesn't) with the "invalid character," and we will escape it.
62
+
63
+ namespace url_canon {
64
+
65
+ namespace {
66
+
67
+ // Returns true if the characters starting at |begin| and going until |end|
68
+ // (non-inclusive) are all representable in 7-bits.
69
+ template<typename CHAR, typename UCHAR>
70
+ bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {
71
+ int end = query.end();
72
+ for (int i = query.begin; i < end; i++) {
73
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)
74
+ return false;
75
+ }
76
+ return true;
77
+ }
78
+
79
+ // Appends the given string to the output, escaping characters that do not
80
+ // match the given |type| in SharedCharTypes. This version will accept 8 or 16
81
+ // bit characters, but assumes that they have only 7-bit values. It also assumes
82
+ // that all UTF-8 values are correct, so doesn't bother checking
83
+ template<typename CHAR>
84
+ void AppendRaw8BitQueryString(const CHAR* source, int length,
85
+ CanonOutput* output) {
86
+ for (int i = 0; i < length; i++) {
87
+ if (!IsQueryChar(static_cast<unsigned char>(source[i])))
88
+ AppendEscapedChar(static_cast<unsigned char>(source[i]), output);
89
+ else // Doesn't need escaping.
90
+ output->push_back(static_cast<char>(source[i]));
91
+ }
92
+ }
93
+
94
+ // Runs the converter on the given UTF-8 input. Since the converter expects
95
+ // UTF-16, we have to convert first. The converter must be non-NULL.
96
+ void RunConverter(const char* spec,
97
+ const url_parse::Component& query,
98
+ CharsetConverter* converter,
99
+ CanonOutput* output) {
100
+ // This function will replace any misencoded values with the invalid
101
+ // character. This is what we want so we don't have to check for error.
102
+ RawCanonOutputW<1024> utf16;
103
+ ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16);
104
+ converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);
105
+ }
106
+
107
+ // Runs the converter with the given UTF-16 input. We don't have to do
108
+ // anything, but this overriddden function allows us to use the same code
109
+ // for both UTF-8 and UTF-16 input.
110
+ void RunConverter(const char16* spec,
111
+ const url_parse::Component& query,
112
+ CharsetConverter* converter,
113
+ CanonOutput* output) {
114
+ converter->ConvertFromUTF16(&spec[query.begin], query.len, output);
115
+ }
116
+
117
+ template<typename CHAR, typename UCHAR>
118
+ void DoConvertToQueryEncoding(const CHAR* spec,
119
+ const url_parse::Component& query,
120
+ CharsetConverter* converter,
121
+ CanonOutput* output) {
122
+ if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
123
+ // Easy: the input can just appended with no character set conversions.
124
+ AppendRaw8BitQueryString(&spec[query.begin], query.len, output);
125
+
126
+ } else {
127
+ // Harder: convert to the proper encoding first.
128
+ if (converter) {
129
+ // Run the converter to get an 8-bit string, then append it, escaping
130
+ // necessary values.
131
+ RawCanonOutput<1024> eight_bit;
132
+ RunConverter(spec, query, converter, &eight_bit);
133
+ AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);
134
+
135
+ } else {
136
+ // No converter, do our own UTF-8 conversion.
137
+ AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
138
+ }
139
+ }
140
+ }
141
+
142
+ template<typename CHAR, typename UCHAR>
143
+ void DoCanonicalizeQuery(const CHAR* spec,
144
+ const url_parse::Component& query,
145
+ CharsetConverter* converter,
146
+ CanonOutput* output,
147
+ url_parse::Component* out_query) {
148
+ if (query.len < 0) {
149
+ *out_query = url_parse::Component();
150
+ return;
151
+ }
152
+
153
+ output->push_back('?');
154
+ out_query->begin = output->length();
155
+
156
+ DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
157
+
158
+ out_query->len = output->length() - out_query->begin;
159
+ }
160
+
161
+ } // namespace
162
+
163
+ void CanonicalizeQuery(const char* spec,
164
+ const url_parse::Component& query,
165
+ CharsetConverter* converter,
166
+ CanonOutput* output,
167
+ url_parse::Component* out_query) {
168
+ DoCanonicalizeQuery<char, unsigned char>(spec, query, converter,
169
+ output, out_query);
170
+ }
171
+
172
+ void CanonicalizeQuery(const char16* spec,
173
+ const url_parse::Component& query,
174
+ CharsetConverter* converter,
175
+ CanonOutput* output,
176
+ url_parse::Component* out_query) {
177
+ DoCanonicalizeQuery<char16, char16>(spec, query, converter,
178
+ output, out_query);
179
+ }
180
+
181
+ void ConvertUTF16ToQueryEncoding(const char16* input,
182
+ const url_parse::Component& query,
183
+ CharsetConverter* converter,
184
+ CanonOutput* output) {
185
+ DoConvertToQueryEncoding<char16, char16>(input, query,
186
+ converter, output);
187
+ }
188
+
189
+ } // namespace url_canon
@@ -0,0 +1,572 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Canonicalizer functions for working with and resolving relative URLs.
31
+
32
+ #include "logging.h"
33
+ #include "url_canon.h"
34
+ #include "url_canon_internal.h"
35
+ #include "url_file.h"
36
+ #include "url_parse_internal.h"
37
+
38
+ namespace url_canon {
39
+
40
+ namespace {
41
+
42
+ // Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
43
+ // 379034), whereas IE is case-insensetive.
44
+ //
45
+ // We choose to be more permissive like IE. We don't need to worry about
46
+ // unescaping or anything here: neither IE or Firefox allow this. We also
47
+ // don't have to worry about invalid scheme characters since we are comparing
48
+ // against the canonical scheme of the base.
49
+ //
50
+ // The base URL should always be canonical, therefore is ASCII.
51
+ template<typename CHAR>
52
+ bool AreSchemesEqual(const char* base,
53
+ const url_parse::Component& base_scheme,
54
+ const CHAR* cmp,
55
+ const url_parse::Component& cmp_scheme) {
56
+ if (base_scheme.len != cmp_scheme.len)
57
+ return false;
58
+ for (int i = 0; i < base_scheme.len; i++) {
59
+ // We assume the base is already canonical, so we don't have to
60
+ // canonicalize it.
61
+ if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) !=
62
+ base[base_scheme.begin + i])
63
+ return false;
64
+ }
65
+ return true;
66
+ }
67
+
68
+ #ifdef WIN32
69
+
70
+ // Here, we also allow Windows paths to be represented as "/C:/" so we can be
71
+ // consistent about URL paths beginning with slashes. This function is like
72
+ // DoesBeginWindowsDrivePath except that it also requires a slash at the
73
+ // beginning.
74
+ template<typename CHAR>
75
+ bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
76
+ int spec_len) {
77
+ if (start_offset >= spec_len)
78
+ return false;
79
+ return url_parse::IsURLSlash(spec[start_offset]) &&
80
+ url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len);
81
+ }
82
+
83
+ #endif // WIN32
84
+
85
+ // See IsRelativeURL in the header file for usage.
86
+ template<typename CHAR>
87
+ bool DoIsRelativeURL(const char* base,
88
+ const url_parse::Parsed& base_parsed,
89
+ const CHAR* url,
90
+ int url_len,
91
+ bool is_base_hierarchical,
92
+ bool* is_relative,
93
+ url_parse::Component* relative_component) {
94
+ *is_relative = false; // So we can default later to not relative.
95
+
96
+ // Trim whitespace and construct a new range for the substring.
97
+ int begin = 0;
98
+ url_parse::TrimURL(url, &begin, &url_len);
99
+ if (begin >= url_len) {
100
+ // Empty URLs are relative, but do nothing.
101
+ *relative_component = url_parse::Component(begin, 0);
102
+ *is_relative = true;
103
+ return true;
104
+ }
105
+
106
+ #ifdef WIN32
107
+ // We special case paths like "C:\foo" so they can link directly to the
108
+ // file on Windows (IE compatability). The security domain stuff should
109
+ // prevent a link like this from actually being followed if its on a
110
+ // web page.
111
+ //
112
+ // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
113
+ // as relative, as this will just replace the path when the base scheme
114
+ // is a file and the answer will still be correct.
115
+ //
116
+ // We require strict backslashes when detecting UNC since two forward
117
+ // shashes should be treated a a relative URL with a hostname.
118
+ if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) ||
119
+ url_parse::DoesBeginUNCPath(url, begin, url_len, true))
120
+ return true;
121
+ #endif // WIN32
122
+
123
+ // See if we've got a scheme, if not, we know this is a relative URL.
124
+ // BUT: Just because we have a scheme, doesn't make it absolute.
125
+ // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
126
+ // empty, we treat it as relative (":foo") like IE does.
127
+ url_parse::Component scheme;
128
+ if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
129
+ // Don't allow relative URLs if the base scheme doesn't support it.
130
+ if (!is_base_hierarchical)
131
+ return false;
132
+
133
+ *relative_component = url_parse::MakeRange(begin, url_len);
134
+ *is_relative = true;
135
+ return true;
136
+ }
137
+
138
+ // If the scheme isn't valid, then it's relative.
139
+ int scheme_end = scheme.end();
140
+ for (int i = scheme.begin; i < scheme_end; i++) {
141
+ if (!CanonicalSchemeChar(url[i])) {
142
+ *relative_component = url_parse::MakeRange(begin, url_len);
143
+ *is_relative = true;
144
+ return true;
145
+ }
146
+ }
147
+
148
+ // If the scheme is not the same, then we can't count it as relative.
149
+ if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
150
+ return true;
151
+
152
+ // When the scheme that they both share is not hierarchical, treat the
153
+ // incoming scheme as absolute (this way with the base of "data:foo",
154
+ // "data:bar" will be reported as absolute.
155
+ if (!is_base_hierarchical)
156
+ return true;
157
+
158
+ // ExtractScheme guarantees that the colon immediately follows what it
159
+ // considers to be the scheme. CountConsecutiveSlashes will handle the
160
+ // case where the begin offset is the end of the input.
161
+ int colon_offset = scheme.end();
162
+ int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1,
163
+ url_len);
164
+
165
+ if (num_slashes == 0 || num_slashes == 1) {
166
+ // No slashes means it's a relative path like "http:foo.html". One slash
167
+ // is an absolute path. "http:/home/foo.html"
168
+ *is_relative = true;
169
+ *relative_component = url_parse::MakeRange(colon_offset + 1, url_len);
170
+ return true;
171
+ }
172
+
173
+ // Two or more slashes after the scheme we treat as absolute.
174
+ return true;
175
+ }
176
+
177
+ // Copies all characters in the range [begin, end) of |spec| to the output,
178
+ // up until and including the last slash. There should be a slash in the
179
+ // range, if not, nothing will be copied.
180
+ //
181
+ // The input is assumed to be canonical, so we search only for exact slashes
182
+ // and not backslashes as well. We also know that it's ASCII.
183
+ void CopyToLastSlash(const char* spec,
184
+ int begin,
185
+ int end,
186
+ CanonOutput* output) {
187
+ // Find the last slash.
188
+ int last_slash = -1;
189
+ for (int i = end - 1; i >= begin; i--) {
190
+ if (spec[i] == '/') {
191
+ last_slash = i;
192
+ break;
193
+ }
194
+ }
195
+ if (last_slash < 0)
196
+ return; // No slash.
197
+
198
+ // Copy.
199
+ for (int i = begin; i <= last_slash; i++)
200
+ output->push_back(spec[i]);
201
+ }
202
+
203
+ // Copies a single component from the source to the output. This is used
204
+ // when resolving relative URLs and a given component is unchanged. Since the
205
+ // source should already be canonical, we don't have to do anything special,
206
+ // and the input is ASCII.
207
+ void CopyOneComponent(const char* source,
208
+ const url_parse::Component& source_component,
209
+ CanonOutput* output,
210
+ url_parse::Component* output_component) {
211
+ if (source_component.len < 0) {
212
+ // This component is not present.
213
+ *output_component = url_parse::Component();
214
+ return;
215
+ }
216
+
217
+ output_component->begin = output->length();
218
+ int source_end = source_component.end();
219
+ for (int i = source_component.begin; i < source_end; i++)
220
+ output->push_back(source[i]);
221
+ output_component->len = output->length() - output_component->begin;
222
+ }
223
+
224
+ #ifdef WIN32
225
+
226
+ // Called on Windows when the base URL is a file URL, this will copy the "C:"
227
+ // to the output, if there is a drive letter and if that drive letter is not
228
+ // being overridden by the relative URL. Otherwise, do nothing.
229
+ //
230
+ // It will return the index of the beginning of the next character in the
231
+ // base to be processed: if there is a "C:", the slash after it, or if
232
+ // there is no drive letter, the slash at the beginning of the path, or
233
+ // the end of the base. This can be used as the starting offset for further
234
+ // path processing.
235
+ template<typename CHAR>
236
+ int CopyBaseDriveSpecIfNecessary(const char* base_url,
237
+ int base_path_begin,
238
+ int base_path_end,
239
+ const CHAR* relative_url,
240
+ int path_start,
241
+ int relative_url_len,
242
+ CanonOutput* output) {
243
+ if (base_path_begin >= base_path_end)
244
+ return base_path_begin; // No path.
245
+
246
+ // If the relative begins with a drive spec, don't do anything. The existing
247
+ // drive spec in the base will be replaced.
248
+ if (url_parse::DoesBeginWindowsDriveSpec(relative_url,
249
+ path_start, relative_url_len)) {
250
+ return base_path_begin; // Relative URL path is "C:/foo"
251
+ }
252
+
253
+ // The path should begin with a slash (as all canonical paths do). We check
254
+ // if it is followed by a drive letter and copy it.
255
+ if (DoesBeginSlashWindowsDriveSpec(base_url,
256
+ base_path_begin,
257
+ base_path_end)) {
258
+ // Copy the two-character drive spec to the output. It will now look like
259
+ // "file:///C:" so the rest of it can be treated like a standard path.
260
+ output->push_back('/');
261
+ output->push_back(base_url[base_path_begin + 1]);
262
+ output->push_back(base_url[base_path_begin + 2]);
263
+ return base_path_begin + 3;
264
+ }
265
+
266
+ return base_path_begin;
267
+ }
268
+
269
+ #endif // WIN32
270
+
271
+ // A subroutine of DoResolveRelativeURL, this resolves the URL knowning that
272
+ // the input is a relative path or less (qyuery or ref).
273
+ template<typename CHAR>
274
+ bool DoResolveRelativePath(const char* base_url,
275
+ const url_parse::Parsed& base_parsed,
276
+ bool base_is_file,
277
+ const CHAR* relative_url,
278
+ const url_parse::Component& relative_component,
279
+ CharsetConverter* query_converter,
280
+ CanonOutput* output,
281
+ url_parse::Parsed* out_parsed) {
282
+ bool success = true;
283
+
284
+ // We know the authority section didn't change, copy it to the output. We
285
+ // also know we have a path so can copy up to there.
286
+ url_parse::Component path, query, ref;
287
+ url_parse::ParsePathInternal(relative_url,
288
+ relative_component,
289
+ &path,
290
+ &query,
291
+ &ref);
292
+ // Canonical URLs always have a path, so we can use that offset.
293
+ output->Append(base_url, base_parsed.path.begin);
294
+
295
+ if (path.len > 0) {
296
+ // The path is replaced or modified.
297
+ int true_path_begin = output->length();
298
+
299
+ // For file: URLs on Windows, we don't want to treat the drive letter and
300
+ // colon as part of the path for relative file resolution when the
301
+ // incoming URL does not provide a drive spec. We save the true path
302
+ // beginning so we can fix it up after we are done.
303
+ int base_path_begin = base_parsed.path.begin;
304
+ #ifdef WIN32
305
+ if (base_is_file) {
306
+ base_path_begin = CopyBaseDriveSpecIfNecessary(
307
+ base_url, base_parsed.path.begin, base_parsed.path.end(),
308
+ relative_url, relative_component.begin, relative_component.end(),
309
+ output);
310
+ // Now the output looks like either "file://" or "file:///C:"
311
+ // and we can start appending the rest of the path. |base_path_begin|
312
+ // points to the character in the base that comes next.
313
+ }
314
+ #endif // WIN32
315
+
316
+ if (url_parse::IsURLSlash(relative_url[path.begin])) {
317
+ // Easy case: the path is an absolute path on the server, so we can
318
+ // just replace everything from the path on with the new versions.
319
+ // Since the input should be canonical hierarchical URL, we should
320
+ // always have a path.
321
+ success &= CanonicalizePath(relative_url, path,
322
+ output, &out_parsed->path);
323
+ } else {
324
+ // Relative path, replace the query, and reference. We take the
325
+ // original path with the file part stripped, and append the new path.
326
+ // The canonicalizer will take care of resolving ".." and "."
327
+ int path_begin = output->length();
328
+ CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
329
+ output);
330
+ success &= CanonicalizePartialPath(relative_url, path, path_begin,
331
+ output);
332
+ out_parsed->path = url_parse::MakeRange(path_begin, output->length());
333
+
334
+ // Copy the rest of the stuff after the path from the relative path.
335
+ }
336
+
337
+ // Finish with the query and reference part (these can't fail).
338
+ CanonicalizeQuery(relative_url, query, query_converter,
339
+ output, &out_parsed->query);
340
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
341
+
342
+ // Fix the path beginning to add back the "C:" we may have written above.
343
+ out_parsed->path = url_parse::MakeRange(true_path_begin,
344
+ out_parsed->path.end());
345
+ return success;
346
+ }
347
+
348
+ // If we get here, the path is unchanged: copy to output.
349
+ CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
350
+
351
+ if (query.is_valid()) {
352
+ // Just the query specified, replace the query and reference (ignore
353
+ // failures for refs)
354
+ CanonicalizeQuery(relative_url, query, query_converter,
355
+ output, &out_parsed->query);
356
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
357
+ return success;
358
+ }
359
+
360
+ // If we get here, the query is unchanged: copy to output. Note that the
361
+ // range of the query parameter doesn't include the question mark, so we
362
+ // have to add it manually if there is a component.
363
+ if (base_parsed.query.is_valid())
364
+ output->push_back('?');
365
+ CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
366
+
367
+ if (ref.is_valid()) {
368
+ // Just the reference specified: replace it (ignoring failures).
369
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
370
+ return success;
371
+ }
372
+
373
+ // We should always have something to do in this function, the caller checks
374
+ // that some component is being replaced.
375
+ DCHECK(false) << "Not reached";
376
+ return success;
377
+ }
378
+
379
+ // Resolves a relative URL that contains a host. Typically, these will
380
+ // be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
381
+ // should be kept from the original URL is the scheme.
382
+ template<typename CHAR>
383
+ bool DoResolveRelativeHost(const char* base_url,
384
+ const url_parse::Parsed& base_parsed,
385
+ const CHAR* relative_url,
386
+ const url_parse::Component& relative_component,
387
+ CharsetConverter* query_converter,
388
+ CanonOutput* output,
389
+ url_parse::Parsed* out_parsed) {
390
+ // Parse the relative URL, just like we would for anything following a
391
+ // scheme.
392
+ url_parse::Parsed relative_parsed; // Everything but the scheme is valid.
393
+ url_parse::ParseAfterScheme(&relative_url[relative_component.begin],
394
+ relative_component.len, relative_component.begin,
395
+ &relative_parsed);
396
+
397
+ // Now we can just use the replacement function to replace all the necessary
398
+ // parts of the old URL with the new one.
399
+ Replacements<CHAR> replacements;
400
+ replacements.SetUsername(relative_url, relative_parsed.username);
401
+ replacements.SetPassword(relative_url, relative_parsed.password);
402
+ replacements.SetHost(relative_url, relative_parsed.host);
403
+ replacements.SetPort(relative_url, relative_parsed.port);
404
+ replacements.SetPath(relative_url, relative_parsed.path);
405
+ replacements.SetQuery(relative_url, relative_parsed.query);
406
+ replacements.SetRef(relative_url, relative_parsed.ref);
407
+
408
+ return ReplaceStandardURL(base_url, base_parsed, replacements,
409
+ query_converter, output, out_parsed);
410
+ }
411
+
412
+ // Resolves a relative URL that happens to be an absolute file path. Examples
413
+ // include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
414
+ template<typename CHAR>
415
+ bool DoResolveAbsoluteFile(const CHAR* relative_url,
416
+ const url_parse::Component& relative_component,
417
+ CharsetConverter* query_converter,
418
+ CanonOutput* output,
419
+ url_parse::Parsed* out_parsed) {
420
+ // Parse the file URL. The file URl parsing function uses the same logic
421
+ // as we do for determining if the file is absolute, in which case it will
422
+ // not bother to look for a scheme.
423
+ url_parse::Parsed relative_parsed;
424
+ url_parse::ParseFileURL(&relative_url[relative_component.begin],
425
+ relative_component.len, &relative_parsed);
426
+
427
+ return CanonicalizeFileURL(&relative_url[relative_component.begin],
428
+ relative_component.len, relative_parsed,
429
+ query_converter, output, out_parsed);
430
+ }
431
+
432
+ // TODO(brettw) treat two slashes as root like Mozilla for FTP?
433
+ template<typename CHAR>
434
+ bool DoResolveRelativeURL(const char* base_url,
435
+ const url_parse::Parsed& base_parsed,
436
+ bool base_is_file,
437
+ const CHAR* relative_url,
438
+ const url_parse::Component& relative_component,
439
+ CharsetConverter* query_converter,
440
+ CanonOutput* output,
441
+ url_parse::Parsed* out_parsed) {
442
+ // Starting point for our output parsed. We'll fix what we change.
443
+ *out_parsed = base_parsed;
444
+
445
+ // Sanity check: the input should have a host or we'll break badly below.
446
+ // We can only resolve relative URLs with base URLs that have hosts and
447
+ // paths (even the default path of "/" is OK).
448
+ //
449
+ // We allow hosts with no length so we can handle file URLs, for example.
450
+ if (base_parsed.path.len <= 0) {
451
+ // On error, return the input (resolving a relative URL on a non-relative
452
+ // base = the base).
453
+ int base_len = base_parsed.Length();
454
+ for (int i = 0; i < base_len; i++)
455
+ output->push_back(base_url[i]);
456
+ return false;
457
+ }
458
+
459
+ if (relative_component.len <= 0) {
460
+ // Empty relative URL, leave unchanged, only removing the ref component.
461
+ int base_len = base_parsed.Length();
462
+ base_len -= base_parsed.ref.len + 1;
463
+ out_parsed->ref.reset();
464
+ output->Append(base_url, base_len);
465
+ return true;
466
+ }
467
+
468
+ int num_slashes = url_parse::CountConsecutiveSlashes(
469
+ relative_url, relative_component.begin, relative_component.end());
470
+
471
+ #ifdef WIN32
472
+ // On Windows, two slashes for a file path (regardless of which direction
473
+ // they are) means that it's UNC. Two backslashes on any base scheme mean
474
+ // that it's an absolute UNC path (we use the base_is_file flag to control
475
+ // how strict the UNC finder is).
476
+ //
477
+ // We also allow Windows absolute drive specs on any scheme (for example
478
+ // "c:\foo") like IE does. There must be no preceeding slashes in this
479
+ // case (we reject anything like "/c:/foo") because that should be treated
480
+ // as a path. For file URLs, we allow any number of slashes since that would
481
+ // be setting the path.
482
+ //
483
+ // This assumes the absolute path resolver handles absolute URLs like this
484
+ // properly. url_util::DoCanonicalize does this.
485
+ int after_slashes = relative_component.begin + num_slashes;
486
+ if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin,
487
+ relative_component.end(), !base_is_file) ||
488
+ ((num_slashes == 0 || base_is_file) &&
489
+ url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes,
490
+ relative_component.end()))) {
491
+ return DoResolveAbsoluteFile(relative_url, relative_component,
492
+ query_converter, output, out_parsed);
493
+ }
494
+ #else
495
+ // Other platforms need explicit handling for file: URLs with multiple
496
+ // slashes because the generic scheme parsing always extracts a host, but a
497
+ // file: URL only has a host if it has exactly 2 slashes. This also
498
+ // handles the special case where the URL is only slashes, since that
499
+ // doesn't have a host part either.
500
+ if (base_is_file &&
501
+ (num_slashes > 2 || num_slashes == relative_component.len)) {
502
+ return DoResolveAbsoluteFile(relative_url, relative_component,
503
+ query_converter, output, out_parsed);
504
+ }
505
+ #endif
506
+
507
+ // Any other double-slashes mean that this is relative to the scheme.
508
+ if (num_slashes >= 2) {
509
+ return DoResolveRelativeHost(base_url, base_parsed,
510
+ relative_url, relative_component,
511
+ query_converter, output, out_parsed);
512
+ }
513
+
514
+ // When we get here, we know that the relative URL is on the same host.
515
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file,
516
+ relative_url, relative_component,
517
+ query_converter, output, out_parsed);
518
+ }
519
+
520
+ } // namespace
521
+
522
+ bool IsRelativeURL(const char* base,
523
+ const url_parse::Parsed& base_parsed,
524
+ const char* fragment,
525
+ int fragment_len,
526
+ bool is_base_hierarchical,
527
+ bool* is_relative,
528
+ url_parse::Component* relative_component) {
529
+ return DoIsRelativeURL<char>(
530
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
531
+ is_relative, relative_component);
532
+ }
533
+
534
+ bool IsRelativeURL(const char* base,
535
+ const url_parse::Parsed& base_parsed,
536
+ const char16* fragment,
537
+ int fragment_len,
538
+ bool is_base_hierarchical,
539
+ bool* is_relative,
540
+ url_parse::Component* relative_component) {
541
+ return DoIsRelativeURL<char16>(
542
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
543
+ is_relative, relative_component);
544
+ }
545
+
546
+ bool ResolveRelativeURL(const char* base_url,
547
+ const url_parse::Parsed& base_parsed,
548
+ bool base_is_file,
549
+ const char* relative_url,
550
+ const url_parse::Component& relative_component,
551
+ CharsetConverter* query_converter,
552
+ CanonOutput* output,
553
+ url_parse::Parsed* out_parsed) {
554
+ return DoResolveRelativeURL<char>(
555
+ base_url, base_parsed, base_is_file, relative_url,
556
+ relative_component, query_converter, output, out_parsed);
557
+ }
558
+
559
+ bool ResolveRelativeURL(const char* base_url,
560
+ const url_parse::Parsed& base_parsed,
561
+ bool base_is_file,
562
+ const char16* relative_url,
563
+ const url_parse::Component& relative_component,
564
+ CharsetConverter* query_converter,
565
+ CanonOutput* output,
566
+ url_parse::Parsed* out_parsed) {
567
+ return DoResolveRelativeURL<char16>(
568
+ base_url, base_parsed, base_is_file, relative_url,
569
+ relative_component, query_converter, output, out_parsed);
570
+ }
571
+
572
+ } // namespace url_canon