uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,128 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Functions for canonicalizing "path" URLs. Not to be confused with the path
31
+ // of a URL, these are URLs that have no authority section, only a path. For
32
+ // example, "javascript:" and "data:".
33
+
34
+ #include "url_canon.h"
35
+ #include "url_canon_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+ template<typename CHAR, typename UCHAR>
42
+ bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
43
+ const url_parse::Parsed& parsed,
44
+ CanonOutput* output,
45
+ url_parse::Parsed* new_parsed) {
46
+ // Scheme: this will append the colon.
47
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
48
+ output, &new_parsed->scheme);
49
+
50
+ // We assume there's no authority for path URLs. Note that hosts should never
51
+ // have -1 length.
52
+ new_parsed->username.reset();
53
+ new_parsed->password.reset();
54
+ new_parsed->host.reset();
55
+ new_parsed->port.reset();
56
+
57
+ if (parsed.path.is_valid()) {
58
+ // Copy the path using path URL's more lax escaping rules (think for
59
+ // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all
60
+ // ASCII characters alone. This helps readability of JavaScript.
61
+ new_parsed->path.begin = output->length();
62
+ int end = parsed.path.end();
63
+ for (int i = parsed.path.begin; i < end; i++) {
64
+ UCHAR uch = static_cast<UCHAR>(source.path[i]);
65
+ if (uch < 0x20 || uch >= 0x80)
66
+ success &= AppendUTF8EscapedChar(source.path, &i, end, output);
67
+ else
68
+ output->push_back(static_cast<char>(uch));
69
+ }
70
+ new_parsed->path.len = output->length() - new_parsed->path.begin;
71
+ } else {
72
+ // Empty path.
73
+ new_parsed->path.reset();
74
+ }
75
+
76
+ // Assume there's no query or ref.
77
+ new_parsed->query.reset();
78
+ new_parsed->ref.reset();
79
+
80
+ return success;
81
+ }
82
+
83
+ } // namespace
84
+
85
+ bool CanonicalizePathURL(const char* spec,
86
+ int spec_len,
87
+ const url_parse::Parsed& parsed,
88
+ CanonOutput* output,
89
+ url_parse::Parsed* new_parsed) {
90
+ return DoCanonicalizePathURL<char, unsigned char>(
91
+ URLComponentSource<char>(spec), parsed, output, new_parsed);
92
+ }
93
+
94
+ bool CanonicalizePathURL(const char16* spec,
95
+ int spec_len,
96
+ const url_parse::Parsed& parsed,
97
+ CanonOutput* output,
98
+ url_parse::Parsed* new_parsed) {
99
+ return DoCanonicalizePathURL<char16, char16>(
100
+ URLComponentSource<char16>(spec), parsed, output, new_parsed);
101
+ }
102
+
103
+ bool ReplacePathURL(const char* base,
104
+ const url_parse::Parsed& base_parsed,
105
+ const Replacements<char>& replacements,
106
+ CanonOutput* output,
107
+ url_parse::Parsed* new_parsed) {
108
+ URLComponentSource<char> source(base);
109
+ url_parse::Parsed parsed(base_parsed);
110
+ SetupOverrideComponents(base, replacements, &source, &parsed);
111
+ return DoCanonicalizePathURL<char, unsigned char>(
112
+ source, parsed, output, new_parsed);
113
+ }
114
+
115
+ bool ReplacePathURL(const char* base,
116
+ const url_parse::Parsed& base_parsed,
117
+ const Replacements<char16>& replacements,
118
+ CanonOutput* output,
119
+ url_parse::Parsed* new_parsed) {
120
+ RawCanonOutput<1024> utf8;
121
+ URLComponentSource<char> source(base);
122
+ url_parse::Parsed parsed(base_parsed);
123
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
124
+ return DoCanonicalizePathURL<char, unsigned char>(
125
+ source, parsed, output, new_parsed);
126
+ }
127
+
128
+ } // namespace url_canon
@@ -0,0 +1,189 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "url_canon.h"
31
+ #include "url_canon_internal.h"
32
+
33
+ // Query canonicalization in IE
34
+ // ----------------------------
35
+ // IE is very permissive for query parameters specified in links on the page
36
+ // (in contrast to links that it constructs itself based on form data). It does
37
+ // not unescape any character. It does not reject any escape sequence (be they
38
+ // invalid like "%2y" or freaky like %00).
39
+ //
40
+ // IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
41
+ // LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
42
+ // layer since they are removed from all portions of the URL). All other
43
+ // characters are passed unmodified. Invalid UTF-16 sequences are preserved as
44
+ // well, with each character in the input being converted to UTF-8. It is the
45
+ // server's job to make sense of this invalid query.
46
+ //
47
+ // Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
48
+ // are converted to the invalid character and sent as unescaped UTF-8 (0xef,
49
+ // 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
50
+ // strings before the URL handler ever sees them.
51
+ //
52
+ // Our query canonicalization
53
+ // --------------------------
54
+ // We escape all non-ASCII characters and control characters, like Firefox.
55
+ // This is more conformant to the URL spec, and there do not seem to be many
56
+ // problems relating to Firefox's behavior.
57
+ //
58
+ // Like IE, we will never unescape (although the application may want to try
59
+ // unescaping to present the user with a more understandable URL). We will
60
+ // replace all invalid sequences (including invalid UTF-16 sequences, which IE
61
+ // doesn't) with the "invalid character," and we will escape it.
62
+
63
+ namespace url_canon {
64
+
65
+ namespace {
66
+
67
+ // Returns true if the characters starting at |begin| and going until |end|
68
+ // (non-inclusive) are all representable in 7-bits.
69
+ template<typename CHAR, typename UCHAR>
70
+ bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {
71
+ int end = query.end();
72
+ for (int i = query.begin; i < end; i++) {
73
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)
74
+ return false;
75
+ }
76
+ return true;
77
+ }
78
+
79
+ // Appends the given string to the output, escaping characters that do not
80
+ // match the given |type| in SharedCharTypes. This version will accept 8 or 16
81
+ // bit characters, but assumes that they have only 7-bit values. It also assumes
82
+ // that all UTF-8 values are correct, so it doesn't bother checking.
83
+ template<typename CHAR>
84
+ void AppendRaw8BitQueryString(const CHAR* source, int length,
85
+ CanonOutput* output) {
86
+ for (int i = 0; i < length; i++) {
87
+ if (!IsQueryChar(static_cast<unsigned char>(source[i])))
88
+ AppendEscapedChar(static_cast<unsigned char>(source[i]), output);
89
+ else // Doesn't need escaping.
90
+ output->push_back(static_cast<char>(source[i]));
91
+ }
92
+ }
93
+
94
+ // Runs the converter on the given UTF-8 input. Since the converter expects
95
+ // UTF-16, we have to convert first. The converter must be non-NULL.
96
+ void RunConverter(const char* spec,
97
+ const url_parse::Component& query,
98
+ CharsetConverter* converter,
99
+ CanonOutput* output) {
100
+ // This function will replace any misencoded values with the invalid
101
+ // character. This is what we want so we don't have to check for error.
102
+ RawCanonOutputW<1024> utf16;
103
+ ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16);
104
+ converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);
105
+ }
106
+
107
+ // Runs the converter with the given UTF-16 input. We don't have to do
108
+ // anything, but this overloaded function allows us to use the same code
109
+ // for both UTF-8 and UTF-16 input.
110
+ void RunConverter(const char16* spec,
111
+ const url_parse::Component& query,
112
+ CharsetConverter* converter,
113
+ CanonOutput* output) {
114
+ converter->ConvertFromUTF16(&spec[query.begin], query.len, output);
115
+ }
116
+
117
+ template<typename CHAR, typename UCHAR>
118
+ void DoConvertToQueryEncoding(const CHAR* spec,
119
+ const url_parse::Component& query,
120
+ CharsetConverter* converter,
121
+ CanonOutput* output) {
122
+ if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
123
+ // Easy: the input can just be appended with no character set conversions.
124
+ AppendRaw8BitQueryString(&spec[query.begin], query.len, output);
125
+
126
+ } else {
127
+ // Harder: convert to the proper encoding first.
128
+ if (converter) {
129
+ // Run the converter to get an 8-bit string, then append it, escaping
130
+ // necessary values.
131
+ RawCanonOutput<1024> eight_bit;
132
+ RunConverter(spec, query, converter, &eight_bit);
133
+ AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);
134
+
135
+ } else {
136
+ // No converter, do our own UTF-8 conversion.
137
+ AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
138
+ }
139
+ }
140
+ }
141
+
142
+ template<typename CHAR, typename UCHAR>
143
+ void DoCanonicalizeQuery(const CHAR* spec,
144
+ const url_parse::Component& query,
145
+ CharsetConverter* converter,
146
+ CanonOutput* output,
147
+ url_parse::Component* out_query) {
148
+ if (query.len < 0) {
149
+ *out_query = url_parse::Component();
150
+ return;
151
+ }
152
+
153
+ output->push_back('?');
154
+ out_query->begin = output->length();
155
+
156
+ DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
157
+
158
+ out_query->len = output->length() - out_query->begin;
159
+ }
160
+
161
+ } // namespace
162
+
163
+ void CanonicalizeQuery(const char* spec,
164
+ const url_parse::Component& query,
165
+ CharsetConverter* converter,
166
+ CanonOutput* output,
167
+ url_parse::Component* out_query) {
168
+ DoCanonicalizeQuery<char, unsigned char>(spec, query, converter,
169
+ output, out_query);
170
+ }
171
+
172
+ void CanonicalizeQuery(const char16* spec,
173
+ const url_parse::Component& query,
174
+ CharsetConverter* converter,
175
+ CanonOutput* output,
176
+ url_parse::Component* out_query) {
177
+ DoCanonicalizeQuery<char16, char16>(spec, query, converter,
178
+ output, out_query);
179
+ }
180
+
181
+ void ConvertUTF16ToQueryEncoding(const char16* input,
182
+ const url_parse::Component& query,
183
+ CharsetConverter* converter,
184
+ CanonOutput* output) {
185
+ DoConvertToQueryEncoding<char16, char16>(input, query,
186
+ converter, output);
187
+ }
188
+
189
+ } // namespace url_canon
@@ -0,0 +1,572 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Canonicalizer functions for working with and resolving relative URLs.
31
+
32
+ #include "logging.h"
33
+ #include "url_canon.h"
34
+ #include "url_canon_internal.h"
35
+ #include "url_file.h"
36
+ #include "url_parse_internal.h"
37
+
38
+ namespace url_canon {
39
+
40
+ namespace {
41
+
42
+ // Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
43
+ // 379034), whereas IE is case-insensitive.
44
+ //
45
+ // We choose to be more permissive like IE. We don't need to worry about
46
+ // unescaping or anything here: neither IE nor Firefox allows this. We also
47
+ // don't have to worry about invalid scheme characters since we are comparing
48
+ // against the canonical scheme of the base.
49
+ //
50
+ // The base URL should always be canonical, therefore is ASCII.
51
+ template<typename CHAR>
52
+ bool AreSchemesEqual(const char* base,
53
+ const url_parse::Component& base_scheme,
54
+ const CHAR* cmp,
55
+ const url_parse::Component& cmp_scheme) {
56
+ if (base_scheme.len != cmp_scheme.len)
57
+ return false;
58
+ for (int i = 0; i < base_scheme.len; i++) {
59
+ // We assume the base is already canonical, so we don't have to
60
+ // canonicalize it.
61
+ if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) !=
62
+ base[base_scheme.begin + i])
63
+ return false;
64
+ }
65
+ return true;
66
+ }
67
+
68
+ #ifdef WIN32
69
+
70
+ // Here, we also allow Windows paths to be represented as "/C:/" so we can be
71
+ // consistent about URL paths beginning with slashes. This function is like
72
+ // DoesBeginWindowsDrivePath except that it also requires a slash at the
73
+ // beginning.
74
+ template<typename CHAR>
75
+ bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
76
+ int spec_len) {
77
+ if (start_offset >= spec_len)
78
+ return false;
79
+ return url_parse::IsURLSlash(spec[start_offset]) &&
80
+ url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len);
81
+ }
82
+
83
+ #endif // WIN32
84
+
85
+ // See IsRelativeURL in the header file for usage.
86
+ template<typename CHAR>
87
+ bool DoIsRelativeURL(const char* base,
88
+ const url_parse::Parsed& base_parsed,
89
+ const CHAR* url,
90
+ int url_len,
91
+ bool is_base_hierarchical,
92
+ bool* is_relative,
93
+ url_parse::Component* relative_component) {
94
+ *is_relative = false; // So we can default later to not relative.
95
+
96
+ // Trim whitespace and construct a new range for the substring.
97
+ int begin = 0;
98
+ url_parse::TrimURL(url, &begin, &url_len);
99
+ if (begin >= url_len) {
100
+ // Empty URLs are relative, but do nothing.
101
+ *relative_component = url_parse::Component(begin, 0);
102
+ *is_relative = true;
103
+ return true;
104
+ }
105
+
106
+ #ifdef WIN32
107
+ // We special case paths like "C:\foo" so they can link directly to the
108
+ // file on Windows (IE compatibility). The security domain stuff should
109
+ // prevent a link like this from actually being followed if it's on a
110
+ // web page.
111
+ //
112
+ // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
113
+ // as relative, as this will just replace the path when the base scheme
114
+ // is a file and the answer will still be correct.
115
+ //
116
+ // We require strict backslashes when detecting UNC since two forward
117
+ // slashes should be treated as a relative URL with a hostname.
118
+ if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) ||
119
+ url_parse::DoesBeginUNCPath(url, begin, url_len, true))
120
+ return true;
121
+ #endif // WIN32
122
+
123
+ // See if we've got a scheme, if not, we know this is a relative URL.
124
+ // BUT: Just because we have a scheme, doesn't make it absolute.
125
+ // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
126
+ // empty, we treat it as relative (":foo") like IE does.
127
+ url_parse::Component scheme;
128
+ if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
129
+ // Don't allow relative URLs if the base scheme doesn't support it.
130
+ if (!is_base_hierarchical)
131
+ return false;
132
+
133
+ *relative_component = url_parse::MakeRange(begin, url_len);
134
+ *is_relative = true;
135
+ return true;
136
+ }
137
+
138
+ // If the scheme isn't valid, then it's relative.
139
+ int scheme_end = scheme.end();
140
+ for (int i = scheme.begin; i < scheme_end; i++) {
141
+ if (!CanonicalSchemeChar(url[i])) {
142
+ *relative_component = url_parse::MakeRange(begin, url_len);
143
+ *is_relative = true;
144
+ return true;
145
+ }
146
+ }
147
+
148
+ // If the scheme is not the same, then we can't count it as relative.
149
+ if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
150
+ return true;
151
+
152
+ // When the scheme that they both share is not hierarchical, treat the
153
+ // incoming scheme as absolute (this way with the base of "data:foo",
154
+ // "data:bar" will be reported as absolute).
155
+ if (!is_base_hierarchical)
156
+ return true;
157
+
158
+ // ExtractScheme guarantees that the colon immediately follows what it
159
+ // considers to be the scheme. CountConsecutiveSlashes will handle the
160
+ // case where the begin offset is the end of the input.
161
+ int colon_offset = scheme.end();
162
+ int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1,
163
+ url_len);
164
+
165
+ if (num_slashes == 0 || num_slashes == 1) {
166
+ // No slashes means it's a relative path like "http:foo.html". One slash
167
+ // is an absolute path. "http:/home/foo.html"
168
+ *is_relative = true;
169
+ *relative_component = url_parse::MakeRange(colon_offset + 1, url_len);
170
+ return true;
171
+ }
172
+
173
+ // Two or more slashes after the scheme we treat as absolute.
174
+ return true;
175
+ }
176
+
177
+ // Copies all characters in the range [begin, end) of |spec| to the output,
178
+ // up until and including the last slash. There should be a slash in the
179
+ // range, if not, nothing will be copied.
180
+ //
181
+ // The input is assumed to be canonical, so we search only for exact slashes
182
+ // and not backslashes as well. We also know that it's ASCII.
183
+ void CopyToLastSlash(const char* spec,
184
+ int begin,
185
+ int end,
186
+ CanonOutput* output) {
187
+ // Find the last slash.
188
+ int last_slash = -1;
189
+ for (int i = end - 1; i >= begin; i--) {
190
+ if (spec[i] == '/') {
191
+ last_slash = i;
192
+ break;
193
+ }
194
+ }
195
+ if (last_slash < 0)
196
+ return; // No slash.
197
+
198
+ // Copy.
199
+ for (int i = begin; i <= last_slash; i++)
200
+ output->push_back(spec[i]);
201
+ }
202
+
203
+ // Copies a single component from the source to the output. This is used
204
+ // when resolving relative URLs and a given component is unchanged. Since the
205
+ // source should already be canonical, we don't have to do anything special,
206
+ // and the input is ASCII.
207
+ void CopyOneComponent(const char* source,
208
+ const url_parse::Component& source_component,
209
+ CanonOutput* output,
210
+ url_parse::Component* output_component) {
211
+ if (source_component.len < 0) {
212
+ // This component is not present.
213
+ *output_component = url_parse::Component();
214
+ return;
215
+ }
216
+
217
+ output_component->begin = output->length();
218
+ int source_end = source_component.end();
219
+ for (int i = source_component.begin; i < source_end; i++)
220
+ output->push_back(source[i]);
221
+ output_component->len = output->length() - output_component->begin;
222
+ }
223
+
224
+ #ifdef WIN32
225
+
226
+ // Called on Windows when the base URL is a file URL, this will copy the "C:"
227
+ // to the output, if there is a drive letter and if that drive letter is not
228
+ // being overridden by the relative URL. Otherwise, do nothing.
229
+ //
230
+ // It will return the index of the beginning of the next character in the
231
+ // base to be processed: if there is a "C:", the slash after it, or if
232
+ // there is no drive letter, the slash at the beginning of the path, or
233
+ // the end of the base. This can be used as the starting offset for further
234
+ // path processing.
235
+ template<typename CHAR>
236
+ int CopyBaseDriveSpecIfNecessary(const char* base_url,
237
+ int base_path_begin,
238
+ int base_path_end,
239
+ const CHAR* relative_url,
240
+ int path_start,
241
+ int relative_url_len,
242
+ CanonOutput* output) {
243
+ if (base_path_begin >= base_path_end)
244
+ return base_path_begin; // No path.
245
+
246
+ // If the relative begins with a drive spec, don't do anything. The existing
247
+ // drive spec in the base will be replaced.
248
+ if (url_parse::DoesBeginWindowsDriveSpec(relative_url,
249
+ path_start, relative_url_len)) {
250
+ return base_path_begin; // Relative URL path is "C:/foo"
251
+ }
252
+
253
+ // The path should begin with a slash (as all canonical paths do). We check
254
+ // if it is followed by a drive letter and copy it.
255
+ if (DoesBeginSlashWindowsDriveSpec(base_url,
256
+ base_path_begin,
257
+ base_path_end)) {
258
+ // Copy the two-character drive spec to the output. It will now look like
259
+ // "file:///C:" so the rest of it can be treated like a standard path.
260
+ output->push_back('/');
261
+ output->push_back(base_url[base_path_begin + 1]);
262
+ output->push_back(base_url[base_path_begin + 2]);
263
+ return base_path_begin + 3;
264
+ }
265
+
266
+ return base_path_begin;
267
+ }
268
+
269
+ #endif // WIN32
270
+
271
+ // A subroutine of DoResolveRelativeURL, this resolves the URL knowing that
272
+ // the input is a relative path or less (query or ref).
273
+ template<typename CHAR>
274
+ bool DoResolveRelativePath(const char* base_url,
275
+ const url_parse::Parsed& base_parsed,
276
+ bool base_is_file,
277
+ const CHAR* relative_url,
278
+ const url_parse::Component& relative_component,
279
+ CharsetConverter* query_converter,
280
+ CanonOutput* output,
281
+ url_parse::Parsed* out_parsed) {
282
+ bool success = true;
283
+
284
+ // We know the authority section didn't change, copy it to the output. We
285
+ // also know we have a path so can copy up to there.
286
+ url_parse::Component path, query, ref;
287
+ url_parse::ParsePathInternal(relative_url,
288
+ relative_component,
289
+ &path,
290
+ &query,
291
+ &ref);
292
+ // Canonical URLs always have a path, so we can use that offset.
293
+ output->Append(base_url, base_parsed.path.begin);
294
+
295
+ if (path.len > 0) {
296
+ // The path is replaced or modified.
297
+ int true_path_begin = output->length();
298
+
299
+ // For file: URLs on Windows, we don't want to treat the drive letter and
300
+ // colon as part of the path for relative file resolution when the
301
+ // incoming URL does not provide a drive spec. We save the true path
302
+ // beginning so we can fix it up after we are done.
303
+ int base_path_begin = base_parsed.path.begin;
304
+ #ifdef WIN32
305
+ if (base_is_file) {
306
+ base_path_begin = CopyBaseDriveSpecIfNecessary(
307
+ base_url, base_parsed.path.begin, base_parsed.path.end(),
308
+ relative_url, relative_component.begin, relative_component.end(),
309
+ output);
310
+ // Now the output looks like either "file://" or "file:///C:"
311
+ // and we can start appending the rest of the path. |base_path_begin|
312
+ // points to the character in the base that comes next.
313
+ }
314
+ #endif // WIN32
315
+
316
+ if (url_parse::IsURLSlash(relative_url[path.begin])) {
317
+ // Easy case: the path is an absolute path on the server, so we can
318
+ // just replace everything from the path on with the new versions.
319
+ // Since the input should be canonical hierarchical URL, we should
320
+ // always have a path.
321
+ success &= CanonicalizePath(relative_url, path,
322
+ output, &out_parsed->path);
323
+ } else {
324
+ // Relative path, replace the query, and reference. We take the
325
+ // original path with the file part stripped, and append the new path.
326
+ // The canonicalizer will take care of resolving ".." and "."
327
+ int path_begin = output->length();
328
+ CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
329
+ output);
330
+ success &= CanonicalizePartialPath(relative_url, path, path_begin,
331
+ output);
332
+ out_parsed->path = url_parse::MakeRange(path_begin, output->length());
333
+
334
+ // Copy the rest of the stuff after the path from the relative path.
335
+ }
336
+
337
+ // Finish with the query and reference part (these can't fail).
338
+ CanonicalizeQuery(relative_url, query, query_converter,
339
+ output, &out_parsed->query);
340
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
341
+
342
+ // Fix the path beginning to add back the "C:" we may have written above.
343
+ out_parsed->path = url_parse::MakeRange(true_path_begin,
344
+ out_parsed->path.end());
345
+ return success;
346
+ }
347
+
348
+ // If we get here, the path is unchanged: copy to output.
349
+ CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
350
+
351
+ if (query.is_valid()) {
352
+ // Just the query specified, replace the query and reference (ignore
353
+ // failures for refs)
354
+ CanonicalizeQuery(relative_url, query, query_converter,
355
+ output, &out_parsed->query);
356
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
357
+ return success;
358
+ }
359
+
360
+ // If we get here, the query is unchanged: copy to output. Note that the
361
+ // range of the query parameter doesn't include the question mark, so we
362
+ // have to add it manually if there is a component.
363
+ if (base_parsed.query.is_valid())
364
+ output->push_back('?');
365
+ CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
366
+
367
+ if (ref.is_valid()) {
368
+ // Just the reference specified: replace it (ignoring failures).
369
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
370
+ return success;
371
+ }
372
+
373
+ // We should always have something to do in this function, the caller checks
374
+ // that some component is being replaced.
375
+ DCHECK(false) << "Not reached";
376
+ return success;
377
+ }
378
+
379
+ // Resolves a relative URL that contains a host. Typically, these will
380
+ // be of the form "//www.google.com/foo/bar?baz#ref" and the only thing that
381
+ // should be kept from the base URL is the scheme. Returns whatever ReplaceStandardURL returns.
382
+ template<typename CHAR>
383
+ bool DoResolveRelativeHost(const char* base_url,  // Base URL text.
384
+ const url_parse::Parsed& base_parsed,  // Parsed components of base_url.
385
+ const CHAR* relative_url,  // Buffer holding the relative URL.
386
+ const url_parse::Component& relative_component,  // Range of relative_url to resolve.
387
+ CharsetConverter* query_converter,  // Handed through to ReplaceStandardURL.
388
+ CanonOutput* output,  // Output buffer, handed through to ReplaceStandardURL.
389
+ url_parse::Parsed* out_parsed) {  // Output parse, handed through to ReplaceStandardURL.
390
+ // Parse the relative URL, just like we would for anything following a
391
+ // scheme. The whole buffer is passed with begin/end offsets (not a pre-offset pointer) so the parsed components index into relative_url, as the replacements below require.
392
+ url_parse::Parsed relative_parsed; // Everything but the scheme is valid.
393
+ url_parse::ParseAfterScheme(relative_url,
394
+ relative_component.end(), relative_component.begin,
395
+ &relative_parsed);
396
+
397
+ // Now we can just use the replacement function to replace all the necessary
398
+ // parts of the old URL with the new one.
399
+ Replacements<CHAR> replacements;  // Every component except the scheme is overridden below.
400
+ replacements.SetUsername(relative_url, relative_parsed.username);
401
+ replacements.SetPassword(relative_url, relative_parsed.password);
402
+ replacements.SetHost(relative_url, relative_parsed.host);
403
+ replacements.SetPort(relative_url, relative_parsed.port);
404
+ replacements.SetPath(relative_url, relative_parsed.path);
405
+ replacements.SetQuery(relative_url, relative_parsed.query);
406
+ replacements.SetRef(relative_url, relative_parsed.ref);
407
+
408
+ return ReplaceStandardURL(base_url, base_parsed, replacements,
409
+ query_converter, output, out_parsed);
410
+ }
411
+
412
+ // Resolves a relative URL that happens to be an absolute file path. Examples
413
+ // include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". No base URL is consulted here.
414
+ template<typename CHAR>
415
+ bool DoResolveAbsoluteFile(const CHAR* relative_url,  // Buffer holding the relative URL.
416
+ const url_parse::Component& relative_component,  // Range of relative_url to resolve.
417
+ CharsetConverter* query_converter,  // Handed through to CanonicalizeFileURL.
418
+ CanonOutput* output,  // Output buffer, handed through to CanonicalizeFileURL.
419
+ url_parse::Parsed* out_parsed) {  // Output parse, handed through to CanonicalizeFileURL.
420
+ // Parse the file URL. The file URL parsing function uses the same logic
421
+ // as we do for determining if the file is absolute, in which case it will
422
+ // not bother to look for a scheme.
423
+ url_parse::Parsed relative_parsed;
424
+ url_parse::ParseFileURL(&relative_url[relative_component.begin],
425
+ relative_component.len, &relative_parsed);
426
+
427
+ return CanonicalizeFileURL(&relative_url[relative_component.begin],  // Same sub-buffer that was parsed above.
428
+ relative_component.len, relative_parsed,
429
+ query_converter, output, out_parsed);
430
+ }
431
+
432
+ // TODO(brettw) treat two slashes as root like Mozilla for FTP?
433
+ template<typename CHAR>
434
+ bool DoResolveRelativeURL(const char* base_url,  // Dispatches to the absolute-file, relative-host or relative-path resolver.
435
+ const url_parse::Parsed& base_parsed,  // Parsed components of base_url.
436
+ bool base_is_file,  // Enables the file-specific handling below.
437
+ const CHAR* relative_url,  // Buffer holding the relative URL.
438
+ const url_parse::Component& relative_component,  // Range of relative_url to resolve.
439
+ CharsetConverter* query_converter,  // Handed through to the chosen resolver.
440
+ CanonOutput* output,  // Receives the resolved URL text.
441
+ url_parse::Parsed* out_parsed) {  // Starts as a copy of base_parsed.
442
+ // Starting point for our output parsed. We'll fix what we change.
443
+ *out_parsed = base_parsed;
444
+
445
+ // Sanity check: the input should have a host or we'll break badly below.
446
+ // We can only resolve relative URLs with base URLs that have hosts and
447
+ // paths (even the default path of "/" is OK).
448
+ //
449
+ // We allow hosts with no length so we can handle file URLs, for example.
450
+ if (base_parsed.path.len <= 0) {  // Only the path length is actually tested here.
451
+ // On error, return the input (resolving a relative URL on a non-relative
452
+ // base = the base).
453
+ int base_len = base_parsed.Length();
454
+ for (int i = 0; i < base_len; i++)
455
+ output->push_back(base_url[i]);
456
+ return false;
457
+ }
458
+
459
+ if (relative_component.len <= 0) {
460
+ // Empty relative URL, leave unchanged, only removing the ref component.
461
+ int base_len = base_parsed.Length();
462
+ base_len -= base_parsed.ref.len + 1;  // The +1 presumably covers the '#'; assumes ref.len is -1 when absent - TODO confirm.
463
+ out_parsed->ref.reset();
464
+ output->Append(base_url, base_len);
465
+ return true;
466
+ }
467
+
468
+ int num_slashes = url_parse::CountConsecutiveSlashes(
469
+ relative_url, relative_component.begin, relative_component.end());
470
+
471
+ #ifdef WIN32
472
+ // On Windows, two slashes for a file path (regardless of which direction
473
+ // they are) means that it's UNC. Two backslashes on any base scheme mean
474
+ // that it's an absolute UNC path (we use the base_is_file flag to control
475
+ // how strict the UNC finder is).
476
+ //
477
+ // We also allow Windows absolute drive specs on any scheme (for example
478
+ // "c:\foo") like IE does. There must be no preceding slashes in this
479
+ // case (we reject anything like "/c:/foo") because that should be treated
480
+ // as a path. For file URLs, we allow any number of slashes since that would
481
+ // be setting the path.
482
+ //
483
+ // This assumes the absolute path resolver handles absolute URLs like this
484
+ // properly. url_util::DoCanonicalize does this.
485
+ int after_slashes = relative_component.begin + num_slashes;
486
+ if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin,
487
+ relative_component.end(), !base_is_file) ||
488
+ ((num_slashes == 0 || base_is_file) &&
489
+ url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes,
490
+ relative_component.end()))) {
491
+ return DoResolveAbsoluteFile(relative_url, relative_component,
492
+ query_converter, output, out_parsed);
493
+ }
494
+ #else
495
+ // Other platforms need explicit handling for file: URLs with multiple
496
+ // slashes because the generic scheme parsing always extracts a host, but a
497
+ // file: URL only has a host if it has exactly 2 slashes. This also
498
+ // handles the special case where the URL is only slashes, since that
499
+ // doesn't have a host part either.
500
+ if (base_is_file &&
501
+ (num_slashes > 2 || num_slashes == relative_component.len)) {
502
+ return DoResolveAbsoluteFile(relative_url, relative_component,
503
+ query_converter, output, out_parsed);
504
+ }
505
+ #endif
506
+
507
+ // Any other double-slashes mean that this is relative to the scheme.
508
+ if (num_slashes >= 2) {
509
+ return DoResolveRelativeHost(base_url, base_parsed,
510
+ relative_url, relative_component,
511
+ query_converter, output, out_parsed);
512
+ }
513
+
514
+ // When we get here, we know that the relative URL is on the same host.
515
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file,
516
+ relative_url, relative_component,
517
+ query_converter, output, out_parsed);
518
+ }
519
+
520
+ } // namespace
521
+
522
+ bool IsRelativeURL(const char* base,  // Overload for 8-bit (char) fragments; forwards every argument unchanged to DoIsRelativeURL<char>.
523
+ const url_parse::Parsed& base_parsed,
524
+ const char* fragment,
525
+ int fragment_len,
526
+ bool is_base_hierarchical,
527
+ bool* is_relative,
528
+ url_parse::Component* relative_component) {
529
+ return DoIsRelativeURL<char>(
530
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
531
+ is_relative, relative_component);
532
+ }
533
+
534
+ bool IsRelativeURL(const char* base,  // Overload for char16 fragments; forwards every argument unchanged to DoIsRelativeURL<char16>.
535
+ const url_parse::Parsed& base_parsed,
536
+ const char16* fragment,
537
+ int fragment_len,
538
+ bool is_base_hierarchical,
539
+ bool* is_relative,
540
+ url_parse::Component* relative_component) {
541
+ return DoIsRelativeURL<char16>(
542
+ base, base_parsed, fragment, fragment_len, is_base_hierarchical,
543
+ is_relative, relative_component);
544
+ }
545
+
546
+ bool ResolveRelativeURL(const char* base_url,  // Overload for 8-bit (char) relative URLs; forwards every argument unchanged to DoResolveRelativeURL<char>.
547
+ const url_parse::Parsed& base_parsed,
548
+ bool base_is_file,
549
+ const char* relative_url,
550
+ const url_parse::Component& relative_component,
551
+ CharsetConverter* query_converter,
552
+ CanonOutput* output,
553
+ url_parse::Parsed* out_parsed) {
554
+ return DoResolveRelativeURL<char>(
555
+ base_url, base_parsed, base_is_file, relative_url,
556
+ relative_component, query_converter, output, out_parsed);
557
+ }
558
+
559
+ bool ResolveRelativeURL(const char* base_url,  // Overload for char16 relative URLs; forwards every argument unchanged to DoResolveRelativeURL<char16>.
560
+ const url_parse::Parsed& base_parsed,
561
+ bool base_is_file,
562
+ const char16* relative_url,
563
+ const url_parse::Component& relative_component,
564
+ CharsetConverter* query_converter,
565
+ CanonOutput* output,
566
+ url_parse::Parsed* out_parsed) {
567
+ return DoResolveRelativeURL<char16>(
568
+ base_url, base_parsed, base_is_file, relative_url,
569
+ relative_component, query_converter, output, out_parsed);
570
+ }
571
+
572
+ } // namespace url_canon