uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,101 @@
1
+ // Copyright 2008, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #ifndef GOOGLEURL_SRC_URL_CANON_IP_H__
31
+ #define GOOGLEURL_SRC_URL_CANON_IP_H__
32
+
33
+ #include "string16.h"
34
+ #include "url_canon.h"
35
+ #include "url_common.h"
36
+ #include "url_parse.h"
37
+
38
+ namespace url_canon {
39
+
40
+ // Searches the host name for the portions of the IPv4 address. On success,
41
+ // each component will be placed into |components| and it will return true.
42
+ // It will return false if the host can not be separated as an IPv4 address
43
+ // or if there are any non-7-bit characters or other characters that can not
44
+ // be in an IP address. (This is important so we fail as early as possible for
45
+ // common non-IP hostnames.)
46
+ //
47
+ // Not all components may exist. If there are only 3 components, for example,
48
+ // the last one will have a length of -1 or 0 to indicate it does not exist.
49
+ //
50
+ // Note that many platforms' inet_addr will ignore everything after a space
51
+ // in certain circumstances if the stuff before the space looks like an IP
52
+ // address. IE6 is included in this. We do NOT handle this case. In many cases,
53
+ // the browser's canonicalization will get run before this which converts
54
+ // spaces to %20 (in the case of IE7) or rejects them (in the case of
55
+ // Mozilla), so this code path never gets hit. Our host canonicalization will
56
+ // notice these spaces and escape them, which will make IP address finding
57
+ // fail. This seems like better behavior than stripping after a space.
58
+ GURL_API bool FindIPv4Components(const char* spec,
59
+ const url_parse::Component& host,
60
+ url_parse::Component components[4]);
61
+ GURL_API bool FindIPv4Components(const char16* spec,
62
+ const url_parse::Component& host,
63
+ url_parse::Component components[4]);
64
+
65
+ // Converts an IPv4 address to a 32-bit number (network byte order).
66
+ //
67
+ // Possible return values:
68
+ // IPV4 - IPv4 address was successfully parsed.
69
+ // BROKEN - Input was formatted like an IPv4 address, but overflow occurred
70
+ // during parsing.
71
+ // NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
72
+ // It might be an IPv6 address, or a hostname.
73
+ //
74
+ // On success, |num_ipv4_components| will be populated with the number of
75
+ // components in the IPv4 address.
76
+ GURL_API CanonHostInfo::Family IPv4AddressToNumber(
77
+ const char* spec,
78
+ const url_parse::Component& host,
79
+ unsigned char address[4],
80
+ int* num_ipv4_components);
81
+ GURL_API CanonHostInfo::Family IPv4AddressToNumber(
82
+ const char16* spec,
83
+ const url_parse::Component& host,
84
+ unsigned char address[4],
85
+ int* num_ipv4_components);
86
+
87
+ // Converts an IPv6 address to a 128-bit number (network byte order), returning
88
+ // true on success. False means that the input was not a valid IPv6 address.
89
+ //
90
+ // NOTE that |host| is expected to be surrounded by square brackets.
91
+ // i.e. "[::1]" rather than "::1".
92
+ GURL_API bool IPv6AddressToNumber(const char* spec,
93
+ const url_parse::Component& host,
94
+ unsigned char address[16]);
95
+ GURL_API bool IPv6AddressToNumber(const char16* spec,
96
+ const url_parse::Component& host,
97
+ unsigned char address[16]);
98
+
99
+ } // namespace url_canon
100
+
101
+ #endif // GOOGLEURL_SRC_URL_CANON_IP_H__
@@ -0,0 +1,137 @@
1
+ // Copyright 2008, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // Functions for canonicalizing "mailto:" URLs.
31
+
32
+ #include "url_canon.h"
33
+ #include "url_canon_internal.h"
34
+ #include "url_file.h"
35
+ #include "url_parse_internal.h"
36
+
37
+ namespace url_canon {
38
+
39
+ namespace {
40
+
41
+
42
+ template<typename CHAR, typename UCHAR>
43
+ bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
44
+ const url_parse::Parsed& parsed,
45
+ CanonOutput* output,
46
+ url_parse::Parsed* new_parsed) {
47
+
48
+ // mailto: only uses {scheme, path, query} -- clear the rest.
49
+ new_parsed->username = url_parse::Component();
50
+ new_parsed->password = url_parse::Component();
51
+ new_parsed->host = url_parse::Component();
52
+ new_parsed->port = url_parse::Component();
53
+ new_parsed->ref = url_parse::Component();
54
+
55
+ // Scheme (known, so we don't bother running it through the more
56
+ // complicated scheme canonicalizer).
57
+ new_parsed->scheme.begin = output->length();
58
+ output->Append("mailto:", 7);
59
+ new_parsed->scheme.len = 6;
60
+
61
+ bool success = true;
62
+
63
+ // Path
64
+ if (parsed.path.is_valid()) {
65
+ new_parsed->path.begin = output->length();
66
+
67
+ // Copy the path using path URL's more lax escaping rules.
68
+ // We convert to UTF-8 and escape non-ASCII, but leave all
69
+ // ASCII characters alone.
70
+ int end = parsed.path.end();
71
+ for (int i = parsed.path.begin; i < end; ++i) {
72
+ UCHAR uch = static_cast<UCHAR>(source.path[i]);
73
+ if (uch < 0x20 || uch >= 0x80)
74
+ success &= AppendUTF8EscapedChar(source.path, &i, end, output);
75
+ else
76
+ output->push_back(static_cast<char>(uch));
77
+ }
78
+
79
+ new_parsed->path.len = output->length() - new_parsed->path.begin;
80
+ } else {
81
+ // No path at all
82
+ new_parsed->path.reset();
83
+ }
84
+
85
+ // Query -- always use the default utf8 charset converter.
86
+ CanonicalizeQuery(source.query, parsed.query, NULL,
87
+ output, &new_parsed->query);
88
+
89
+ return success;
90
+ }
91
+
92
+ } // namespace
93
+
94
+ bool CanonicalizeMailtoURL(const char* spec,
95
+ int spec_len,
96
+ const url_parse::Parsed& parsed,
97
+ CanonOutput* output,
98
+ url_parse::Parsed* new_parsed) {
99
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
100
+ URLComponentSource<char>(spec), parsed, output, new_parsed);
101
+ }
102
+
103
+ bool CanonicalizeMailtoURL(const char16* spec,
104
+ int spec_len,
105
+ const url_parse::Parsed& parsed,
106
+ CanonOutput* output,
107
+ url_parse::Parsed* new_parsed) {
108
+ return DoCanonicalizeMailtoURL<char16, char16>(
109
+ URLComponentSource<char16>(spec), parsed, output, new_parsed);
110
+ }
111
+
112
+ bool ReplaceMailtoURL(const char* base,
113
+ const url_parse::Parsed& base_parsed,
114
+ const Replacements<char>& replacements,
115
+ CanonOutput* output,
116
+ url_parse::Parsed* new_parsed) {
117
+ URLComponentSource<char> source(base);
118
+ url_parse::Parsed parsed(base_parsed);
119
+ SetupOverrideComponents(base, replacements, &source, &parsed);
120
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
121
+ source, parsed, output, new_parsed);
122
+ }
123
+
124
+ bool ReplaceMailtoURL(const char* base,
125
+ const url_parse::Parsed& base_parsed,
126
+ const Replacements<char16>& replacements,
127
+ CanonOutput* output,
128
+ url_parse::Parsed* new_parsed) {
129
+ RawCanonOutput<1024> utf8;
130
+ URLComponentSource<char> source(base);
131
+ url_parse::Parsed parsed(base_parsed);
132
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
133
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
134
+ source, parsed, output, new_parsed);
135
+ }
136
+
137
+ } // namespace url_canon
@@ -0,0 +1,380 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ // Canonicalization functions for the paths of URLs.
30
+
31
+ #include "logging.h"
32
+ #include "url_canon.h"
33
+ #include "url_canon_internal.h"
34
+ #include "url_parse_internal.h"
35
+
36
+ namespace url_canon {
37
+
38
+ namespace {
39
+
40
// Per-character classification used by the path canonicalizer's lookup table.
enum CharacterFlags {
  // Copy the character through as-is, whether it arrived escaped or not.
  // This is the absence of any flag, so it cannot be tested with a bitwise
  // AND; it exists to make the table readable where neither ESCAPE nor
  // UNESCAPE applies.
  PASS = 0,

  // The character needs dedicated handling in DoPartialPath. Testing this
  // bit first lets ordinary, directly copyable characters skip the slow path.
  SPECIAL = 1 << 0,

  // The character must appear escaped in canonical output. ESCAPE also sets
  // SPECIAL so that the handling code is reached. Not valid together with
  // PASS or UNESCAPE.
  ESCAPE_BIT = 1 << 1,
  ESCAPE = ESCAPE_BIT | SPECIAL,

  // The character must appear unescaped in canonical output. Not valid
  // together with ESCAPE or PASS. SPECIAL is deliberately NOT set: when such
  // a character is seen unescaped it is simply copied.
  UNESCAPE = 1 << 2,

  // The character is not allowed in URLs. INVALID also sets SPECIAL so that
  // the handling code is reached.
  INVALID_BIT = 1 << 3,
  INVALID = INVALID_BIT | SPECIAL
};
67
+
68
+ // This table contains one of the above flag values. Note some flags are more
69
+ // than one bits because they also turn on the "special" flag. Special is the
70
+ // only flag that may be combined with others.
71
+ //
72
+ // This table is designed to match exactly what IE does with the characters.
73
+ //
74
+ // Dot is even more special, and the escaped version is handled specially by
75
+ // IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
76
+ // bit is never handled (we just need the "special") bit.
77
+ const unsigned char kPathCharLookup[0x100] = {
78
+ // NULL control chars...
79
+ INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
80
+ // control chars...
81
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
82
+ // ' ' ! " # $ % & ' ( ) * + , - . /
83
+ ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS,
84
+ // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
85
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE,
86
+ // @ A B C D E F G H I J K L M N O
87
+ PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
88
+ // P Q R S T U V W X Y Z [ \ ] ^ _
89
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE,
90
+ // ` a b c d e f g h i j k l m n o
91
+ ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
92
+ // p q r s t u v w x y z { | } ~ <NBSP>
93
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE,
94
+ // ...all the high-bit characters are escaped
95
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
96
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
97
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
98
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
99
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
100
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
101
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
102
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE};
103
+
104
// What a dot found in a path turns out to mean.
enum DotDisposition {
  // An ordinary dot inside a file name; nothing special.
  NOT_A_DIRECTORY = 0,

  // A "." component: the current directory.
  DIRECTORY_CUR = 1,

  // The first dot of a ".." component, which moves up one directory.
  DIRECTORY_UP = 2
};
114
+
115
+ // When the path resolver finds a dot, this function is called with the
116
+ // character following that dot to see what it is. The return value
117
+ // indicates what type this dot is (see above). This code handles the case
118
+ // where the dot is at the end of the input.
119
+ //
120
+ // |*consumed_len| will contain the number of characters in the input that
121
+ // express what we found.
122
+ //
123
+ // If the input is "../foo", |after_dot| = 1, |end| = 6, and
124
+ // at the end, |*consumed_len| = 2 for the "./" this function consumed. The
125
+ // original dot length should be handled by the caller.
126
+ template<typename CHAR>
127
+ DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
128
+ int end, int* consumed_len) {
129
+ if (after_dot == end) {
130
+ // Single dot at the end.
131
+ *consumed_len = 0;
132
+ return DIRECTORY_CUR;
133
+ }
134
+ if (url_parse::IsURLSlash(spec[after_dot])) {
135
+ // Single dot followed by a slash.
136
+ *consumed_len = 1; // Consume the slash
137
+ return DIRECTORY_CUR;
138
+ }
139
+
140
+ int second_dot_len = IsDot(spec, after_dot, end);
141
+ if (second_dot_len) {
142
+ int after_second_dot = after_dot + second_dot_len;
143
+ if (after_second_dot == end) {
144
+ // Double dot at the end.
145
+ *consumed_len = second_dot_len;
146
+ return DIRECTORY_UP;
147
+ }
148
+ if (url_parse::IsURLSlash(spec[after_second_dot])) {
149
+ // Double dot followed by a slash.
150
+ *consumed_len = second_dot_len + 1;
151
+ return DIRECTORY_UP;
152
+ }
153
+ }
154
+
155
+ // The dots are followed by something else, not a directory.
156
+ *consumed_len = 0;
157
+ return NOT_A_DIRECTORY;
158
+ }
159
+
160
+ // Rewinds the output to the previous slash. It is assumed that the output
161
+ // ends with a slash and this doesn't count (we call this when we are
162
+ // appending directory paths, so the previous path component has and ending
163
+ // slash).
164
+ //
165
+ // This will stop at the first slash (assumed to be at position
166
+ // |path_begin_in_output| and not go any higher than that. Some web pages
167
+ // do ".." too many times, so we need to handle that brokenness.
168
+ //
169
+ // It searches for a literal slash rather than including a backslash as well
170
+ // because it is run only on the canonical output.
171
+ //
172
+ // The output is guaranteed to end in a slash when this function completes.
173
+ void BackUpToPreviousSlash(int path_begin_in_output,
174
+ CanonOutput* output) {
175
+ DCHECK(output->length() > 0);
176
+
177
+ int i = output->length() - 1;
178
+ DCHECK(output->at(i) == '/');
179
+ if (i == path_begin_in_output)
180
+ return; // We're at the first slash, nothing to do.
181
+
182
+ // Now back up (skipping the trailing slash) until we find another slash.
183
+ i--;
184
+ while (output->at(i) != '/' && i > path_begin_in_output)
185
+ i--;
186
+
187
+ // Now shrink the output to just include that last slash we found.
188
+ output->set_length(i + 1);
189
+ }
190
+
191
+ // Appends the given path to the output. It assumes that if the input path
192
+ // starts with a slash, it should be copied to the output. If no path has
193
+ // already been appended to the output (the case when not resolving
194
+ // relative URLs), the path should begin with a slash.
195
+ //
196
+ // If there are already path components (this mode is used when appending
197
+ // relative paths for resolving), it assumes that the output already has
198
+ // a trailing slash and that if the input begins with a slash, it should be
199
+ // copied to the output.
200
+ //
201
+ // We do not collapse multiple slashes in a row to a single slash. It seems
202
+ // no web browsers do this, and we don't want incompababilities, even though
203
+ // it would be correct for most systems.
204
+ template<typename CHAR, typename UCHAR>
205
+ bool DoPartialPath(const CHAR* spec,
206
+ const url_parse::Component& path,
207
+ int path_begin_in_output,
208
+ CanonOutput* output) {
209
+ int end = path.end();
210
+
211
+ bool success = true;
212
+ for (int i = path.begin; i < end; i++) {
213
+ UCHAR uch = static_cast<UCHAR>(spec[i]);
214
+ if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
215
+ // We only need to test wide input for having non-ASCII characters. For
216
+ // narrow input, we'll always just use the lookup table. We don't try to
217
+ // do anything tricky with decoding/validating UTF-8. This function will
218
+ // read one or two UTF-16 characters and append the output as UTF-8. This
219
+ // call will be removed in 8-bit mode.
220
+ success &= AppendUTF8EscapedChar(spec, &i, end, output);
221
+ } else {
222
+ // Normal ASCII character or 8-bit input, use the lookup table.
223
+ unsigned char out_ch = static_cast<unsigned char>(uch);
224
+ unsigned char flags = kPathCharLookup[out_ch];
225
+ if (flags & SPECIAL) {
226
+ // Needs special handling of some sort.
227
+ int dotlen;
228
+ if ((dotlen = IsDot(spec, i, end)) > 0) {
229
+ // See if this dot was preceeded by a slash in the output. We
230
+ // assume that when canonicalizing paths, they will always
231
+ // start with a slash and not a dot, so we don't have to
232
+ // bounds check the output.
233
+ //
234
+ // Note that we check this in the case of dots so we don't have to
235
+ // special case slashes. Since slashes are much more common than
236
+ // dots, this actually increases performance measurably (though
237
+ // slightly).
238
+ DCHECK(output->length() > path_begin_in_output);
239
+ if (output->length() > path_begin_in_output &&
240
+ output->at(output->length() - 1) == '/') {
241
+ // Slash followed by a dot, check to see if this is means relative
242
+ int consumed_len;
243
+ switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
244
+ &consumed_len)) {
245
+ case NOT_A_DIRECTORY:
246
+ // Copy the dot to the output, it means nothing special.
247
+ output->push_back('.');
248
+ i += dotlen - 1;
249
+ break;
250
+ case DIRECTORY_CUR: // Current directory, just skip the input.
251
+ i += dotlen + consumed_len - 1;
252
+ break;
253
+ case DIRECTORY_UP:
254
+ BackUpToPreviousSlash(path_begin_in_output, output);
255
+ i += dotlen + consumed_len - 1;
256
+ break;
257
+ }
258
+ } else {
259
+ // This dot is not preceeded by a slash, it is just part of some
260
+ // file name.
261
+ output->push_back('.');
262
+ i += dotlen - 1;
263
+ }
264
+
265
+ } else if (out_ch == '\\') {
266
+ // Convert backslashes to forward slashes
267
+ output->push_back('/');
268
+
269
+ } else if (out_ch == '%') {
270
+ // Handle escape sequences.
271
+ unsigned char unescaped_value;
272
+ if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
273
+ // Valid escape sequence, see if we keep, reject, or unescape it.
274
+ char unescaped_flags = kPathCharLookup[unescaped_value];
275
+
276
+ if (unescaped_flags & UNESCAPE) {
277
+ // This escaped value shouldn't be escaped, copy it.
278
+ output->push_back(unescaped_value);
279
+ } else if (unescaped_flags & INVALID_BIT) {
280
+ // Invalid escaped character, copy it and remember the error.
281
+ output->push_back('%');
282
+ output->push_back(static_cast<char>(spec[i - 1]));
283
+ output->push_back(static_cast<char>(spec[i]));
284
+ success = false;
285
+ } else {
286
+ // Valid escaped character but we should keep it escaped. We
287
+ // don't want to change the case of any hex letters in case
288
+ // the server is sensitive to that, so we just copy the two
289
+ // characters without checking (DecodeEscape will have advanced
290
+ // to the last character of the pair).
291
+ output->push_back('%');
292
+ output->push_back(static_cast<char>(spec[i - 1]));
293
+ output->push_back(static_cast<char>(spec[i]));
294
+ }
295
+ } else {
296
+ // Invalid escape sequence. IE7 rejects any URLs with such
297
+ // sequences, while Firefox, IE6, and Safari all pass it through
298
+ // unchanged. We are more permissive unlike IE7. I don't think this
299
+ // can cause significant problems, if it does, we should change
300
+ // to be more like IE7.
301
+ output->push_back('%');
302
+ }
303
+
304
+ } else if (flags & INVALID_BIT) {
305
+ // For NULLs, etc. fail.
306
+ AppendEscapedChar(out_ch, output);
307
+ success = false;
308
+
309
+ } else if (flags & ESCAPE_BIT) {
310
+ // This character should be escaped.
311
+ AppendEscapedChar(out_ch, output);
312
+ }
313
+ } else {
314
+ // Nothing special about this character, just append it.
315
+ output->push_back(out_ch);
316
+ }
317
+ }
318
+ }
319
+ return success;
320
+ }
321
+
322
+ template<typename CHAR, typename UCHAR>
323
+ bool DoPath(const CHAR* spec,
324
+ const url_parse::Component& path,
325
+ CanonOutput* output,
326
+ url_parse::Component* out_path) {
327
+ bool success = true;
328
+ if (path.len > 0) {
329
+ out_path->begin = output->length();
330
+
331
+ // Write out an initial slash if the input has none. If we just parse a URL
332
+ // and then canonicalize it, it will of course have a slash already. This
333
+ // check is for the replacement and relative URL resolving cases of file
334
+ // URLs.
335
+ if (!url_parse::IsURLSlash(spec[path.begin]))
336
+ output->push_back('/');
337
+
338
+ success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
339
+ out_path->len = output->length() - out_path->begin;
340
+ } else {
341
+ // No input, canonical path is a slash.
342
+ output->push_back('/');
343
+ *out_path = url_parse::Component();
344
+ }
345
+ return success;
346
+ }
347
+
348
+ } // namespace
349
+
350
+ bool CanonicalizePath(const char* spec,
351
+ const url_parse::Component& path,
352
+ CanonOutput* output,
353
+ url_parse::Component* out_path) {
354
+ return DoPath<char, unsigned char>(spec, path, output, out_path);
355
+ }
356
+
357
+ bool CanonicalizePath(const char16* spec,
358
+ const url_parse::Component& path,
359
+ CanonOutput* output,
360
+ url_parse::Component* out_path) {
361
+ return DoPath<char16, char16>(spec, path, output, out_path);
362
+ }
363
+
364
+ bool CanonicalizePartialPath(const char* spec,
365
+ const url_parse::Component& path,
366
+ int path_begin_in_output,
367
+ CanonOutput* output) {
368
+ return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output,
369
+ output);
370
+ }
371
+
372
+ bool CanonicalizePartialPath(const char16* spec,
373
+ const url_parse::Component& path,
374
+ int path_begin_in_output,
375
+ CanonOutput* output) {
376
+ return DoPartialPath<char16, char16>(spec, path, path_begin_in_output,
377
+ output);
378
+ }
379
+
380
+ } // namespace url_canon