uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,336 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #ifndef GOOGLEURL_SRC_URL_PARSE_H__
31
+ #define GOOGLEURL_SRC_URL_PARSE_H__
32
+
33
+ #include <string>
34
+
35
+ #include "basictypes.h"
36
+ #include "string16.h"
37
+ #include "url_common.h"
38
+
39
+ namespace url_parse {
40
+
41
+ // Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
42
+ // KURLGoogle.cpp still rely on this type.
43
+ typedef char16 UTF16Char;
44
+
45
+ // Component ------------------------------------------------------------------
46
+
47
// Represents a substring of a URL spec as an (offset, length) pair. The
// component does not hold the text itself, only the position within it.
struct Component {
  // Creates an unspecified component: begin is 0 and len is -1.
  Component() : begin(0), len(-1) {}

  // Normal constructor: takes an offset and a length.
  Component(int b, int l) : begin(b), len(l) {}

  // Returns the offset one past the last character of the component. For an
  // unspecified component (len == -1) this evaluates to begin - 1, so check
  // is_valid() first when that matters.
  int end() const {
    return begin + len;
  }

  // Returns true if this component is valid, meaning the length is given. Even
  // valid components may be empty to record the fact that they exist.
  bool is_valid() const {
    return (len != -1);
  }

  // Returns true if the component is specified and has at least one
  // character. If false, the component is either empty or invalid.
  bool is_nonempty() const {
    return (len > 0);
  }

  // Puts the component back into the unspecified state produced by the
  // default constructor.
  void reset() {
    begin = 0;
    len = -1;
  }

  // Two components are equal when both the offset and the length match.
  bool operator==(const Component& other) const {
    return begin == other.begin && len == other.len;
  }

  // Counterpart of operator==, so callers can write a != b directly.
  bool operator!=(const Component& other) const {
    return !(*this == other);
  }

  int begin;  // Offset of this component in the spec, counted in elements of
              // the spec's character type (char or char16).
  int len;    // Will be -1 if the component is unspecified.
};
82
+
83
+ // Helper that returns a component created with the given begin and ending
84
+ // points. The ending point is non-inclusive.
85
+ inline Component MakeRange(int begin, int end) {
86
+ return Component(begin, end - begin);
87
+ }
88
+
89
+ // Parsed ---------------------------------------------------------------------
90
+
91
+ // A structure that holds the identified parts of an input URL. This structure
92
+ // does NOT store the URL itself. The caller will have to store the URL text
93
+ // and its corresponding Parsed structure separately.
94
+ //
95
+ // Typical usage would be:
96
+ //
97
+ // url_parse::Parsed parsed;
98
+ // url_parse::Component scheme;
99
+ // if (!url_parse::ExtractScheme(url, url_len, &scheme))
100
+ // return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
101
+ //
102
+ // if (IsStandardScheme(url, scheme)) // Not provided by this component
103
+ // url_parse::ParseStandardURL(url, url_len, &parsed);
104
+ // else if (IsFileURL(url, scheme)) // Not provided by this component
105
+ // url_parse::ParseFileURL(url, url_len, &parsed);
106
+ // else
107
+ // url_parse::ParsePathURL(url, url_len, &parsed);
108
+ //
109
+ struct Parsed {
110
+ // Identifies the different components; used to select a component in
111
+ enum ComponentType {
112
+ SCHEME,
113
+ USERNAME,
114
+ PASSWORD,
115
+ HOST,
116
+ PORT,
117
+ PATH,
118
+ QUERY,
119
+ REF,
120
+ };
121
+
122
+ // The default constructor is sufficient for the components.
123
+ GURL_API Parsed();
124
+
125
+ // Returns the length of the URL (the end of the last component).
126
+ //
127
+ // Note that for some invalid, non-canonical URLs, this may not be the length
128
+ // of the string. For example "http://": the parsed structure will only
129
+ // contain an entry for the four-character scheme, and it doesn't know about
130
+ // the "://". For all other last-components, it will return the real length.
131
+ GURL_API int Length() const;
132
+
133
+ // Returns the number of characters before the given component if it exists,
134
+ // or where the component would be if it did exist. This will return the
135
+ // string length if the component would be appended to the end.
136
+ //
137
+ // Note that this can get a little funny for the port, query, and ref
138
+ // components which have a delimiter that is not counted as part of the
139
+ // component. The |include_delimiter| flag controls if you want this counted
140
+ // as part of the component or not when the component exists.
141
+ //
142
+ // This example shows the difference between the two flags for two of these
143
+ // delimited components that are present (the port and query) and one that
144
+ // isn't (the reference). The components that this flag affects are marked
145
+ // with a *.
146
+ //                0         1         2
147
+ //                012345678901234567890
148
+ // Example input: http://foo:80/?query
149
+ //              include_delim=true,  ...=false  ("<-" indicates different)
150
+ //      SCHEME: 0                    0
151
+ //    USERNAME: 5                    5
152
+ //    PASSWORD: 5                    5
153
+ //        HOST: 7                    7
154
+ //       *PORT: 10                   11 <-
155
+ //        PATH: 13                   13
156
+ //      *QUERY: 14                   15 <-
157
+ //        *REF: 20                   20
158
+ //
159
+ GURL_API int CountCharactersBefore(ComponentType type,
160
+ bool include_delimiter) const;
161
+
162
+ // Scheme without the colon: "http://foo/" would have a scheme of "http".
163
+ // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
164
+ // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed
165
+ // to start at the beginning of the string if there are preceding whitespace
166
+ // or control characters.
167
+ Component scheme;
168
+
169
+ // Username. Specified in URLs with an @ sign before the host. See |password|.
170
+ Component username;
171
+
172
+ // Password. The length will be -1 if unspecified, 0 if specified but empty.
173
+ // Not all URLs with a username have a password, as in "http://me@host/".
174
+ // The password is separated from the username with a colon, as in
175
+ // "http://me:secret@host/"
176
+ Component password;
177
+
178
+ // Host name.
179
+ Component host;
180
+
181
+ // Port number.
182
+ Component port;
183
+
184
+ // Path, this is everything following the host name. Length will be -1 if
185
+ // unspecified. This includes the preceding slash, so the path on
186
+ // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to
187
+ // have a 0 length path, it will be -1 in cases like "http://host?foo".
188
+ // Note that we treat backslashes the same as slashes.
189
+ Component path;
190
+
191
+ // Stuff between the ? and the # after the path. This does not include the
192
+ // preceding ? character. Length will be -1 if unspecified, 0 if there is
193
+ // a question mark but no query string.
194
+ Component query;
195
+
196
+ // Indicated by a #, this is everything following the hash sign (not
197
+ // including it). If there are multiple hash signs, we'll use the last one.
198
+ // Length will be -1 if there is no hash sign, or 0 if there is one but
199
+ // nothing follows it.
200
+ Component ref;
201
+ };
202
+
203
+ // Initialization functions ---------------------------------------------------
204
+ //
205
+ // These functions parse the given URL, filling in all of the structure's
206
+ // components. These functions can not fail, they will always do their best
207
+ // at interpreting the input given.
208
+ //
209
+ // The string length of the URL MUST be specified, we do not check for NULLs
210
+ // at any point in the process, and will actually handle embedded NULLs.
211
+ //
212
+ // IMPORTANT: These functions do NOT hang on to the given pointer or copy it
213
+ // in any way. See the comment above the struct.
214
+ //
215
+ // The 8-bit versions require UTF-8 encoding.
216
+
217
+ // StandardURL is for when the scheme is known to be one that has an
218
+ // authority (host) like "http". This function will not handle weird ones
219
+ // like "about:" and "javascript:", or do the right thing for "file:" URLs.
220
+ GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
221
+ GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
222
+
223
+ // PathURL is for when the scheme is known not to have an authority (host)
224
+ // section but that aren't file URLs either. The scheme is parsed, and
225
+ // everything after the scheme is considered as the path. This is used for
226
+ // things like "about:" and "javascript:"
227
+ GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
228
+ GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
229
+
230
+ // FileURL is for file URLs. There are some special rules for interpreting
231
+ // these.
232
+ GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
233
+ GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
234
+
235
+ // MailtoURL is for mailto: URLs. They are made up of a scheme, path, and query.
236
+ GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
237
+ GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
238
+
239
+ // Helper functions -----------------------------------------------------------
240
+
241
+ // Locates the scheme according to the URL parser's rules. This function is
242
+ // designed so the caller can find the scheme and call the correct Parse*URL
243
+ // function according to their known scheme types.
244
+ //
245
+ // It also does not perform any validation on the scheme.
246
+ //
247
+ // This function will return true if the scheme is found and will put the
248
+ // scheme's range into *scheme. False means no scheme could be found. Note
249
+ // that a URL beginning with a colon has a scheme, but it is empty, so this
250
+ // function will return true but *scheme will = (0,0).
251
+ //
252
+ // The scheme is found by skipping spaces and control characters at the
253
+ // beginning, and taking everything from there to the first colon to be the
254
+ // scheme. The character at scheme.end() will be the colon (we may enhance
255
+ // this to handle full width colons or something, so don't count on the
256
+ // actual character value). The character at scheme.end()+1 will be the
257
+ // beginning of the rest of the URL, be it the authority or the path (or the
258
+ // end of the string).
259
+ //
260
+ // The 8-bit version requires UTF-8 encoding.
261
+ GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
262
+ GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
263
+
264
+ // Returns true if ch is a character that terminates the authority segment
265
+ // of a URL.
266
+ GURL_API bool IsAuthorityTerminator(char16 ch);
267
+
268
+ // Does a best effort parse of input |spec|, in range |auth|. If a particular
269
+ // component is not found, it will be set to invalid.
270
+ GURL_API void ParseAuthority(const char* spec,
271
+ const Component& auth,
272
+ Component* username,
273
+ Component* password,
274
+ Component* hostname,
275
+ Component* port_num);
276
+ GURL_API void ParseAuthority(const char16* spec,
277
+ const Component& auth,
278
+ Component* username,
279
+ Component* password,
280
+ Component* hostname,
281
+ Component* port_num);
282
+
283
+ // Computes the integer port value from the given port component. The port
284
+ // component should have been identified by one of the init functions on
285
+ // |Parsed| for the given input url.
286
+ //
287
+ // The return value will be a non-negative integer between 0 and 64K, or one of
288
+ // the two special values below.
289
// Special values that stand in for a port number when no usable number is
// available.
enum SpecialPort {
  PORT_UNSPECIFIED = -1,
  PORT_INVALID = -2
};
290
+ GURL_API int ParsePort(const char* url, const Component& port);
291
+ GURL_API int ParsePort(const char16* url, const Component& port);
292
+
293
+ // Extracts the range of the file name in the given url. The path must
294
+ // already have been computed by the parse function, and the matching URL
295
+ // and extracted path are provided to this function. The filename is
296
+ // defined as being everything from the last slash/backslash of the path
297
+ // to the end of the path.
298
+ //
299
+ // The file name will be empty if the path is empty or there is nothing
300
+ // following the last slash.
301
+ //
302
+ // The 8-bit version requires UTF-8 encoding.
303
+ GURL_API void ExtractFileName(const char* url,
304
+ const Component& path,
305
+ Component* file_name);
306
+ GURL_API void ExtractFileName(const char16* url,
307
+ const Component& path,
308
+ Component* file_name);
309
+
310
+ // Extract the first key/value from the range defined by |*query|. Updates
311
+ // |*query| to start at the end of the extracted key/value pair. This is
312
+ // designed for use in a loop: you can keep calling it with the same query
313
+ // object and it will iterate over all items in the query.
314
+ //
315
+ // Some key/value pairs may have the key, the value, or both be empty (for
316
+ // example, the query string "?&"). These will be returned. Note that an empty
317
+ // last parameter "foo.com?" or "foo.com?a&" will not be returned, this case
318
+ // is the same as "done."
319
+ //
320
+ // The initial query component should not include the '?' (this is the default
321
+ // for parsed URLs).
322
+ //
323
+ // If no key/value are found |*key| and |*value| will be unchanged and it will
324
+ // return false.
325
+ GURL_API bool ExtractQueryKeyValue(const char* url,
326
+ Component* query,
327
+ Component* key,
328
+ Component* value);
329
+ GURL_API bool ExtractQueryKeyValue(const char16* url,
330
+ Component* query,
331
+ Component* key,
332
+ Component* value);
333
+
334
+ } // namespace url_parse
335
+
336
+ #endif // GOOGLEURL_SRC_URL_PARSE_H__
@@ -0,0 +1,243 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "logging.h"
31
+ #include "url_file.h"
32
+ #include "url_parse.h"
33
+ #include "url_parse_internal.h"
34
+
35
+ // Interesting IE file:isms...
36
+ //
37
+ // INPUT OUTPUT
38
+ // ========================= ==============================
39
+ // file:/foo/bar file:///foo/bar
40
+ // The result here seems totally invalid!?!? This isn't UNC.
41
+ //
42
+ // file:/
43
+ // file:// or any other number of slashes
44
+ // IE6 doesn't do anything at all if you click on this link. No error:
45
+ // nothing. IE6's history system seems to always color this link, so I'm
46
+ // guessing that it maps internally to the empty URL.
47
+ //
48
+ // C:\ file:///C:/
49
+ // When on a file: URL source page, this link will work. When over HTTP,
50
+ // the file: URL will appear in the status bar but the link will not work
51
+ // (security restriction for all file URLs).
52
+ //
53
+ // file:foo/ file:foo/ (invalid?!?!?)
54
+ // file:/foo/ file:///foo/ (invalid?!?!?)
55
+ // file://foo/ file://foo/ (UNC to server "foo")
56
+ // file:///foo/ file:///foo/ (invalid, seems to be a file)
57
+ // file:////foo/ file://foo/ (UNC to server "foo")
58
+ // Any more than four slashes is also treated as UNC.
59
+ //
60
+ // file:C:/ file://C:/
61
+ // file:/C:/ file://C:/
62
+ // The number of slashes after "file:" don't matter if the thing following
63
+ // it looks like an absolute drive path. Also, slashes and backslashes are
64
+ // equally valid here.
65
+
66
+ namespace url_parse {
67
+
68
+ namespace {
69
+
70
+ // A subcomponent of DoInitFileURL, the input of this function should be a UNC
71
+ // path name, with the index of the first character after the slashes following
72
+ // the scheme given in |after_slashes|. This will initialize the host, path,
73
+ // query, and ref, and leave the other output components untouched
74
+ // (DoInitFileURL handles these for us).
75
+ template<typename CHAR>
76
+ void DoParseUNC(const CHAR* spec,
77
+ int after_slashes,
78
+ int spec_len,
79
+ Parsed* parsed) {
80
+ int next_slash = FindNextSlash(spec, after_slashes, spec_len);
81
+ if (next_slash == spec_len) {
82
+ // No additional slash found, as in "file://foo", treat the text as the
83
+ // host with no path (this will end up being UNC to server "foo").
84
+ int host_len = spec_len - after_slashes;
85
+ if (host_len)
86
+ parsed->host = Component(after_slashes, host_len);
87
+ else
88
+ parsed->host.reset();
89
+ parsed->path.reset();
90
+ return;
91
+ }
92
+
93
+ #ifdef WIN32
94
+ // See if we have something that looks like a path following the first
95
+ // component. As in "file://localhost/c:/", we get "c:/" out. We want to
96
+ // treat this as a having no host but the path given. Works on Windows only.
97
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
98
+ parsed->host.reset();
99
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
100
+ &parsed->path, &parsed->query, &parsed->ref);
101
+ return;
102
+ }
103
+ #endif
104
+
105
+ // Otherwise, everything up until that first slash we found is the host name,
106
+ // which will end up being the UNC host. For example "file://foo/bar.txt"
107
+ // will get a server name of "foo" and a path of "/bar". Later, on Windows,
108
+ // this should be treated as the filename "\\foo\bar.txt" in proper UNC
109
+ // notation.
110
+ int host_len = next_slash - after_slashes;
111
+ if (host_len)
112
+ parsed->host = MakeRange(after_slashes, next_slash);
113
+ else
114
+ parsed->host.reset();
115
+ if (next_slash < spec_len) {
116
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
117
+ &parsed->path, &parsed->query, &parsed->ref);
118
+ } else {
119
+ parsed->path.reset();
120
+ }
121
+ }
122
+
123
+ // A subcomponent of DoParseFileURL, the input should be a local file, with the
124
+ // beginning of the path indicated by the index in |path_begin|. This will
125
+ // initialize the host, path, query, and ref, and leave the other output
126
+ // components untouched (DoInitFileURL handles these for us).
127
+ template<typename CHAR>
128
+ void DoParseLocalFile(const CHAR* spec,
129
+ int path_begin,
130
+ int spec_len,
131
+ Parsed* parsed) {
132
+ parsed->host.reset();
133
+ ParsePathInternal(spec, MakeRange(path_begin, spec_len),
134
+ &parsed->path, &parsed->query, &parsed->ref);
135
+ }
136
+
137
+ // Backend for the external functions that operates on either char type.
138
+ // We are handed the character after the "file:" at the beginning of the spec.
139
+ // Usually this is a slash, but needn't be; we allow paths like "file:c:\foo".
140
+ template<typename CHAR>
141
+ void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
142
+ DCHECK(spec_len >= 0);
143
+
144
+ // Get the parts we never use for file URLs out of the way.
145
+ parsed->username.reset();
146
+ parsed->password.reset();
147
+ parsed->port.reset();
148
+
149
+ // Many of the code paths don't set these, so it's convenient to just clear
150
+ // them. We'll write them in those cases we need them.
151
+ parsed->query.reset();
152
+ parsed->ref.reset();
153
+
154
+ // Strip leading & trailing spaces and control characters.
155
+ int begin = 0;
156
+ TrimURL(spec, &begin, &spec_len);
157
+
158
+ // Find the scheme.
159
+ int num_slashes;
160
+ int after_scheme;
161
+ int after_slashes;
162
+ #ifdef WIN32
163
+ // See how many slashes there are. We want to handle cases like UNC but also
164
+ // "/c:/foo". This is when there is no scheme, so we can allow pages to do
165
+ // links like "c:/foo/bar" or "//foo/bar". This is also called by the
166
+ // relative URL resolver when it determines there is an absolute URL, which
167
+ // may give us input like "/c:/foo".
168
+ num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
169
+ after_slashes = begin + num_slashes;
170
+ if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
171
+ // Windows path, don't try to extract the scheme (for example, "c:\foo").
172
+ parsed->scheme.reset();
173
+ after_scheme = after_slashes;
174
+ } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
175
+ // Windows UNC path: don't try to extract the scheme, but keep the slashes.
176
+ parsed->scheme.reset();
177
+ after_scheme = begin;
178
+ } else
179
+ #endif
180
+ {
181
+ if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
182
+ // Offset the results since we gave ExtractScheme a substring.
183
+ parsed->scheme.begin += begin;
184
+ after_scheme = parsed->scheme.end() + 1;
185
+ } else {
186
+ // No scheme found, remember that.
187
+ parsed->scheme.reset();
188
+ after_scheme = begin;
189
+ }
190
+ }
191
+
192
+ // Handle empty specs ones that contain only whitespace or control chars,
193
+ // or that are just the scheme (for example "file:").
194
+ if (after_scheme == spec_len) {
195
+ parsed->host.reset();
196
+ parsed->path.reset();
197
+ return;
198
+ }
199
+
200
+ num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
201
+
202
+ after_slashes = after_scheme + num_slashes;
203
+ #ifdef WIN32
204
+ // Check whether the input is a drive again. We checked above for windows
205
+ // drive specs, but that's only at the very beginning to see if we have a
206
+ // scheme at all. This test will be duplicated in that case, but will
207
+ // additionally handle all cases with a real scheme such as "file:///C:/".
208
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
209
+ num_slashes != 3) {
210
+ // Anything not beginning with a drive spec ("c:\") on Windows is treated
211
+ // as UNC, with the exception of three slashes which always means a file.
212
+ // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
213
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
214
+ return;
215
+ }
216
+ #else
217
+ // file: URL with exactly 2 slashes is considered to have a host component.
218
+ if (num_slashes == 2) {
219
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
220
+ return;
221
+ }
222
+ #endif // WIN32
223
+
224
+ // Easy and common case, the full path immediately follows the scheme
225
+ // (modulo slashes), as in "file://c:/foo". Just treat everything from
226
+ // there to the end as the path. Empty hosts have 0 length instead of -1.
227
+ // We include the last slash as part of the path if there is one.
228
+ DoParseLocalFile(spec,
229
+ num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
230
+ spec_len, parsed);
231
+ }
232
+
233
+ } // namespace
234
+
235
+ void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
236
+ DoParseFileURL(url, url_len, parsed);
237
+ }
238
+
239
+ void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
240
+ DoParseFileURL(url, url_len, parsed);
241
+ }
242
+
243
+ } // namespace url_parse