uri_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,336 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #ifndef GOOGLEURL_SRC_URL_PARSE_H__
31
+ #define GOOGLEURL_SRC_URL_PARSE_H__
32
+
33
+ #include <string>
34
+
35
+ #include "basictypes.h"
36
+ #include "string16.h"
37
+ #include "url_common.h"
38
+
39
+ namespace url_parse {
40
+
41
+ // Deprecated alias for char16. It is kept only because WebKit's
42
+ // WebCore/platform/KURLGooglePrivate.h and KURLGoogle.cpp still rely on it.
43
+ typedef char16 UTF16Char;
44
+
45
+ // Component ------------------------------------------------------------------
46
+
47
+ // Represents a substring (an offset plus a length) for URL parsing.
48
+ struct Component {
49
+ Component() : begin(0), len(-1) {} // Starts out unspecified (len == -1).
50
+
51
+ // Normal constructor: takes an offset and a length.
52
+ Component(int b, int l) : begin(b), len(l) {}
53
+ // Returns begin + len, i.e. the offset one past the component's last character.
54
+ int end() const {
55
+ return begin + len;
56
+ }
57
+
58
+ // Returns true if this component is valid, meaning the length is given. Even
59
+ // valid components may be empty to record the fact that they exist.
60
+ bool is_valid() const {
61
+ return (len != -1);
62
+ }
63
+
64
+ // Returns true if the component is specified and non-empty. On false, the
65
+ // component is either empty or invalid.
66
+ bool is_nonempty() const {
67
+ return (len > 0);
68
+ }
69
+ // Returns the component to the unspecified state (begin 0, len -1).
70
+ void reset() {
71
+ begin = 0;
72
+ len = -1;
73
+ }
74
+ // Two components are equal when both begin and len match.
75
+ bool operator==(const Component& other) const {
76
+ return begin == other.begin && len == other.len;
77
+ }
78
+
79
+ int begin; // Byte offset in the string of this component.
80
+ int len; // Will be -1 if the component is unspecified.
81
+ };
82
+
83
+ // Helper that returns a component covering the half-open range [begin, end):
84
+ // the length is end - begin, so the ending point itself is not included.
85
+ inline Component MakeRange(int begin, int end) {
86
+ return Component(begin, end - begin);
87
+ }
88
+
89
+ // Parsed ---------------------------------------------------------------------
90
+
91
+ // A structure that holds the identified parts of an input URL. This structure
92
+ // does NOT store the URL itself. The caller will have to store the URL text
93
+ // and its corresponding Parsed structure separately.
94
+ //
95
+ // Typical usage would be:
96
+ //
97
+ // url_parse::Parsed parsed;
98
+ // url_parse::Component scheme;
99
+ // if (!url_parse::ExtractScheme(url, url_len, &scheme))
100
+ // return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
101
+ //
102
+ // if (IsStandardScheme(url, scheme)) // Not provided by this component
103
+ // url_parse::ParseStandardURL(url, url_len, &parsed);
104
+ // else if (IsFileURL(url, scheme)) // Not provided by this component
105
+ // url_parse::ParseFileURL(url, url_len, &parsed);
106
+ // else
107
+ // url_parse::ParsePathURL(url, url_len, &parsed);
108
+ //
109
+ struct Parsed {
110
+ // Identifies different components.
111
+ enum ComponentType {
112
+ SCHEME,
113
+ USERNAME,
114
+ PASSWORD,
115
+ HOST,
116
+ PORT,
117
+ PATH,
118
+ QUERY,
119
+ REF,
120
+ };
121
+
122
+ // The default constructor is sufficient for the components.
123
+ GURL_API Parsed();
124
+
125
+ // Returns the length of the URL (the end of the last component).
126
+ //
127
+ // Note that for some invalid, non-canonical URLs, this may not be the length
128
+ // of the string. For example "http://": the parsed structure will only
129
+ // contain an entry for the four-character scheme, and it doesn't know about
130
+ // the "://". For all other last-components, it will return the real length.
131
+ GURL_API int Length() const;
132
+
133
+ // Returns the number of characters before the given component if it exists,
134
+ // or where the component would be if it did exist. This will return the
135
+ // string length if the component would be appended to the end.
136
+ //
137
+ // Note that this can get a little funny for the port, query, and ref
138
+ // components which have a delimiter that is not counted as part of the
139
+ // component. The |include_delimiter| flag controls if you want this counted
140
+ // as part of the component or not when the component exists.
141
+ //
142
+ // This example shows the difference between the two flags for two of these
143
+ // delimited components that are present (the port and query) and one that
144
+ // isn't (the reference). The components that this flag affects are marked
145
+ // with a *.
146
+ // 0 1 2
147
+ // 012345678901234567890
148
+ // Example input: http://foo:80/?query
149
+ // include_delim=true, ...=false ("<-" indicates different)
150
+ // SCHEME: 0 0
151
+ // USERNAME: 5 5
152
+ // PASSWORD: 5 5
153
+ // HOST: 7 7
154
+ // *PORT: 10 11 <-
155
+ // PATH: 13 13
156
+ // *QUERY: 14 15 <-
157
+ // *REF: 20 20
158
+ //
159
+ GURL_API int CountCharactersBefore(ComponentType type,
160
+ bool include_delimiter) const;
161
+
162
+ // Scheme without the colon: "http://foo/" would have a scheme of "http".
163
+ // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
164
+ // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed
165
+ // to start at the beginning of the string if there are preceding whitespace
166
+ // or control characters.
167
+ Component scheme;
168
+
169
+ // Username. Specified in URLs with an @ sign before the host. See |password|.
170
+ Component username;
171
+
172
+ // Password. The length will be -1 if unspecified, 0 if specified but empty.
173
+ // Not all URLs with a username have a password, as in "http://me@host/".
174
+ // The password is separated from the username with a colon, as in
175
+ // "http://me:secret@host/".
176
+ Component password;
177
+
178
+ // Host name.
179
+ Component host;
180
+
181
+ // Port number.
182
+ Component port;
183
+
184
+ // Path, this is everything following the host name. Length will be -1 if
185
+ // unspecified. This includes the preceding slash, so the path on
186
+ // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to
187
+ // have a 0 length path, it will be -1 in cases like "http://host?foo".
188
+ // Note that we treat backslashes the same as slashes.
189
+ Component path;
190
+
191
+ // Stuff between the ? and the # after the path. This does not include the
192
+ // preceding ? character. Length will be -1 if unspecified, 0 if there is
193
+ // a question mark but no query string.
194
+ Component query;
195
+
196
+ // Indicated by a #, this is everything following the hash sign (not
197
+ // including it). If there are multiple hash signs, we'll use the last one.
198
+ // Length will be -1 if there is no hash sign, or 0 if there is one but
199
+ // nothing follows it.
200
+ Component ref;
201
+ };
202
+
203
+ // Initialization functions ---------------------------------------------------
204
+ //
205
+ // These functions parse the given URL, filling in all of the structure's
206
+ // components. These functions can not fail, they will always do their best
207
+ // at interpreting the input given.
208
+ //
209
+ // The string length of the URL MUST be specified, we do not check for NULLs
210
+ // at any point in the process, and will actually handle embedded NULLs.
211
+ //
212
+ // IMPORTANT: These functions do NOT hang on to the given pointer or copy it
213
+ // in any way. See the comment above the struct.
214
+ //
215
+ // The 8-bit versions require UTF-8 encoding.
216
+
217
+ // StandardURL is for when the scheme is known to be one that has an
218
+ // authority (host) like "http". This function will not handle weird ones
219
+ // like "about:" and "javascript:", or do the right thing for "file:" URLs.
220
+ GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
221
+ GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
222
+
223
+ // PathURL is for when the scheme is known not to have an authority (host)
224
+ // section but isn't a file URL either. The scheme is parsed, and
225
+ // everything after the scheme is considered as the path. This is used for
226
+ // things like "about:" and "javascript:"
227
+ GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
228
+ GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
229
+
230
+ // FileURL is for file URLs. There are some special rules for interpreting
231
+ // these.
232
+ GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
233
+ GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
234
+
235
+ // MailtoURL is for mailto: URLs. They are made up of a scheme, path, and query.
236
+ GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
237
+ GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
238
+
239
+ // Helper functions -----------------------------------------------------------
240
+
241
+ // Locates the scheme according to the URL parser's rules. This function is
242
+ // designed so the caller can find the scheme and call the correct Parse*
243
+ // function according to their known scheme types.
244
+ //
245
+ // It also does not perform any validation on the scheme.
246
+ //
247
+ // This function will return true if the scheme is found and will put the
248
+ // scheme's range into *scheme. False means no scheme could be found. Note
249
+ // that a URL beginning with a colon has a scheme, but it is empty, so this
250
+ // function will return true but *scheme will = (0,0).
251
+ //
252
+ // The scheme is found by skipping spaces and control characters at the
253
+ // beginning, and taking everything from there to the first colon to be the
254
+ // scheme. The character at scheme.end() will be the colon (we may enhance
255
+ // this to handle full width colons or something, so don't count on the
256
+ // actual character value). The character at scheme.end()+1 will be the
257
+ // beginning of the rest of the URL, be it the authority or the path (or the
258
+ // end of the string).
259
+ //
260
+ // The 8-bit version requires UTF-8 encoding.
261
+ GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
262
+ GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
263
+
264
+ // Returns true if ch is a character that terminates the authority segment
265
+ // of a URL.
266
+ GURL_API bool IsAuthorityTerminator(char16 ch);
267
+
268
+ // Does a best effort parse of input |spec|, in range |auth|. If a particular
269
+ // component is not found, it will be set to invalid.
270
+ GURL_API void ParseAuthority(const char* spec,
271
+ const Component& auth,
272
+ Component* username,
273
+ Component* password,
274
+ Component* hostname,
275
+ Component* port_num);
276
+ GURL_API void ParseAuthority(const char16* spec,
277
+ const Component& auth,
278
+ Component* username,
279
+ Component* password,
280
+ Component* hostname,
281
+ Component* port_num);
282
+
283
+ // Computes the integer port value from the given port component. The port
283
+ // component should have been identified by one of the Parse*URL functions,
284
+ // which record it in |Parsed|, for the given input url.
285
+ //
286
+ // The return value will be a non-negative integer between 0 and 64K, or one
287
+ // of the two special (negative) values below.
288
+ enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
289
+ GURL_API int ParsePort(const char* url, const Component& port);
290
+ GURL_API int ParsePort(const char16* url, const Component& port);
292
+
293
+ // Extracts the range of the file name in the given url. The path must
294
+ // already have been computed by the parse function, and the matching URL
295
+ // and extracted path are provided to this function. The filename is
296
+ // defined as being everything from the last slash/backslash of the path
297
+ // to the end of the path.
298
+ //
299
+ // The file name will be empty if the path is empty or there is nothing
300
+ // following the last slash.
301
+ //
302
+ // The 8-bit version requires UTF-8 encoding.
303
+ GURL_API void ExtractFileName(const char* url,
304
+ const Component& path,
305
+ Component* file_name);
306
+ GURL_API void ExtractFileName(const char16* url,
307
+ const Component& path,
308
+ Component* file_name);
309
+
310
+ // Extract the first key/value from the range defined by |*query|. Updates
311
+ // |*query| to start at the end of the extracted key/value pair. This is
312
+ // designed for use in a loop: you can keep calling it with the same query
313
+ // object and it will iterate over all items in the query.
314
+ //
315
+ // Some key/value pairs may have the key, the value, or both be empty (for
316
+ // example, the query string "?&"). These will be returned. Note that an empty
317
+ // last parameter "foo.com?" or "foo.com?a&" will not be returned; this case
318
+ // is the same as "done."
319
+ //
320
+ // The initial query component should not include the '?' (this is the default
321
+ // for parsed URLs).
322
+ //
323
+ // If no key/value are found |*key| and |*value| will be unchanged and it will
324
+ // return false.
325
+ GURL_API bool ExtractQueryKeyValue(const char* url,
326
+ Component* query,
327
+ Component* key,
328
+ Component* value);
329
+ GURL_API bool ExtractQueryKeyValue(const char16* url,
330
+ Component* query,
331
+ Component* key,
332
+ Component* value);
333
+
334
+ } // namespace url_parse
335
+
336
+ #endif // GOOGLEURL_SRC_URL_PARSE_H__
@@ -0,0 +1,243 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "logging.h"
31
+ #include "url_file.h"
32
+ #include "url_parse.h"
33
+ #include "url_parse_internal.h"
34
+
35
+ // Interesting IE file:isms...
36
+ //
37
+ // INPUT OUTPUT
38
+ // ========================= ==============================
39
+ // file:/foo/bar file:///foo/bar
40
+ // The result here seems totally invalid!?!? This isn't UNC.
41
+ //
42
+ // file:/
43
+ // file:// or any other number of slashes
44
+ // IE6 doesn't do anything at all if you click on this link. No error:
45
+ // nothing. IE6's history system seems to always color this link, so I'm
46
+ // guessing that it maps internally to the empty URL.
47
+ //
48
+ // C:\ file:///C:/
49
+ // When on a file: URL source page, this link will work. When over HTTP,
50
+ // the file: URL will appear in the status bar but the link will not work
51
+ // (security restriction for all file URLs).
52
+ //
53
+ // file:foo/ file:foo/ (invalid?!?!?)
54
+ // file:/foo/ file:///foo/ (invalid?!?!?)
55
+ // file://foo/ file://foo/ (UNC to server "foo")
56
+ // file:///foo/ file:///foo/ (invalid, seems to be a file)
57
+ // file:////foo/ file://foo/ (UNC to server "foo")
58
+ // Any more than four slashes is also treated as UNC.
59
+ //
60
+ // file:C:/ file://C:/
61
+ // file:/C:/ file://C:/
62
+ // The number of slashes after "file:" doesn't matter if the thing following
63
+ // it looks like an absolute drive path. Also, slashes and backslashes are
64
+ // equally valid here.
65
+
66
+ namespace url_parse {
67
+
68
+ namespace {
69
+
70
+ // A subcomponent of DoParseFileURL, the input of this function should be a UNC
70
+ // path name, with the index of the first character after the slashes following
71
+ // the scheme given in |after_slashes|. This will initialize the host and path
72
+ // (plus the query and ref, via ParsePathInternal, when a slash follows the
73
+ // host), and leave the other output components untouched (the caller does it).
74
+ template<typename CHAR>
75
+ void DoParseUNC(const CHAR* spec,
76
+ int after_slashes,
77
+ int spec_len,
78
+ Parsed* parsed) {
79
+ int next_slash = FindNextSlash(spec, after_slashes, spec_len);
80
+ if (next_slash == spec_len) {
81
+ // No additional slash found, as in "file://foo", treat the text as the
82
+ // host with no path (this will end up being UNC to server "foo").
83
+ int host_len = spec_len - after_slashes;
84
+ if (host_len)
85
+ parsed->host = Component(after_slashes, host_len);
86
+ else
87
+ parsed->host.reset();
88
+ parsed->path.reset();
89
+ return;
90
+ }
91
+
92
+ #ifdef WIN32
93
+ // See if we have something that looks like a path following the first
94
+ // component. As in "file://localhost/c:/", we get "c:/" out. We want to
95
+ // treat this as having no host but the path given. Works on Windows only.
96
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
97
+ parsed->host.reset();
98
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
99
+ &parsed->path, &parsed->query, &parsed->ref);
100
+ return;
101
+ }
102
+ #endif
103
+
104
+ // Otherwise, everything up until that first slash we found is the host name,
105
+ // which will end up being the UNC host. For example "file://foo/bar.txt"
106
+ // will get a server name of "foo" and a path of "/bar.txt". Later, on
107
+ // Windows, this should be treated as the filename "\\foo\bar.txt" in proper
108
+ // UNC notation.
109
+ int host_len = next_slash - after_slashes;
110
+ if (host_len)
111
+ parsed->host = MakeRange(after_slashes, next_slash);
112
+ else
113
+ parsed->host.reset();
114
+ if (next_slash < spec_len) {
115
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
116
+ &parsed->path, &parsed->query, &parsed->ref);
117
+ } else { // NOTE(review): looks unreachable, the next_slash == spec_len case returned above - confirm.
118
+ parsed->path.reset();
119
+ }
120
+ }
122
+
123
+ // A subcomponent of DoParseFileURL, the input should be a local file, with the
123
+ // beginning of the path indicated by the index in |path_begin|. This resets
124
+ // the host and hands the path, query, and ref to ParsePathInternal, leaving
125
+ // the other output components untouched (DoParseFileURL handles these for us).
126
+ template<typename CHAR>
127
+ void DoParseLocalFile(const CHAR* spec,
128
+ int path_begin,
129
+ int spec_len,
130
+ Parsed* parsed) {
131
+ parsed->host.reset();
132
+ ParsePathInternal(spec, MakeRange(path_begin, spec_len),
133
+ &parsed->path, &parsed->query, &parsed->ref);
134
+ }
136
+
137
+ // Backend for the external functions that operates on either char type.
138
+ // We are handed the whole spec and locate the scheme (if any) ourselves. What
139
+ // follows "file:" is usually a slash, but needn't be; we allow "file:c:\foo".
140
+ template<typename CHAR>
141
+ void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
142
+ DCHECK(spec_len >= 0);
143
+
144
+ // Get the parts we never use for file URLs out of the way.
145
+ parsed->username.reset();
146
+ parsed->password.reset();
147
+ parsed->port.reset();
148
+
149
+ // Many of the code paths don't set these, so it's convenient to just clear
150
+ // them. We'll write them in those cases we need them.
151
+ parsed->query.reset();
152
+ parsed->ref.reset();
153
+
154
+ // Strip leading & trailing spaces and control characters. |begin| and |spec_len| are passed by address so TrimURL can adjust them.
155
+ int begin = 0;
156
+ TrimURL(spec, &begin, &spec_len);
157
+
158
+ // Find the scheme.
159
+ int num_slashes;
160
+ int after_scheme;
161
+ int after_slashes;
162
+ #ifdef WIN32
163
+ // See how many slashes there are. We want to handle cases like UNC but also
164
+ // "/c:/foo". This is when there is no scheme, so we can allow pages to do
165
+ // links like "c:/foo/bar" or "//foo/bar". This is also called by the
166
+ // relative URL resolver when it determines there is an absolute URL, which
167
+ // may give us input like "/c:/foo".
168
+ num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
169
+ after_slashes = begin + num_slashes;
170
+ if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
171
+ // Windows path, don't try to extract the scheme (for example, "c:\foo").
172
+ parsed->scheme.reset();
173
+ after_scheme = after_slashes;
174
+ } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
175
+ // Windows UNC path: don't try to extract the scheme, but keep the slashes.
176
+ parsed->scheme.reset();
177
+ after_scheme = begin;
178
+ } else
179
+ #endif
180
+ {
181
+ if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
182
+ // Offset the results since we gave ExtractScheme a substring.
183
+ parsed->scheme.begin += begin;
184
+ after_scheme = parsed->scheme.end() + 1; // Skip past the colon.
185
+ } else {
186
+ // No scheme found, remember that.
187
+ parsed->scheme.reset();
188
+ after_scheme = begin;
189
+ }
190
+ }
191
+
192
+ // Handle empty specs, ones that contain only whitespace or control chars,
193
+ // or ones that are just the scheme (for example "file:").
194
+ if (after_scheme == spec_len) {
195
+ parsed->host.reset();
196
+ parsed->path.reset();
197
+ return;
198
+ }
199
+
200
+ num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
201
+
202
+ after_slashes = after_scheme + num_slashes;
203
+ #ifdef WIN32
204
+ // Check whether the input is a drive again. We checked above for windows
205
+ // drive specs, but that's only at the very beginning to see if we have a
206
+ // scheme at all. This test will be duplicated in that case, but will
207
+ // additionally handle all cases with a real scheme such as "file:///C:/".
208
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
209
+ num_slashes != 3) {
210
+ // Anything not beginning with a drive spec ("c:\") on Windows is treated
211
+ // as UNC, with the exception of three slashes which always means a file.
212
+ // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
213
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
214
+ return;
215
+ }
216
+ #else
217
+ // file: URL with exactly 2 slashes is considered to have a host component.
218
+ if (num_slashes == 2) {
219
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
220
+ return;
221
+ }
222
+ #endif // WIN32
223
+
224
+ // Easy and common case, the full path immediately follows the scheme
225
+ // (modulo slashes), as in "file://c:/foo". Just treat everything from
226
+ // there to the end as the path. DoParseLocalFile resets the host (length -1).
227
+ // We include the last slash as part of the path if there is one.
228
+ DoParseLocalFile(spec,
229
+ num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
230
+ spec_len, parsed);
231
+ }
232
+
233
+ } // namespace
234
+ // 8-bit overload: forwards to the shared DoParseFileURL template.
235
+ void ParseFileURL(const char* url, int url_len, Parsed* parsed) {
236
+ DoParseFileURL(url, url_len, parsed);
237
+ }
238
+ // char16 overload: forwards to the shared DoParseFileURL template.
239
+ void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {
240
+ DoParseFileURL(url, url_len, parsed);
241
+ }
242
+
243
+ } // namespace url_parse