uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,157 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ // As with url_canon_internal.h, this file is intended to be included in
31
+ // another C++ file where the template types are defined. This allows the
32
+ // programmer to use these functions for their own string
33
+ // types, without bloating the code by having inline templates used in
34
+ // every call site.
35
+ //
36
+ // *** This file must be included after url_canon_internal as we depend on some
37
+ // functions in it. ***
38
+
39
+ #ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
40
+ #define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
41
+
42
+ #include "url_file.h"
43
+ #include "url_parse_internal.h"
44
+
45
+ using namespace url_canon;
46
+
47
+ // Copies a Windows drive spec (a drive letter followed by a colon or pipe,
48
+ // optionally preceded by slashes) from |spec| to |output| as an uppercase
49
+ // letter, a colon and a '/'. Does nothing when no drive spec is found. Returns
50
+ // the index of the next input character to process: just past the colon/pipe
51
+ // when a drive spec was found, or |begin| unchanged when it was not.
52
+ template<typename CHAR>
53
+ static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
54
+ CanonOutput* output) {
55
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
56
+ // (with backslashes instead of slashes as well).
57
+ int num_slashes = CountConsecutiveSlashes(spec, begin, end);
58
+ int after_slashes = begin + num_slashes;
59
+
60
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
61
+ return begin; // Haven't consumed any characters
62
+
63
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
64
+ // and that it is followed by a colon/pipe.
65
+
66
+ // Normalize Windows drive letters to uppercase. The arithmetic below yields
67
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')  // an int, so narrow it explicitly like the else branch does.
68
+ output->push_back(static_cast<char>(spec[after_slashes] - 'a' + 'A'));
69
+ else
70
+ output->push_back(static_cast<char>(spec[after_slashes]));
71
+
72
+ // Normalize the character following it to a colon rather than pipe.
73
+ output->push_back(':');
74
+ output->push_back('/');  // Slash after the drive spec; FileDoPath skips the input's own slashes.
75
+ return after_slashes + 2;  // Skip the drive letter and the colon/pipe.
76
+ }
77
+
78
+ // A leading slash has already been written by the caller (and another by
79
+ // FileDoDriveSpec after a drive spec), so we only need to write everything
80
+ template<typename CHAR, typename UCHAR>  // following the input's slashes, using the path canonicalizer.
81
+ static void FileDoPath(const CHAR* spec, int begin, int end,
82
+ CanonOutput* output) {
83
+ // Normalize the number of slashes after the drive letter. The path
84
+ // canonicalizer expects the input to begin in a slash already so
85
+ // doesn't check. We want to handle the no-slashes case as well.
86
+ int num_slashes = CountConsecutiveSlashes(spec, begin, end);
87
+ int after_slashes = begin + num_slashes;
88
+
89
+ // Now use the regular path canonicalizer to canonicalize the rest of the
90
+ // path. We supply it with the path following the slashes. It won't prepend
91
+ // a slash because it assumes any nonempty path already starts with one.
92
+ // We explicitly filter out calls with no path here to prevent that case.
93
+ ParsedURL::Component sub_path(after_slashes, end - after_slashes);
94
+ if (sub_path.len > 0) {
95
+ // Give it a fake output component to write into. DoCanonicalizeFile will
96
+ // compute the full path component.
97
+ ParsedURL::Component fake_output_path;
98
+ URLCanonInternal<CHAR, UCHAR>::DoPath(
99
+ spec, sub_path, output, &fake_output_path);  // The return value of DoPath is not checked here.
100
+ }
101
+ }
102
+ // Writes the canonical form of a file: URL to |output| and records the output component ranges in |new_parsed|.
103
+ template<typename CHAR, typename UCHAR>
104
+ static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
105
+ const ParsedURL& parsed,
106
+ CanonOutput* output,
107
+ ParsedURL* new_parsed) {
108
+ // Things we don't set in file: URLs.
109
+ new_parsed->username = ParsedURL::Component(0, -1);
110
+ new_parsed->password = ParsedURL::Component(0, -1);
111
+ new_parsed->port = ParsedURL::Component(0, -1);
112
+
113
+ // Scheme (known, so we don't bother running it through the more
114
+ // complicated scheme canonicalizer).
115
+ new_parsed->scheme.begin = output->length();
116
+ output->push_back('f');
117
+ output->push_back('i');
118
+ output->push_back('l');
119
+ output->push_back('e');
120
+ new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;  // Excludes the ':' written next.
121
+ output->push_back(':');
122
+
123
+ // Write the separator for the host.
124
+ output->push_back('/');
125
+ output->push_back('/');
126
+
127
+ // Append the host. For many file URLs, this will be empty. For UNC, this
128
+ // will be present.
129
+ // TODO(brettw) This doesn't do any checking for host name validity. We
130
+ // should probably handle validity checking of UNC hosts differently than
131
+ // for regular IP hosts.
132
+ bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
133
+ source.host, parsed.host, output, &new_parsed->host);
134
+
135
+ // Write a separator for the start of the path. We'll ignore any slashes
136
+ // already at the beginning of the path.
137
+ new_parsed->path.begin = output->length();
138
+ output->push_back('/');
139
+
140
+ // Copies and normalizes the "c:" at the beginning, if present.
141
+ int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
142
+ parsed.path.end(), output);
143
+
144
+ // Copies the rest of the path
145
+ FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
146
+ new_parsed->path.len = output->length() - new_parsed->path.begin;  // Covers the leading '/', drive spec and path.
147
+
148
+ // Things following the path we can use the standard canonicalizers for.
149
+ success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
150
+ source.query, parsed.query, output, &new_parsed->query);
151
+ success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
152
+ source.ref, parsed.ref, output, &new_parsed->ref);
153
+
154
+ return success;  // False if the host, query or ref canonicalizer reported failure.
155
+ }
156
+
157
+ #endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
@@ -0,0 +1,737 @@
1
+ // Copyright 2009, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include "url_canon_ip.h"
31
+
32
+ #include <stdlib.h>
33
+
34
+ #include "basictypes.h"
35
+ #include "logging.h"
36
+ #include "url_canon_internal.h"
37
+
38
+ namespace url_canon {
39
+
40
+ namespace {
41
+
42
+ // Converts one of the character types that represent a numerical base to the
43
+ // corresponding base (16, 10 or 8). Any other character type yields 0.
44
+ int BaseForType(SharedCharTypes type) {
45
+ switch (type) {
46
+ case CHAR_HEX:
47
+ return 16;
48
+ case CHAR_DEC:
49
+ return 10;
50
+ case CHAR_OCT:
51
+ return 8;
52
+ default:  // Not a type that represents a numerical base.
53
+ return 0;
54
+ }
55
+ }
56
+ // Splits |host| at dots into up to four IPv4 components. Returns false for an empty host, an empty or extra component, or a non-IPv4 character.
57
+ template<typename CHAR, typename UCHAR>
58
+ bool DoFindIPv4Components(const CHAR* spec,
59
+ const url_parse::Component& host,
60
+ url_parse::Component components[4]) {
61
+ if (!host.is_nonempty())
62
+ return false;
63
+
64
+ int cur_component = 0; // Index of the component we're working on.
65
+ int cur_component_begin = host.begin; // Start of the current component.
66
+ int end = host.end();
67
+ for (int i = host.begin; /* nothing */; i++) {
68
+ if (i >= end || spec[i] == '.') {
69
+ // Found the end of the current component.
70
+ int component_len = i - cur_component_begin;
71
+ components[cur_component] =
72
+ url_parse::Component(cur_component_begin, component_len);
73
+
74
+ // The next component starts after the dot.
75
+ cur_component_begin = i + 1;
76
+ cur_component++;
77
+
78
+ // Don't allow empty components (two dots in a row), except we may
79
+ // allow an empty component at the end (this would indicate that the
80
+ // input ends in a dot). We also want to error if the component is
81
+ // empty and it's the only component (cur_component == 1).
82
+ if (component_len == 0 && (i < end || cur_component == 1))
83
+ return false;
84
+
85
+ if (i >= end)
86
+ break; // End of the input.
87
+
88
+ if (cur_component == 4) {
89
+ // Anything else after the 4th component is an error unless it is a
90
+ // dot that would otherwise be treated as the end of input.
91
+ if (spec[i] == '.' && i + 1 == end)
92
+ break;
93
+ return false;
94
+ }
95
+ } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
96
+ !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
97
+ // Invalid character for an IPv4 address.
98
+ return false;
99
+ }
100
+ }
101
+
102
+ // Fill in any unused components.
103
+ while (cur_component < 4)
104
+ components[cur_component++] = url_parse::Component();  // Default-constructed marker for "not present".
105
+ return true;
106
+ }
107
+
108
+ // Converts an IPv4 component to a 32-bit number, while checking for overflow.
109
+ // |*number| is written only when IPV4 is returned.
110
+ // Possible return values:
111
+ // - IPV4 - The number was valid, and did not overflow.
112
+ // - BROKEN - The input was numeric, but too large for a 32-bit field.
113
+ // - NEUTRAL - Input was not numeric.
114
+ //
115
+ // The input is assumed to be ASCII. FindIPv4Components should have stripped
116
+ // out any input that is greater than 7 bits. The components are assumed
117
+ // to be non-empty.
118
+ template<typename CHAR>
119
+ CanonHostInfo::Family IPv4ComponentToNumber(
120
+ const CHAR* spec,
121
+ const url_parse::Component& component,
122
+ uint32* number) {
123
+ // Figure out the base
124
+ SharedCharTypes base;
125
+ int base_prefix_len = 0; // Size of the prefix for this base.
126
+ if (spec[component.begin] == '0') {
127
+ // Either hex or octal, or a standalone zero.
128
+ if (component.len == 1) {
129
+ base = CHAR_DEC;
130
+ } else if (spec[component.begin + 1] == 'X' ||
131
+ spec[component.begin + 1] == 'x') {
132
+ base = CHAR_HEX;
133
+ base_prefix_len = 2;
134
+ } else {
135
+ base = CHAR_OCT;
136
+ base_prefix_len = 1;
137
+ }
138
+ } else {
139
+ base = CHAR_DEC;
140
+ }
141
+
142
+ // Extend the prefix to consume all leading zeros.
143
+ while (base_prefix_len < component.len &&
144
+ spec[component.begin + base_prefix_len] == '0')
145
+ base_prefix_len++;
146
+
147
+ // Put the component, minus any base prefix, into a NULL-terminated buffer so
148
+ // we can call the standard library. Because leading zeros have already been
149
+ // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
150
+ // overflow check.
151
+ const int kMaxComponentLen = 16;
152
+ char buf[kMaxComponentLen + 1]; // digits + '\0'
153
+ int dest_i = 0;
154
+ for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
155
+ // We know the input is 7-bit, so convert to narrow (if this is the wide
156
+ // version of the template) by casting.
157
+ char input = static_cast<char>(spec[i]);
158
+
159
+ // Validate that this character is OK for the given base.
160
+ if (!IsCharOfType(input, base))
161
+ return CanonHostInfo::NEUTRAL;
162
+
163
+ // Fill the buffer, if there's space remaining. This check allows us to
164
+ // verify that all characters are numeric, even those that don't fit.
165
+ if (dest_i < kMaxComponentLen)
166
+ buf[dest_i++] = input;
167
+ }
168
+
169
+ buf[dest_i] = '\0';
170
+
171
+ // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
172
+ // number can overflow a 64-bit number in <= 16 characters).
173
+ uint64 num = _strtoui64(buf, NULL, BaseForType(base));
174
+
175
+ // Check for 32-bit overflow.
176
+ if (num > kuint32max)
177
+ return CanonHostInfo::BROKEN;
178
+
179
+ // No overflow. Success!
180
+ *number = static_cast<uint32>(num);
181
+ return CanonHostInfo::IPV4;
182
+ }
183
+
184
+ // Writes the given address (with each byte representing one dotted
185
+ // part of an IPv4 address) to the output in dotted decimal, and updates
186
+ // |*out_host| to identify the added portion.
187
+ void AppendIPv4Address(const unsigned char address[4],
188
+ CanonOutput* output,
189
+ url_parse::Component* out_host) {
190
+ out_host->begin = output->length();
191
+ for (int i = 0; i < 4; i++) {
192
+ char str[16];
193
+ _itoa_s(address[i], str, 10);  // Decimal text for this byte.
194
+
195
+ for (int ch = 0; str[ch] != 0; ch++)
196
+ output->push_back(str[ch]);
197
+
198
+ if (i != 3)  // Dots go between parts, not after the last one.
199
+ output->push_back('.');
200
+ }
201
+ out_host->len = output->length() - out_host->begin;
202
+ }
203
+
204
+ // See declaration of IPv4AddressToNumber for documentation. Returns NEUTRAL when |host| is not numeric IPv4, BROKEN on overflow, IPV4 on success.
205
+ template<typename CHAR>
206
+ CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
207
+ const url_parse::Component& host,
208
+ unsigned char address[4],
209
+ int* num_ipv4_components) {
210
+ // The identified components. Not all may exist.
211
+ url_parse::Component components[4];
212
+ if (!FindIPv4Components(spec, host, components))
213
+ return CanonHostInfo::NEUTRAL;
214
+
215
+ // Convert existing components to digits. Values up to
216
+ // |existing_components| will be valid.
217
+ uint32 component_values[4];
218
+ int existing_components = 0;
219
+ for (int i = 0; i < 4; i++) {
220
+ if (components[i].len <= 0)  // Skip components that are absent or empty.
221
+ continue;
222
+ CanonHostInfo::Family family = IPv4ComponentToNumber(
223
+ spec, components[i], &component_values[existing_components]);
224
+
225
+ // Stop if we hit an invalid non-empty component.
226
+ if (family != CanonHostInfo::IPV4)
227
+ return family;
228
+
229
+ existing_components++;
230
+ }
231
+
232
+ // Use that sequence of numbers to fill out the 4-component IP address.
233
+
234
+ // First, process all components but the last, while making sure each fits
235
+ // within an 8-bit field.
236
+ for (int i = 0; i < existing_components - 1; i++) {
237
+ if (component_values[i] > kuint8max)
238
+ return CanonHostInfo::BROKEN;  // Earlier entries of |address| may already have been written.
239
+ address[i] = static_cast<unsigned char>(component_values[i]);
240
+ }
241
+
242
+ // Next, consume the last component to fill in the remaining bytes.
243
+ uint32 last_value = component_values[existing_components - 1];
244
+ for (int i = 3; i >= existing_components - 1; i--) {  // Low byte goes into address[3], working backwards.
245
+ address[i] = static_cast<unsigned char>(last_value);
246
+ last_value >>= 8;
247
+ }
248
+
249
+ // If the last component has residual bits, report overflow.
250
+ if (last_value != 0)
251
+ return CanonHostInfo::BROKEN;
252
+
253
+ // Tell the caller how many components we saw. (Only set on success.)
254
+ *num_ipv4_components = existing_components;
255
+
256
+ // Success!
257
+ return CanonHostInfo::IPV4;
258
+ }
259
+
260
+ // Return true if we've made a final IPV4/BROKEN decision, false if the result
261
+ // is NEUTRAL, and we could use a second opinion. Output is appended only for IPV4.
262
+ template<typename CHAR, typename UCHAR>
263
+ bool DoCanonicalizeIPv4Address(const CHAR* spec,
264
+ const url_parse::Component& host,
265
+ CanonOutput* output,
266
+ CanonHostInfo* host_info) {
267
+ unsigned char address[4];
268
+ host_info->family = IPv4AddressToNumber(
269
+ spec, host, address, &host_info->num_ipv4_components);
270
+
271
+ switch (host_info->family) {
272
+ case CanonHostInfo::IPV4:
273
+ // Definitely an IPv4 address.
274
+ AppendIPv4Address(address, output, &host_info->out_host);
275
+ return true;
276
+ case CanonHostInfo::BROKEN:
277
+ // Definitely broken. Nothing is written to |output| in this case.
278
+ return true;
279
+ default:
280
+ // Could be IPv6 or a hostname.
281
+ return false;
282
+ }
283
+ }
284
+
285
+ // Helper class that describes the main components of an IPv6 input string.
286
+ // See the following examples to understand how it breaks up an input string:
287
+ //
288
+ // [Example 1]: input = "[::aa:bb]"
289
+ // ==> num_hex_components = 2
290
+ // ==> hex_components[0] = Component(3,2) "aa"
291
+ // ==> hex_components[1] = Component(6,2) "bb"
292
+ // ==> index_of_contraction = 0
293
+ // ==> ipv4_component = Component(0, -1)
294
+ //
295
+ // [Example 2]: input = "[1:2::3:4:5]"
296
+ // ==> num_hex_components = 5
297
+ // ==> hex_components[0] = Component(1,1) "1"
298
+ // ==> hex_components[1] = Component(3,1) "2"
299
+ // ==> hex_components[2] = Component(6,1) "3"
300
+ // ==> hex_components[3] = Component(8,1) "4"
301
+ // ==> hex_components[4] = Component(10,1) "5"
302
+ // ==> index_of_contraction = 2
303
+ // ==> ipv4_component = Component(0, -1)
304
+ //
305
+ // [Example 3]: input = "[::ffff:192.168.0.1]"
306
+ // ==> num_hex_components = 1
307
+ // ==> hex_components[0] = Component(3,4) "ffff"
308
+ // ==> index_of_contraction = 0
309
+ // ==> ipv4_component = Component(8, 11) "192.168.0.1"
310
+ //
311
+ // [Example 4]: input = "[1::]"
312
+ // ==> num_hex_components = 1
313
+ // ==> hex_components[0] = Component(1,1) "1"
314
+ // ==> index_of_contraction = 1
315
+ // ==> ipv4_component = Component(0, -1)
316
+ //
317
+ // [Example 5]: input = "[::192.168.0.1]"
318
+ // ==> num_hex_components = 0
319
+ // ==> index_of_contraction = 0
320
+ // ==> ipv4_component = Component(3, 11) "192.168.0.1"
321
+ //
322
+ struct IPv6Parsed {
323
+ // Zero-out the parse information. |hex_components| is left untouched.
324
+ void reset() {
325
+ num_hex_components = 0;
326
+ index_of_contraction = -1;  // -1 means "no contraction seen".
327
+ ipv4_component.reset();
328
+ }
329
+
330
+ // There can be up to 8 hex components (colon separated) in the literal.
331
+ url_parse::Component hex_components[8];
332
+
333
+ // The count of hex components present. Ranges from [0,8].
334
+ int num_hex_components;
335
+
336
+ // The index of the hex component that the "::" contraction precedes, or
337
+ // -1 if there is no contraction.
338
+ int index_of_contraction;
339
+
340
+ // The range of characters which are an IPv4 literal.
341
+ url_parse::Component ipv4_component;
342
+ };
343
+
344
+ // Parse the IPv6 input string (|host| must exclude the square brackets). If
345
+ // parsing succeeded returns true and fills |parsed| with the information. If
346
+ // parsing failed (because the input is invalid) returns false.
347
+ template<typename CHAR, typename UCHAR>
348
+ bool DoParseIPv6(const CHAR* spec,
349
+ const url_parse::Component& host,
350
+ IPv6Parsed* parsed) {
351
+ // Zero-out the info.
352
+ parsed->reset();
353
+
354
+ if (!host.is_nonempty())
355
+ return false;
356
+
357
+ // The index for start and end of address range (no brackets).
358
+ int begin = host.begin;
359
+ int end = host.end();
360
+
361
+ int cur_component_begin = begin; // Start of the current component.
362
+
363
+ // Scan through the input, searching for hex components, "::" contractions,
364
+ // and IPv4 components. Never read spec[end]: it lies outside |host|.
365
+ for (int i = begin; /* i <= end */; i++) {
366
+ bool is_colon = i < end && spec[i] == ':';
367
+ bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
368
+
369
+ // We reached the end of the current component if we encounter a colon
370
+ // (separator between hex components, or start of a contraction), or end of
371
+ // input.
372
+ if (is_colon || i == end) {
373
+ int component_len = i - cur_component_begin;
374
+
375
+ // A component should not have more than 4 hex digits.
376
+ if (component_len > 4)
377
+ return false;
378
+
379
+ // Don't allow empty components.
380
+ if (component_len == 0) {
381
+ // The exception is when contractions appear at beginning of the
382
+ // input or at the end of the input.
383
+ if (!((is_contraction && i == begin) || (i == end &&
384
+ parsed->index_of_contraction == parsed->num_hex_components)))
385
+ return false;
386
+ }
387
+
388
+ // Add the hex component we just found to running list.
389
+ if (component_len > 0) {
390
+ // Can't have more than 8 components!
391
+ if (parsed->num_hex_components >= 8)
392
+ return false;
393
+
394
+ parsed->hex_components[parsed->num_hex_components++] =
395
+ url_parse::Component(cur_component_begin, component_len);
396
+ }
397
+ }
398
+
399
+ if (i == end)
400
+ break; // Reached the end of the input, DONE.
401
+
402
+ // We found a "::" contraction.
403
+ if (is_contraction) {
404
+ // There can be at most one contraction in the literal.
405
+ if (parsed->index_of_contraction != -1)
406
+ return false;
407
+ parsed->index_of_contraction = parsed->num_hex_components;
408
+ ++i; // Consume the colon we peeked.
409
+ }
410
+
411
+ if (is_colon) {
412
+ // Colons are separators between components, keep track of where the
413
+ // current component started (after this colon).
414
+ cur_component_begin = i + 1;
415
+ } else {
416
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)
417
+ return false; // Not ASCII.
418
+
419
+ if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
420
+ // Regular components are hex numbers. It is also possible for
421
+ // a component to be an IPv4 address in dotted form.
422
+ if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
423
+ // Since IPv4 address can only appear at the end, assume the rest
424
+ // of the string is an IPv4 address. (We will parse this separately
425
+ // later).
426
+ parsed->ipv4_component = url_parse::Component(
427
+ cur_component_begin, end - cur_component_begin);
428
+ break;
429
+ } else {
430
+ // The character was neither a hex digit, nor an IPv4 character.
431
+ return false;
432
+ }
433
+ }
434
+ }
435
+ }
436
+
437
+ return true;
438
+ }
439
+
440
+ // Verifies the parsed IPv6 information, checking that the various components
441
+ // add up to the right number of bits (hex components are 16 bits, while
442
+ // embedded IPv4 formats are 32 bits, and contractions are placeholders for
443
+ // 16 or more bits). Returns true if sizes match up, false otherwise. On
444
+ // success writes the length of the contraction in bytes (0 if there is none)
445
+ // to |out_num_bytes_of_contraction|; on failure it is left untouched.
446
+ bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
447
+ int* out_num_bytes_of_contraction) {
448
+ // Each group of four hex digits contributes 16 bits.
449
+ int num_bytes_without_contraction = parsed.num_hex_components * 2;
450
+
451
+ // If an IPv4 address was embedded at the end, it contributes 32 bits.
452
+ if (parsed.ipv4_component.is_valid())
453
+ num_bytes_without_contraction += 4;
454
+
455
+ // If there was a "::" contraction, its size is going to be:
456
+ // MAX([16bits], [128bits] - num_bytes_without_contraction).
457
+ int num_bytes_of_contraction = 0;
458
+ if (parsed.index_of_contraction != -1) {
459
+ num_bytes_of_contraction = 16 - num_bytes_without_contraction;
460
+ if (num_bytes_of_contraction < 2)  // Clamping makes an over-long address fail the sum check below.
461
+ num_bytes_of_contraction = 2;
462
+ }
463
+
464
+ // Check that the numbers add up.
465
+ if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
466
+ return false;
467
+
468
+ *out_num_bytes_of_contraction = num_bytes_of_contraction;
469
+ return true;
470
+ }
471
+
472
+ // Converts a hex component into a number. This cannot fail since the caller has
473
+ // already verified that each character in the string was a hex digit, and
474
+ // that there were no more than 4 characters.
475
+ template<typename CHAR>
476
+ uint16 IPv6HexComponentToNumber(const CHAR* spec,
477
+ const url_parse::Component& component) {
478
+ DCHECK(component.len <= 4);
479
+
480
+ // Copy the hex string into a C-string.
481
+ char buf[5];  // Up to 4 hex digits plus the terminating '\0'.
482
+ for (int i = 0; i < component.len; ++i)
483
+ buf[i] = static_cast<char>(spec[component.begin + i]);
484
+ buf[component.len] = '\0';
485
+
486
+ // Convert it to a number (overflow is not possible, since with 4 hex
487
+ // characters we can at most have a 16 bit number).
488
+ return static_cast<uint16>(_strtoui64(buf, NULL, 16));
489
+ }
490
+
491
+ // Converts a bracketed IPv6 literal to a 128-bit number (network byte order),
492
+ // returning true on success. False means that the input was not a valid IPv6
493
+ template<typename CHAR, typename UCHAR>  // address; |address| may be partially written in that case.
494
+ bool DoIPv6AddressToNumber(const CHAR* spec,
495
+ const url_parse::Component& host,
496
+ unsigned char address[16]) {
497
+ // Make sure the component is bounded by '[' and ']'.
498
+ int end = host.end();
499
+ if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
500
+ return false;
501
+
502
+ // Exclude the square brackets.
503
+ url_parse::Component ipv6_comp(host.begin + 1, host.len - 2);
504
+
505
+ // Parse the IPv6 address -- identify where all the colon separated hex
506
+ // components are, the "::" contraction, and the embedded IPv4 address.
507
+ IPv6Parsed ipv6_parsed;
508
+ if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
509
+ return false;
510
+
511
+ // Do some basic size checks to make sure that the address doesn't
512
+ // specify more than 128 bits or fewer than 128 bits. This also resolves
513
+ // how many zero bytes the "::" contraction represents.
514
+ int num_bytes_of_contraction;
515
+ if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
516
+ return false;
517
+
518
+ int cur_index_in_address = 0;
519
+
520
+ // Loop through each hex component, and the contraction, in order. The loop
521
+ for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {  // runs one extra time so a trailing contraction is emitted.
522
+ // Append the contraction if it appears before this component.
523
+ if (i == ipv6_parsed.index_of_contraction) {
524
+ for (int j = 0; j < num_bytes_of_contraction; ++j)
525
+ address[cur_index_in_address++] = 0;
526
+ }
527
+ // Append the hex component's value.
528
+ if (i != ipv6_parsed.num_hex_components) {
529
+ // Get the 16-bit value for this hex component.
530
+ uint16 number = IPv6HexComponentToNumber<CHAR>(
531
+ spec, ipv6_parsed.hex_components[i]);
532
+ // Append to |address|, in network byte order.
533
+ address[cur_index_in_address++] = (number & 0xFF00) >> 8;
534
+ address[cur_index_in_address++] = (number & 0x00FF);
535
+ }
536
+ }
537
+
538
+ // If there was an IPv4 section, convert it into a 32-bit number and append
539
+ // it to |address|.
540
+ if (ipv6_parsed.ipv4_component.is_valid()) {
541
+ // We only allow the embedded IPv4 syntax to be used for "compat" and
542
+ // "mapped" formats:
543
+ // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal>
544
+ // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal>
545
+ for (int j = 0; j < 10; ++j) {
546
+ if (address[j] != 0)
547
+ return false;
548
+ }
549
+ if (!((address[10] == 0 && address[11] == 0) ||
550
+ (address[10] == 0xFF && address[11] == 0xFF)))
551
+ return false;
552
+
553
+ // Append the 32-bit number to |address|.
554
+ int ignored_num_ipv4_components;
555
+ if (CanonHostInfo::IPV4 !=
556
+ IPv4AddressToNumber(spec,
557
+ ipv6_parsed.ipv4_component,
558
+ &address[cur_index_in_address],
559
+ &ignored_num_ipv4_components))
560
+ return false;
561
+ }
562
+
563
+ return true;
564
+ }
565
+
566
+ // Searches for the longest sequence of zeros in |address|, and writes the
567
+ // range into |contraction_range|. The run of zeros must be longer than 16 bits
568
+ // (a single zero group is not chosen), and if there is a tie the first is chosen.
569
+ void ChooseIPv6ContractionRange(const unsigned char address[16],
570
+ url_parse::Component* contraction_range) {
571
+ // The longest run of zeros in |address| seen so far.
572
+ url_parse::Component max_range;
573
+
574
+ // The current run of zeros in |address| being iterated over.
575
+ url_parse::Component cur_range;
576
+
577
+ for (int i = 0; i < 16; i += 2) {
578
+ // Test for 16 bits worth of zero.
579
+ bool is_zero = (address[i] == 0 && address[i + 1] == 0);
580
+
581
+ if (is_zero) {
582
+ // Add the zero to the current range (or start a new one).
583
+ if (!cur_range.is_valid())
584
+ cur_range = url_parse::Component(i, 0);
585
+ cur_range.len += 2;
586
+ }
587
+
588
+ if (!is_zero || i == 14) {  // i == 14 closes a run that reaches the last group.
589
+ // Just completed a run of zeros. If the run is greater than 16 bits,
590
+ // it is a candidate for the contraction.
591
+ if (cur_range.len > 2 && cur_range.len > max_range.len) {  // Strict '>' keeps the first of equal runs.
592
+ max_range = cur_range;
593
+ }
594
+ cur_range.reset();
595
+ }
596
+ }
597
+ *contraction_range = max_range;  // Still default-constructed if no run qualified.
598
+ }
599
+
600
+ // Return true if we've made a final IPV6/BROKEN decision, false if the result
601
+ // is NEUTRAL, and we could use a second opinion.
602
+ template<typename CHAR, typename UCHAR>
603
+ bool DoCanonicalizeIPv6Address(const CHAR* spec,
604
+ const url_parse::Component& host,
605
+ CanonOutput* output,
606
+ CanonHostInfo* host_info) {
607
+ // Turn the IP address into a 128 bit number.
608
+ unsigned char address[16];
609
+ if (!IPv6AddressToNumber(spec, host, address)) {
610
+ // If it's not an IPv6 address, scan for characters that should *only*
611
+ // exist in an IPv6 address.
612
+ for (int i = host.begin; i < host.end(); i++) {
613
+ switch (spec[i]) {
614
+ case '[':
615
+ case ']':
616
+ case ':':
617
+ host_info->family = CanonHostInfo::BROKEN;
618
+ return true;
619
+ }
620
+ }
621
+
622
+ // No invalid characters. Could still be IPv4 or a hostname.
623
+ host_info->family = CanonHostInfo::NEUTRAL;
624
+ return false;
625
+ }
626
+
627
+ host_info->out_host.begin = output->length();
628
+ output->push_back('[');
629
+
630
+ // We will now output the address according to the rules in:
631
+ // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
632
+
633
+ // Start by finding where to place the "::" contraction (if any).
634
+ url_parse::Component contraction_range;
635
+ ChooseIPv6ContractionRange(address, &contraction_range);
636
+
637
+ for (int i = 0; i <= 14;) {
638
+ // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive.
639
+ DCHECK(i % 2 == 0);
640
+ if (i == contraction_range.begin && contraction_range.len > 0) {
641
+ // Jump over the contraction.
642
+ if (i == 0)
643
+ output->push_back(':');
644
+ output->push_back(':');
645
+ i = contraction_range.end();
646
+ } else {
647
+ // Consume the next 16 bits from |address|.
648
+ int x = address[i] << 8 | address[i + 1];
649
+
650
+ i += 2;
651
+
652
+ // Stringify the 16 bit number (at most requires 4 hex digits).
653
+ char str[5];
654
+ _itoa_s(x, str, 16);
655
+ for (int ch = 0; str[ch] != 0; ++ch)
656
+ output->push_back(str[ch]);
657
+
658
+ // Put a colon after each number, except the last.
659
+ if (i < 16)
660
+ output->push_back(':');
661
+ }
662
+ }
663
+
664
+ output->push_back(']');
665
+ host_info->out_host.len = output->length() - host_info->out_host.begin;
666
+
667
+ host_info->family = CanonHostInfo::IPV6;
668
+ return true;
669
+ }
670
+
671
+ } // namespace
672
+
673
+ bool FindIPv4Components(const char* spec,
674
+ const url_parse::Component& host,
675
+ url_parse::Component components[4]) {
676
+ return DoFindIPv4Components<char, unsigned char>(spec, host, components);
677
+ }
678
+
679
+ bool FindIPv4Components(const char16* spec,
680
+ const url_parse::Component& host,
681
+ url_parse::Component components[4]) {
682
+ return DoFindIPv4Components<char16, char16>(spec, host, components);
683
+ }
684
+
685
+ void CanonicalizeIPAddress(const char* spec,
686
+ const url_parse::Component& host,
687
+ CanonOutput* output,
688
+ CanonHostInfo* host_info) {
689
+ if (DoCanonicalizeIPv4Address<char, unsigned char>(
690
+ spec, host, output, host_info))
691
+ return;
692
+ if (DoCanonicalizeIPv6Address<char, unsigned char>(
693
+ spec, host, output, host_info))
694
+ return;
695
+ }
696
+
697
+ void CanonicalizeIPAddress(const char16* spec,
698
+ const url_parse::Component& host,
699
+ CanonOutput* output,
700
+ CanonHostInfo* host_info) {
701
+ if (DoCanonicalizeIPv4Address<char16, char16>(
702
+ spec, host, output, host_info))
703
+ return;
704
+ if (DoCanonicalizeIPv6Address<char16, char16>(
705
+ spec, host, output, host_info))
706
+ return;
707
+ }
708
+
709
+ CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
710
+ const url_parse::Component& host,
711
+ unsigned char address[4],
712
+ int* num_ipv4_components) {
713
+ return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components);
714
+ }
715
+
716
+ CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
717
+ const url_parse::Component& host,
718
+ unsigned char address[4],
719
+ int* num_ipv4_components) {
720
+ return DoIPv4AddressToNumber<char16>(
721
+ spec, host, address, num_ipv4_components);
722
+ }
723
+
724
+ bool IPv6AddressToNumber(const char* spec,
725
+ const url_parse::Component& host,
726
+ unsigned char address[16]) {
727
+ return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
728
+ }
729
+
730
+ bool IPv6AddressToNumber(const char16* spec,
731
+ const url_parse::Component& host,
732
+ unsigned char address[16]) {
733
+ return DoIPv6AddressToNumber<char16, char16>(spec, host, address);
734
+ }
735
+
736
+
737
+ } // namespace url_canon