fast_json-schema 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Dockerfile +17 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +68 -0
- data/LICENSE.txt +21 -0
- data/README.md +156 -0
- data/Rakefile +60 -0
- data/build-deps +3 -0
- data/data/invalid.json +31 -0
- data/data/schema.json +150 -0
- data/data/valid.json +49 -0
- data/ext/fast_json/schema/all_of.c +23 -0
- data/ext/fast_json/schema/all_of.h +4 -0
- data/ext/fast_json/schema/any_of.c +22 -0
- data/ext/fast_json/schema/any_of.h +4 -0
- data/ext/fast_json/schema/compiled_schema.c +503 -0
- data/ext/fast_json/schema/compiled_schema.h +10 -0
- data/ext/fast_json/schema/context.c +78 -0
- data/ext/fast_json/schema/error.c +26 -0
- data/ext/fast_json/schema/error.h +5 -0
- data/ext/fast_json/schema/extconf.rb +7 -0
- data/ext/fast_json/schema/formats/custom_format.c +63 -0
- data/ext/fast_json/schema/formats/custom_format.h +4 -0
- data/ext/fast_json/schema/formats/date.c +48 -0
- data/ext/fast_json/schema/formats/date.h +5 -0
- data/ext/fast_json/schema/formats/date_time.c +22 -0
- data/ext/fast_json/schema/formats/date_time.h +4 -0
- data/ext/fast_json/schema/formats/email.c +8 -0
- data/ext/fast_json/schema/formats/email.h +4 -0
- data/ext/fast_json/schema/formats/format.c +68 -0
- data/ext/fast_json/schema/formats/format.h +4 -0
- data/ext/fast_json/schema/formats/hostname.c +8 -0
- data/ext/fast_json/schema/formats/hostname.h +4 -0
- data/ext/fast_json/schema/formats/idn_email.c +8 -0
- data/ext/fast_json/schema/formats/idn_email.h +4 -0
- data/ext/fast_json/schema/formats/idn_hostname.c +8 -0
- data/ext/fast_json/schema/formats/idn_hostname.h +4 -0
- data/ext/fast_json/schema/formats/ipv4.c +8 -0
- data/ext/fast_json/schema/formats/ipv4.h +4 -0
- data/ext/fast_json/schema/formats/ipv6.c +8 -0
- data/ext/fast_json/schema/formats/ipv6.h +4 -0
- data/ext/fast_json/schema/formats/iri.c +8 -0
- data/ext/fast_json/schema/formats/iri.h +4 -0
- data/ext/fast_json/schema/formats/iri_reference.c +8 -0
- data/ext/fast_json/schema/formats/iri_reference.h +4 -0
- data/ext/fast_json/schema/formats/json_pointer.c +8 -0
- data/ext/fast_json/schema/formats/json_pointer.h +4 -0
- data/ext/fast_json/schema/formats/regex.c +27 -0
- data/ext/fast_json/schema/formats/regex.h +4 -0
- data/ext/fast_json/schema/formats/relative_json_pointer.c +57 -0
- data/ext/fast_json/schema/formats/relative_json_pointer.h +4 -0
- data/ext/fast_json/schema/formats/time.c +65 -0
- data/ext/fast_json/schema/formats/time.h +5 -0
- data/ext/fast_json/schema/formats/uri.c +8 -0
- data/ext/fast_json/schema/formats/uri.h +4 -0
- data/ext/fast_json/schema/formats/uri_reference.c +8 -0
- data/ext/fast_json/schema/formats/uri_reference.h +4 -0
- data/ext/fast_json/schema/formats/uri_template.c +8 -0
- data/ext/fast_json/schema/formats/uri_template.h +4 -0
- data/ext/fast_json/schema/formats/utils/addr_spec_parser.c +342 -0
- data/ext/fast_json/schema/formats/utils/addr_spec_parser.h +16 -0
- data/ext/fast_json/schema/formats/utils/hostname_parser.c +113 -0
- data/ext/fast_json/schema/formats/utils/hostname_parser.h +17 -0
- data/ext/fast_json/schema/formats/utils/ip_parser.c +126 -0
- data/ext/fast_json/schema/formats/utils/ip_parser.h +25 -0
- data/ext/fast_json/schema/formats/utils/json_pointer_parser.c +45 -0
- data/ext/fast_json/schema/formats/utils/json_pointer_parser.h +20 -0
- data/ext/fast_json/schema/formats/utils/uri_parser.c +605 -0
- data/ext/fast_json/schema/formats/utils/uri_parser.h +20 -0
- data/ext/fast_json/schema/formats/utils/uri_template_parser.c +235 -0
- data/ext/fast_json/schema/formats/utils/uri_template_parser.h +18 -0
- data/ext/fast_json/schema/formats/utils/utf8.c +73 -0
- data/ext/fast_json/schema/formats/utils/utf8.h +17 -0
- data/ext/fast_json/schema/if.c +31 -0
- data/ext/fast_json/schema/if.h +4 -0
- data/ext/fast_json/schema/is_valid.c +124 -0
- data/ext/fast_json/schema/is_valid.h +6 -0
- data/ext/fast_json/schema/keywords.c +220 -0
- data/ext/fast_json/schema/keywords.h +60 -0
- data/ext/fast_json/schema/nested_schemas.c +68 -0
- data/ext/fast_json/schema/nested_schemas.h +4 -0
- data/ext/fast_json/schema/not.c +11 -0
- data/ext/fast_json/schema/not.h +4 -0
- data/ext/fast_json/schema/one_of.c +23 -0
- data/ext/fast_json/schema/one_of.h +4 -0
- data/ext/fast_json/schema/path.c +44 -0
- data/ext/fast_json/schema/path.h +5 -0
- data/ext/fast_json/schema/properties_val.c +103 -0
- data/ext/fast_json/schema/properties_val.h +6 -0
- data/ext/fast_json/schema/ref.c +7 -0
- data/ext/fast_json/schema/ref.h +4 -0
- data/ext/fast_json/schema/ref_resolver.c +85 -0
- data/ext/fast_json/schema/ref_resolver.h +5 -0
- data/ext/fast_json/schema/schema.c +68 -0
- data/ext/fast_json/schema/schema_collection.c +29 -0
- data/ext/fast_json/schema/schema_collection.h +3 -0
- data/ext/fast_json/schema/types/compiled_schema.h +96 -0
- data/ext/fast_json/schema/types/context.h +27 -0
- data/ext/fast_json/schema/validate.c +63 -0
- data/ext/fast_json/schema/validate.h +19 -0
- data/ext/fast_json/schema/validate_array.c +130 -0
- data/ext/fast_json/schema/validate_array.h +4 -0
- data/ext/fast_json/schema/validate_bool.c +7 -0
- data/ext/fast_json/schema/validate_bool.h +4 -0
- data/ext/fast_json/schema/validate_integer.c +52 -0
- data/ext/fast_json/schema/validate_integer.h +4 -0
- data/ext/fast_json/schema/validate_null.c +7 -0
- data/ext/fast_json/schema/validate_null.h +4 -0
- data/ext/fast_json/schema/validate_number.c +62 -0
- data/ext/fast_json/schema/validate_number.h +4 -0
- data/ext/fast_json/schema/validate_object.c +159 -0
- data/ext/fast_json/schema/validate_object.h +4 -0
- data/ext/fast_json/schema/validate_string.c +32 -0
- data/ext/fast_json/schema/validate_string.h +4 -0
- data/ext/fast_json/schema/value_pointer_caster.h +9 -0
- data/fast_json-schema.gemspec +31 -0
- data/lib/fast_json/schema/error.rb +16 -0
- data/lib/fast_json/schema/version.rb +7 -0
- data/lib/fast_json/schema.rb +50 -0
- data/makefile +10 -0
- metadata +164 -0
|
@@ -0,0 +1,605 @@
|
|
|
1
|
+
#include "formats/utils/uri_parser.h"
|
|
2
|
+
#include "formats/utils/ip_parser.h"
|
|
3
|
+
#include "formats/utils/utf8.h"
|
|
4
|
+
|
|
5
|
+
#include <stdbool.h>
|
|
6
|
+
|
|
7
|
+
#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
|
|
8
|
+
#define IS_ALPHA(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
|
|
9
|
+
#define IS_HEX(c) (IS_DIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f'))
|
|
10
|
+
|
|
11
|
+
/*
 * RFC 3986 section 2.3 "unreserved" set:
 *   ALPHA / DIGIT / "-" / "." / "_" / "~"
 *
 * Returns true when c is a member of that set, false otherwise.
 */
static bool is_unreserved(unsigned char c) {
  return IS_ALPHA(c) || IS_DIGIT(c) ||
         c == '-' || c == '.' || c == '_' || c == '~';
}
|
|
25
|
+
|
|
26
|
+
/*
 * RFC 3986 section 2.2 "sub-delims" set:
 *   "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
 *
 * Returns true when c is one of those eleven characters.
 */
static bool is_sub_delim(unsigned char c) {
  return c == '!' || c == '$' || c == '&' || c == '\'' ||
         c == '(' || c == ')' || c == '*' || c == '+'  ||
         c == ',' || c == ';' || c == '=';
}
|
|
39
|
+
|
|
40
|
+
/*
 * Characters permitted after the first letter of a scheme
 * (RFC 3986 section 3.1):
 *   ALPHA / DIGIT / "+" / "-" / "."
 */
static bool is_scheme_char(unsigned char c) {
  switch (c) {
    case '+':
    case '-':
    case '.':
      return true;
    default:
      return IS_ALPHA(c) || IS_DIGIT(c);
  }
}
|
|
47
|
+
|
|
48
|
+
/*
 * Match one pct-encoded triplet (RFC 3986 section 2.1): "%" HEXDIG HEXDIG.
 *
 * s   - start of the candidate triplet
 * len - number of bytes available at s
 *
 * Returns 3 when the first three bytes form a triplet, 0 otherwise
 * (including when fewer than three bytes are available).
 */
static long consume_pct_encoded(const char *s, long len) {
  /* The length test comes first so s[1] / s[2] are never read out of range. */
  bool well_formed = len >= 3
    && s[0] == '%'
    && IS_HEX((unsigned char)s[1])
    && IS_HEX((unsigned char)s[2]);

  return well_formed ? 3 : 0;
}
|
|
58
|
+
|
|
59
|
+
/*
 * Match one UTF-8 multi-byte sequence at s, but only when allow_utf8 is true.
 *
 * s          - start of the candidate sequence
 * len        - number of bytes available at s
 * allow_utf8 - when false, non-ASCII input is never accepted
 *
 * Returns the sequence length as reported by utf8_seq_len() when that is at
 * least 2. Returns 0 when allow_utf8 is false, when no bytes are available,
 * when the first byte is ASCII (< 0x80), or when utf8_seq_len() reports
 * fewer than 2 bytes.
 */
static long consume_utf8(const char *s, long len, bool allow_utf8) {
  /* Guard the s[0] read below: an empty window can never hold a sequence. */
  if (!allow_utf8 || len <= 0) return 0;

  /* ASCII bytes are handled by the callers' single-byte rules. */
  if ((unsigned char)s[0] < 0x80) return 0;

  long seq_len = utf8_seq_len((const unsigned char *)s, len);
  return seq_len >= 2 ? seq_len : 0;
}
|
|
73
|
+
|
|
74
|
+
/*
 * Match one pchar (RFC 3986 section 3.3):
 *   unreserved / pct-encoded / sub-delims / ":" / "@"
 * With allow_utf8 set, a UTF-8 multi-byte sequence is accepted as well.
 *
 * s          - start of the candidate character
 * len        - number of bytes available at s
 * allow_utf8 - whether non-ASCII UTF-8 sequences count as pchar
 *
 * Returns the number of bytes consumed (1, 3 for a pct-encoded triplet, or
 * the UTF-8 sequence length), or 0 when nothing matches.
 */
static long consume_pchar(const char *s, long len, bool allow_utf8) {
  /* Nothing to match in an empty window; also protects the s[0] read. */
  if (len <= 0) return 0;

  unsigned char c = (unsigned char)s[0];

  /* Non-ASCII lead byte: only a UTF-8 sequence can match. */
  if (c >= 0x80) return consume_utf8(s, len, allow_utf8);

  if (c == '%') return consume_pct_encoded(s, len);

  if (is_unreserved(c) || is_sub_delim(c) || c == ':' || c == '@') return 1;

  return 0;
}
|
|
93
|
+
|
|
94
|
+
/*
 * Match one character of segment-nz-nc (RFC 3986 section 3.3):
 *   unreserved / pct-encoded / sub-delims / "@"
 * i.e. a pchar that is not ":".
 *
 * s          - start of the candidate character
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to consume_pchar()
 *
 * Returns the number of bytes consumed, or 0 when nothing matches.
 */
static long consume_pchar_nc(const char *s, long len, bool allow_utf8) {
  /* Empty window: nothing to match, and s[0] must not be read. */
  if (len <= 0) return 0;

  /* The colon is the single pchar excluded from this rule. */
  if (s[0] == ':') return 0;

  return consume_pchar(s, len, allow_utf8);
}
|
|
106
|
+
|
|
107
|
+
/*
 * Match a scheme (RFC 3986 section 3.1):
 *   ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 *
 * s   - start of the input
 * len - number of bytes available at s
 *
 * Returns the length of the scheme, or 0 when the input is empty or does
 * not start with a letter. The trailing ":" is not consumed.
 */
static long parse_scheme(const char *s, long len) {
  if (len == 0 || !IS_ALPHA((unsigned char)s[0])) {
    return 0;
  }

  long n = 1;
  for (; n < len; n++) {
    if (!is_scheme_char((unsigned char)s[n])) break;
  }

  return n;
}
|
|
121
|
+
|
|
122
|
+
/*
 * Match an IPvFuture address (RFC 3986 section 3.2.2):
 *   "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
 * The leading "v" is accepted in either case.
 *
 * s   - start of the input (without the surrounding brackets)
 * len - number of bytes available at s
 *
 * Returns the number of bytes matched, or 0 on failure. The match stops at
 * the first byte outside the tail set; callers compare the result with len.
 */
static long parse_ipv_future(const char *s, long len) {
  /* Shortest possible form is "v1.x" - four bytes. */
  if (len < 4 || (s[0] != 'v' && s[0] != 'V')) return 0;

  /* Version: at least one hex digit. */
  long i = 1;
  while (i < len && IS_HEX((unsigned char)s[i])) i++;
  if (i == 1) return 0;

  /* Mandatory "." separator. */
  if (i >= len || s[i] != '.') return 0;
  i++;

  /* Tail: at least one allowed character. */
  long first_tail = i;
  for (; i < len; i++) {
    unsigned char ch = (unsigned char)s[i];
    if (!(is_unreserved(ch) || is_sub_delim(ch) || ch == ':')) break;
  }

  return i > first_tail ? i : 0;
}
|
|
154
|
+
|
|
155
|
+
/*
 * Match an IP-literal (RFC 3986 section 3.2.2):
 *   "[" ( IPv6address / IPvFuture ) "]"
 *
 * s   - start of the input; must begin with "["
 * len - number of bytes available at s
 *
 * Returns the number of bytes consumed, brackets included, or 0 on failure.
 * The bracket content must be matched in full by parse_ipv_future() (when it
 * starts with "v"/"V") or by parse_ipv6() (otherwise).
 */
static long parse_ip_literal(const char *s, long len) {
  if (len < 2 || s[0] != '[') return 0;

  /* Locate the first closing bracket. */
  long close = 1;
  while (close < len && s[close] != ']') close++;
  if (close == len) return 0;

  const char *inner = s + 1;
  long inner_len = close - 1;
  long total = close + 1;

  bool looks_future = inner_len >= 1 && (inner[0] == 'v' || inner[0] == 'V');
  if (looks_future) {
    return parse_ipv_future(inner, inner_len) == inner_len ? total : 0;
  }

  return parse_ipv6(inner, inner_len) ? total : 0;
}
|
|
180
|
+
|
|
181
|
+
/*
 * Match a reg-name (RFC 3986 section 3.2.2):
 *   *( unreserved / pct-encoded / sub-delims )
 * With allow_utf8 set, UTF-8 multi-byte sequences are accepted as well.
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - whether non-ASCII UTF-8 sequences are accepted
 *
 * Returns the number of bytes matched. Zero is a valid result: an empty
 * reg-name is allowed. Matching stops at the first byte that does not fit.
 */
static long parse_reg_name(const char *s, long len, bool allow_utf8) {
  long pos = 0;

  while (pos < len) {
    unsigned char c = (unsigned char)s[pos];
    long step;

    if (c >= 0x80) {
      step = consume_utf8(s + pos, len - pos, allow_utf8);
    } else if (c == '%') {
      step = consume_pct_encoded(s + pos, len - pos);
    } else {
      step = (is_unreserved(c) || is_sub_delim(c)) ? 1 : 0;
    }

    if (step == 0) break;
    pos += step;
  }

  return pos;
}
|
|
213
|
+
|
|
214
|
+
/*
 * Match a host (RFC 3986 section 3.2.2): IP-literal / IPv4address / reg-name.
 *
 * s          - start of the host
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to parse_reg_name()
 *
 * Returns the number of bytes matched. Zero is returned for an empty host
 * (which is valid) and for a malformed IP-literal; callers compare the
 * result with the expected host length.
 */
static long parse_host(const char *s, long len, bool allow_utf8) {
  if (len == 0) return 0;

  if (s[0] == '[') return parse_ip_literal(s, len);

  /* Length of the leading run made only of digits and dots. */
  long run = 0;
  while (run < len && (IS_DIGIT((unsigned char)s[run]) || s[run] == '.')) {
    run++;
  }

  /*
   * RFC 3986 resolves the IPv4address / reg-name ambiguity by testing the
   * whole host against IPv4address first. Only treat it as IPv4 when the
   * digit/dot run covers the entire host: a host such as "1.2.3.4abc" is
   * not an IPv4 address but is still a valid reg-name, so it must fall
   * through instead of being reported as a 7-byte partial match.
   */
  if (run == len && parse_ipv4(s, len)) return len;

  return parse_reg_name(s, len, allow_utf8);
}
|
|
248
|
+
|
|
249
|
+
/*
 * Match a port (RFC 3986 section 3.2.3): *DIGIT.
 *
 * s   - start of the input
 * len - number of bytes available at s
 *
 * Returns the count of leading digits; zero is valid (empty port).
 */
static long parse_port(const char *s, long len) {
  long digits = 0;

  for (; digits < len; digits++) {
    if (!IS_DIGIT((unsigned char)s[digits])) break;
  }

  return digits;
}
|
|
257
|
+
|
|
258
|
+
/*
 * Match userinfo (RFC 3986 section 3.2.1):
 *   *( unreserved / pct-encoded / sub-delims / ":" )
 * With allow_utf8 set, UTF-8 multi-byte sequences are accepted as well.
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - whether non-ASCII UTF-8 sequences are accepted
 *
 * Returns the number of bytes matched (possibly zero). Matching stops at
 * the first byte that does not fit.
 */
static long parse_userinfo(const char *s, long len, bool allow_utf8) {
  long pos = 0;

  while (pos < len) {
    unsigned char c = (unsigned char)s[pos];
    long step;

    if (c >= 0x80) {
      step = consume_utf8(s + pos, len - pos, allow_utf8);
    } else if (c == '%') {
      step = consume_pct_encoded(s + pos, len - pos);
    } else {
      step = (is_unreserved(c) || is_sub_delim(c) || c == ':') ? 1 : 0;
    }

    if (step == 0) break;
    pos += step;
  }

  return pos;
}
|
|
289
|
+
|
|
290
|
+
/*
 * Match an authority (RFC 3986 section 3.2):
 *   [ userinfo "@" ] host [ ":" port ]
 *
 * s          - start of the authority (just after the leading "//")
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the userinfo and host parsers
 *
 * The authority ends at the first "/", "?" or "#", or at len. Returns the
 * authority length when every byte up to that boundary is accounted for,
 * and 0 otherwise. An empty authority is valid and also yields 0, so
 * callers compare the result with the length they expect.
 */
static long parse_authority(const char *s, long len, bool allow_utf8) {
  /* Boundary of the authority component. */
  long auth_end = 0;
  while (auth_end < len &&
         s[auth_end] != '/' && s[auth_end] != '?' && s[auth_end] != '#') {
    auth_end++;
  }

  /* Optional userinfo: everything before the first "@" inside the boundary. */
  long pos = 0;
  for (long i = 0; i < auth_end; i++) {
    if (s[i] != '@') continue;
    if (parse_userinfo(s, i, allow_utf8) != i) return 0;
    pos = i + 1;
    break;
  }

  /* Host: a bracketed IP-literal, or the bytes up to ":" / the boundary. */
  long host_end;
  if (pos < auth_end && s[pos] == '[') {
    long lit = parse_ip_literal(s + pos, auth_end - pos);
    if (lit == 0) return 0;
    host_end = pos + lit;
  } else {
    host_end = pos;
    while (host_end < auth_end && s[host_end] != ':') host_end++;

    long expected = host_end - pos;
    if (parse_host(s + pos, expected, allow_utf8) != expected) return 0;
  }
  pos = host_end;

  /* Optional ":" port; the digits must run up to the boundary. */
  if (pos < auth_end && s[pos] == ':') {
    pos++;
    pos += parse_port(s + pos, auth_end - pos);
  }

  /* Anything left over (bad port, junk after "]") makes the authority invalid. */
  return pos == auth_end ? pos : 0;
}
|
|
347
|
+
|
|
348
|
+
/*
 * Match a segment (RFC 3986 section 3.3): *pchar.
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to consume_pchar()
 *
 * Returns the number of bytes matched (possibly zero).
 */
static long parse_segment(const char *s, long len, bool allow_utf8) {
  long consumed = 0;

  while (consumed < len) {
    long step = consume_pchar(s + consumed, len - consumed, allow_utf8);
    if (step == 0) {
      return consumed;
    }
    consumed += step;
  }

  return consumed;
}
|
|
360
|
+
|
|
361
|
+
/*
 * Match a segment-nz (RFC 3986 section 3.3): 1*pchar.
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to parse_segment()
 *
 * Returns the number of bytes matched, or 0 when no pchar is present.
 */
static long parse_segment_nz(const char *s, long len, bool allow_utf8) {
  /*
   * parse_segment() counts upward from zero, so a result of 0 already
   * means "no character matched" and can be returned as the failure value.
   */
  return parse_segment(s, len, allow_utf8);
}
|
|
368
|
+
|
|
369
|
+
/*
 * Match a segment-nz-nc (RFC 3986 section 3.3):
 *   1*( unreserved / pct-encoded / sub-delims / "@" )
 * i.e. a non-empty segment that contains no ":".
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to consume_pchar_nc()
 *
 * Returns the number of bytes matched, or 0 when the first byte does not fit.
 */
static long parse_segment_nz_nc(const char *s, long len, bool allow_utf8) {
  long consumed = 0;

  while (consumed < len) {
    long step = consume_pchar_nc(s + consumed, len - consumed, allow_utf8);
    if (step == 0) {
      break;
    }
    consumed += step;
  }

  /* consumed is still 0 when nothing matched, which is the failure value. */
  return consumed;
}
|
|
382
|
+
|
|
383
|
+
/*
 * Match a path-abempty (RFC 3986 section 3.3): *( "/" segment ).
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to parse_segment()
 *
 * Returns the number of bytes matched; zero is valid (empty path).
 */
static long parse_path_abempty(const char *s, long len, bool allow_utf8) {
  long pos = 0;

  while (pos < len) {
    if (s[pos] != '/') break;

    /* One byte for the slash, plus whatever segment follows it. */
    long after_slash = pos + 1;
    pos = after_slash + parse_segment(s + after_slash, len - after_slash, allow_utf8);
  }

  return pos;
}
|
|
396
|
+
|
|
397
|
+
/*
 * Match a path-absolute (RFC 3986 section 3.3):
 *   "/" [ segment-nz *( "/" segment ) ]
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the segment parsers
 *
 * Returns the number of bytes matched, or 0 when the input does not start
 * with "/". A lone "/" matches with length 1.
 */
static long parse_path_absolute(const char *s, long len, bool allow_utf8) {
  if (len < 1 || s[0] != '/') return 0;

  long first = parse_segment_nz(s + 1, len - 1, allow_utf8);
  if (first == 0) {
    /* Only the leading slash matched. */
    return 1;
  }

  /* The remaining *( "/" segment ) tail is exactly a path-abempty. */
  long pos = 1 + first;
  return pos + parse_path_abempty(s + pos, len - pos, allow_utf8);
}
|
|
415
|
+
|
|
416
|
+
/*
 * Match a path-rootless (RFC 3986 section 3.3):
 *   segment-nz *( "/" segment )
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the segment parsers
 *
 * Returns the number of bytes matched, or 0 when the first segment is empty.
 */
static long parse_path_rootless(const char *s, long len, bool allow_utf8) {
  long first = parse_segment_nz(s, len, allow_utf8);
  if (first == 0) return 0;

  /* The remaining *( "/" segment ) tail is exactly a path-abempty. */
  return first + parse_path_abempty(s + first, len - first, allow_utf8);
}
|
|
431
|
+
|
|
432
|
+
/*
 * Match a path-noscheme (RFC 3986 section 4.2):
 *   segment-nz-nc *( "/" segment )
 * The first segment may not contain ":"; later segments may.
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the segment parsers
 *
 * Returns the number of bytes matched, or 0 when the first segment is empty.
 */
static long parse_path_noscheme(const char *s, long len, bool allow_utf8) {
  long first = parse_segment_nz_nc(s, len, allow_utf8);
  if (first == 0) return 0;

  /* The remaining *( "/" segment ) tail is exactly a path-abempty. */
  return first + parse_path_abempty(s + first, len - first, allow_utf8);
}
|
|
447
|
+
|
|
448
|
+
/*
 * Match a query or a fragment (RFC 3986 sections 3.4 and 3.5); both share
 * the same rule:
 *   *( pchar / "/" / "?" )
 *
 * s          - start of the input (after the leading "?" or "#")
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to consume_pchar()
 *
 * Returns the number of bytes matched (possibly zero).
 */
static long parse_query_or_fragment(const char *s, long len, bool allow_utf8) {
  long pos = 0;

  while (pos < len) {
    long step;

    if (s[pos] == '/' || s[pos] == '?') {
      step = 1;
    } else {
      step = consume_pchar(s + pos, len - pos, allow_utf8);
    }

    if (step == 0) break;
    pos += step;
  }

  return pos;
}
|
|
466
|
+
|
|
467
|
+
/*
 * Match a hier-part (is_relative == false) or a relative-part
 * (is_relative == true), covering everything up to the first "?" or "#",
 * or the end of input.
 *
 * Which rule applies:
 *   - leading "//"        : authority followed by path-abempty
 *   - leading single "/"  : path-absolute
 *   - nothing before "?"/"#" or end of input : path-empty
 *   - otherwise           : path-noscheme when is_relative,
 *                           path-rootless when not
 *
 * s           - start of the input
 * len         - number of bytes available at s
 * allow_utf8  - forwarded to the component parsers
 * is_relative - selects relative-part instead of hier-part
 *
 * Returns the number of bytes consumed (the offset of "?", "#" or the end),
 * or -1 when the selected rule does not cover that whole span.
 */
static long parse_hier_or_relative_part(const char *s, long len, bool allow_utf8, bool is_relative) {
  if (len == 0) return 0; /* path-empty */

  /* The path component stops at the query or fragment delimiter. */
  long path_end = 0;
  while (path_end < len && s[path_end] != '?' && s[path_end] != '#') {
    path_end++;
  }

  if (path_end >= 2 && s[0] == '/' && s[1] == '/') {
    /* The authority runs from just after "//" to the next "/" (or path_end). */
    long auth_end = 2;
    while (auth_end < path_end && s[auth_end] != '/') auth_end++;

    long auth_len = auth_end - 2;
    if (parse_authority(s + 2, auth_len, allow_utf8) != auth_len) return -1;

    long consumed = auth_end +
      parse_path_abempty(s + auth_end, path_end - auth_end, allow_utf8);
    return consumed == path_end ? consumed : -1;
  }

  if (path_end == 0) return 0; /* path-empty, directly followed by "?" or "#" */

  long matched;
  if (s[0] == '/') {
    matched = parse_path_absolute(s, path_end, allow_utf8);
  } else if (is_relative) {
    matched = parse_path_noscheme(s, path_end, allow_utf8);
  } else {
    matched = parse_path_rootless(s, path_end, allow_utf8);
  }

  return matched == path_end ? matched : -1;
}
|
|
523
|
+
|
|
524
|
+
/*
 * Try to match the whole input as a URI:
 *   scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the component parsers
 *
 * Returns true only when every byte of the input is consumed.
 */
static bool try_parse_full_uri(const char *s, long len, bool allow_utf8) {
  long scheme_len = parse_scheme(s, len);
  if (scheme_len == 0 || scheme_len >= len || s[scheme_len] != ':') {
    return false;
  }

  /* hier-part: consumed up to the first "?" / "#" or the end of input. */
  long cursor = scheme_len + 1;
  long hier = parse_hier_or_relative_part(s + cursor, len - cursor, allow_utf8, false);
  if (hier < 0) return false;
  cursor += hier;

  /* Optional query: from after "?" to the next "#" or the end. */
  if (cursor < len && s[cursor] == '?') {
    long q_start = cursor + 1;
    long q_stop = q_start;
    while (q_stop < len && s[q_stop] != '#') q_stop++;

    long q_span = q_stop - q_start;
    if (parse_query_or_fragment(s + q_start, q_span, allow_utf8) != q_span) {
      return false;
    }
    cursor = q_stop;
  }

  /* Optional fragment: from after "#" to the end. */
  if (cursor < len && s[cursor] == '#') {
    long f_start = cursor + 1;
    long f_span = len - f_start;
    if (parse_query_or_fragment(s + f_start, f_span, allow_utf8) != f_span) {
      return false;
    }
    cursor = len;
  }

  return cursor == len;
}
|
|
565
|
+
|
|
566
|
+
/*
 * Try to match the whole input as a relative-ref:
 *   relative-part [ "?" query ] [ "#" fragment ]
 *
 * s          - start of the input
 * len        - number of bytes available at s
 * allow_utf8 - forwarded to the component parsers
 *
 * Returns true only when every byte of the input is consumed.
 */
static bool try_parse_relative_ref(const char *s, long len, bool allow_utf8) {
  /* relative-part: consumed up to the first "?" / "#" or the end of input. */
  long cursor = parse_hier_or_relative_part(s, len, allow_utf8, true);
  if (cursor < 0) return false;

  /* Optional query: from after "?" to the next "#" or the end. */
  if (cursor < len && s[cursor] == '?') {
    long q_start = cursor + 1;
    long q_stop = q_start;
    while (q_stop < len && s[q_stop] != '#') q_stop++;

    long q_span = q_stop - q_start;
    if (parse_query_or_fragment(s + q_start, q_span, allow_utf8) != q_span) {
      return false;
    }
    cursor = q_stop;
  }

  /* Optional fragment: from after "#" to the end. */
  if (cursor < len && s[cursor] == '#') {
    long f_start = cursor + 1;
    long f_span = len - f_start;
    if (parse_query_or_fragment(s + f_start, f_span, allow_utf8) != f_span) {
      return false;
    }
    cursor = len;
  }

  return cursor == len;
}
|
|
596
|
+
|
|
597
|
+
/*
 * Validate s as a URI or, when permitted, as a URI-reference.
 *
 * s              - start of the input
 * len            - number of bytes available at s
 * require_scheme - when true only a full URI (scheme ":" ...) is accepted;
 *                  when false a relative-ref, including the empty string,
 *                  is accepted too
 * allow_utf8     - forwarded to the component parsers
 *
 * Returns true when the whole input matches.
 */
bool parse_uri_reference(const char *s, long len, bool require_scheme, bool allow_utf8) {
  if (len == 0) {
    /* The empty string is a valid relative-ref but never a full URI. */
    return !require_scheme;
  }

  if (try_parse_full_uri(s, len, allow_utf8)) {
    return true;
  }

  return !require_scheme && try_parse_relative_ref(s, len, allow_utf8);
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef FAST_JSON_FORMATS_UTILS_URI_PARSER_H
|
|
2
|
+
#define FAST_JSON_FORMATS_UTILS_URI_PARSER_H
|
|
3
|
+
|
|
4
|
+
#include <stdbool.h>
|
|
5
|
+
|
|
6
|
+
/*
|
|
7
|
+
* RFC 3986 URI / URI-reference parser, with optional RFC 3987 UTF-8 extensions.
|
|
8
|
+
*
|
|
9
|
+
* When require_scheme is true, input must start with a valid `scheme ":"`
|
|
10
|
+
* (RFC 3986 section 3.1). When false, input may also be a relative-ref including
|
|
11
|
+
* the empty string (RFC 3986 section 4.2 same-document reference).
|
|
12
|
+
*
|
|
13
|
+
* When allow_utf8 is true, any byte >= 0x80 must begin a valid UTF-8 sequence;
|
|
14
|
+
* such sequences are accepted anywhere `unreserved` characters are allowed
|
|
15
|
+
* (pragmatic IRI approach matching RFC 3987 section 2.2, without strict ucschar /
|
|
16
|
+
* iprivate Unicode range enforcement).
|
|
17
|
+
*/
|
|
18
|
+
bool parse_uri_reference(const char *s, long len, bool require_scheme, bool allow_utf8);
|
|
19
|
+
|
|
20
|
+
#endif
|