ata-validator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/ata.cpp ADDED
@@ -0,0 +1,1017 @@
1
+ #include "ata.h"
2
+
3
+ #include <algorithm>
4
+ #include <cmath>
5
+ #include <regex>
6
+ #include <set>
7
+ #include <unordered_map>
8
+
9
+ #include "simdjson.h"
10
+
11
+ // --- Fast format validators (no std::regex) ---
12
+
13
// ASCII-only character classifiers used by the format validators below.
// They compare against literal ranges, so the result never depends on the
// current locale.

/// True when `c` is one of '0'..'9'.
static bool is_digit(char c) { return '0' <= c && c <= '9'; }

/// True when `c` is an ASCII letter of either case.
static bool is_alpha(char c) {
  const bool lower = 'a' <= c && c <= 'z';
  const bool upper = 'A' <= c && c <= 'Z';
  return lower || upper;
}

/// True when `c` is an ASCII letter or digit.
static bool is_alnum(char c) { return is_digit(c) || is_alpha(c); }

/// True when `c` is a hexadecimal digit (0-9, a-f, A-F).
static bool is_hex(char c) {
  if (is_digit(c)) return true;
  if ('a' <= c && c <= 'f') return true;
  return 'A' <= c && c <= 'F';
}
21
+
22
/// Cheap structural check for the "email" format.
///
/// The string is split at its LAST '@' (a domain can never contain '@', while
/// a quoted local part can). The local part only has to be non-empty. The
/// domain must contain at least one dot, must not have empty labels, and its
/// final label (the TLD) must be at least two characters long.
///
/// This is deliberately lenient: it does not validate the characters of the
/// local part or of the domain labels.
static bool fast_check_email(std::string_view s) {
  const auto at = s.rfind('@');
  if (at == std::string_view::npos || at == 0 || at + 1 == s.size())
    return false;

  const std::string_view domain = s.substr(at + 1);
  const auto last_dot = domain.rfind('.');
  // Need a dot, and it may not be the first character of the domain.
  if (last_dot == std::string_view::npos || last_dot == 0) return false;
  // A leading dot or two adjacent dots would mean an empty label.
  if (domain.find("..") != std::string_view::npos) return false;

  // TLD = everything after the last dot; require at least 2 chars.
  return domain.size() - last_dot - 1 >= 2;
}
33
+
34
/// Validates an RFC 3339 full-date: exactly "YYYY-MM-DD".
///
/// Besides the shape, the month must be 01-12 and the day must exist in that
/// month (February 29 only in Gregorian leap years), so strings such as
/// "2023-13-45" are rejected.
static bool fast_check_date(std::string_view s) {
  if (s.size() != 10 || s[4] != '-' || s[7] != '-') return false;

  // Reads `count` ASCII digits starting at `pos`; -1 if any is not a digit.
  const auto number = [&s](size_t pos, size_t count) -> int {
    int value = 0;
    for (size_t i = pos; i < pos + count; ++i) {
      if (s[i] < '0' || s[i] > '9') return -1;
      value = value * 10 + (s[i] - '0');
    }
    return value;
  };

  const int year = number(0, 4);
  const int month = number(5, 2);
  const int day = number(8, 2);
  if (year < 0 || month < 1 || month > 12 || day < 1) return false;

  static constexpr int kDaysInMonth[12] = {31, 28, 31, 30, 31, 30,
                                           31, 31, 30, 31, 30, 31};
  int limit = kDaysInMonth[month - 1];
  const bool leap = (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0);
  if (month == 2 && leap) limit = 29;
  return day <= limit;
}
41
+
42
/// Validates a time of the form HH:MM:SS[.frac][Z|z|(+|-)HH:MM].
///
/// Hours must be 00-23, minutes 00-59 and seconds 00-60 (60 is tolerated for
/// leap seconds). A fraction, when present, needs at least one digit. The
/// offset stays optional, but anything that follows the seconds/fraction must
/// be a complete, well-formed offset; trailing garbage is rejected.
static bool fast_check_time(std::string_view s) {
  if (s.size() < 8 || s[2] != ':' || s[5] != ':') return false;

  // Reads the two ASCII digits at pos, pos+1; -1 if out of range or not digits.
  const auto two_digits = [&s](size_t pos) -> int {
    if (pos + 1 >= s.size()) return -1;
    const char hi = s[pos];
    const char lo = s[pos + 1];
    if (hi < '0' || hi > '9' || lo < '0' || lo > '9') return -1;
    return (hi - '0') * 10 + (lo - '0');
  };

  const int hour = two_digits(0);
  const int minute = two_digits(3);
  const int second = two_digits(6);
  if (hour < 0 || hour > 23) return false;
  if (minute < 0 || minute > 59) return false;
  if (second < 0 || second > 60) return false;

  size_t pos = 8;

  // Optional fractional seconds: '.' followed by one or more digits.
  if (pos < s.size() && s[pos] == '.') {
    const size_t frac_start = ++pos;
    while (pos < s.size() && s[pos] >= '0' && s[pos] <= '9') ++pos;
    if (pos == frac_start) return false;
  }

  if (pos == s.size()) return true;  // no offset given
  if (s[pos] == 'Z' || s[pos] == 'z') return pos + 1 == s.size();
  if (s[pos] != '+' && s[pos] != '-') return false;

  // Numeric offset must be exactly sign + "HH:MM".
  if (s.size() - pos != 6 || s[pos + 3] != ':') return false;
  const int off_hour = two_digits(pos + 1);
  const int off_minute = two_digits(pos + 4);
  return off_hour >= 0 && off_hour <= 23 && off_minute >= 0 &&
         off_minute <= 59;
}
51
+
52
+ static bool fast_check_datetime(std::string_view s) {
53
+ if (s.size() < 19) return false;
54
+ if (!fast_check_date(s.substr(0, 10))) return false;
55
+ if (s[10] != 'T' && s[10] != 't' && s[10] != ' ') return false;
56
+ return fast_check_time(s.substr(11));
57
+ }
58
+
59
/// Validates a dotted-quad IPv4 address: exactly four decimal octets (0-255)
/// separated by single dots.
///
/// Octets with a leading zero ("01", "007") are rejected: many network stacks
/// interpret them as octal, so accepting them lets the same string mean two
/// different addresses.
static bool fast_check_ipv4(std::string_view s) {
  int parts = 0;
  size_t pos = 0;
  while (true) {
    const size_t start = pos;
    int val = 0;
    while (pos < s.size() && s[pos] >= '0' && s[pos] <= '9') {
      val = val * 10 + (s[pos] - '0');
      ++pos;
      if (pos - start > 3) return false;  // at most three digits per octet
    }
    const size_t digits = pos - start;
    if (digits == 0 || val > 255) return false;
    if (digits > 1 && s[start] == '0') return false;  // no leading zeros
    ++parts;

    if (pos == s.size()) break;
    // Only a dot may follow an octet, and never after the fourth one.
    if (s[pos] != '.' || parts == 4) return false;
    ++pos;
  }
  return parts == 4;
}
77
+
78
+ static bool fast_check_uri(std::string_view s) {
79
+ if (s.size() < 3) return false;
80
+ // Must start with alpha, then scheme chars, then ':'
81
+ if (!is_alpha(s[0])) return false;
82
+ size_t i = 1;
83
+ while (i < s.size() && (is_alnum(s[i]) || s[i] == '+' || s[i] == '-' ||
84
+ s[i] == '.'))
85
+ ++i;
86
+ return i < s.size() && s[i] == ':' && i + 1 < s.size();
87
+ }
88
+
89
+ static bool fast_check_uuid(std::string_view s) {
90
+ // 8-4-4-4-12
91
+ if (s.size() != 36) return false;
92
+ for (size_t i = 0; i < 36; ++i) {
93
+ if (i == 8 || i == 13 || i == 18 || i == 23) {
94
+ if (s[i] != '-') return false;
95
+ } else {
96
+ if (!is_hex(s[i])) return false;
97
+ }
98
+ }
99
+ return true;
100
+ }
101
+
102
+ static bool fast_check_hostname(std::string_view s) {
103
+ if (s.empty() || s.size() > 253) return false;
104
+ size_t label_len = 0;
105
+ for (size_t i = 0; i < s.size(); ++i) {
106
+ if (s[i] == '.') {
107
+ if (label_len == 0) return false;
108
+ label_len = 0;
109
+ } else if (is_alnum(s[i]) || s[i] == '-') {
110
+ ++label_len;
111
+ if (label_len > 63) return false;
112
+ } else {
113
+ return false;
114
+ }
115
+ }
116
+ return label_len > 0;
117
+ }
118
+
119
+ static bool check_format(std::string_view sv, const std::string& fmt) {
120
+ if (fmt == "email") return fast_check_email(sv);
121
+ if (fmt == "date") return fast_check_date(sv);
122
+ if (fmt == "date-time") return fast_check_datetime(sv);
123
+ if (fmt == "time") return fast_check_time(sv);
124
+ if (fmt == "ipv4") return fast_check_ipv4(sv);
125
+ if (fmt == "ipv6") return sv.find(':') != std::string_view::npos;
126
+ if (fmt == "uri" || fmt == "uri-reference") return fast_check_uri(sv);
127
+ if (fmt == "uuid") return fast_check_uuid(sv);
128
+ if (fmt == "hostname") return fast_check_hostname(sv);
129
+ return true; // unknown formats pass
130
+ }
131
+
132
+ namespace ata {
133
+
134
+ using namespace simdjson;
135
+
136
+ // Forward declarations
137
+ struct schema_node;
138
+ using schema_node_ptr = std::shared_ptr<schema_node>;
139
+
140
+ struct schema_node { // One compiled (sub)schema: every keyword compile_node understands, pre-extracted for validate_node.
141
+ // Allowed JSON types: "string", "number", "integer", "boolean", "null",
142
+ // "object", "array". Empty means the schema has no usable `type` keyword.
143
+ std::vector<std::string> types;
144
+
145
+ // Numeric keywords; each is unset unless the schema supplied a number.
146
+ std::optional<double> minimum; // inclusive lower bound
147
+ std::optional<double> maximum; // inclusive upper bound
148
+ std::optional<double> exclusive_minimum; // value must be strictly greater
149
+ std::optional<double> exclusive_maximum; // value must be strictly smaller
150
+ std::optional<double> multiple_of; // checked with std::fmod and a 1e-8 tolerance
151
+
152
+ // String keywords. Lengths are compared against utf8_length(), not the byte count.
153
+ std::optional<uint64_t> min_length;
154
+ std::optional<uint64_t> max_length;
155
+ std::optional<std::string> pattern; // source text of `pattern`
156
+ std::shared_ptr<std::regex> compiled_pattern; // cached compiled regex; stays null when `pattern` is absent or fails to compile
157
+
158
+ // Array keywords.
159
+ std::optional<uint64_t> min_items;
160
+ std::optional<uint64_t> max_items;
161
+ bool unique_items = false; // when true, items are compared by their minified JSON text
162
+ schema_node_ptr items_schema; // applied to elements at index >= prefix_items.size()
163
+ std::vector<schema_node_ptr> prefix_items; // positional schemas for the leading elements
164
+
165
+ // Object keywords.
166
+ std::unordered_map<std::string, schema_node_ptr> properties; // property name -> schema
167
+ std::vector<std::string> required; // names that must be present
168
+ std::optional<bool> additional_properties_bool; // set when `additionalProperties` is a boolean
169
+ schema_node_ptr additional_properties_schema; // set when `additionalProperties` is anything else
170
+ std::optional<uint64_t> min_properties;
171
+ std::optional<uint64_t> max_properties;
172
+
173
+ // patternProperties: (regex source, schema) pairs; the regex is compiled during validation.
174
+ std::vector<std::pair<std::string, schema_node_ptr>> pattern_properties;
175
+
176
+ // enum / const, stored as minified JSON text and compared textually.
177
+ std::optional<std::string> enum_values_raw; // minified text of the whole `enum` value
178
+ std::vector<std::string> enum_values_minified; // one minified string per `enum` element
179
+ std::optional<std::string> const_value_raw; // minified text of the `const` value
180
+
181
+ // Name of the `format` keyword, interpreted by check_format().
182
+ std::optional<std::string> format;
183
+
184
+ // Composition keywords.
185
+ std::vector<schema_node_ptr> all_of;
186
+ std::vector<schema_node_ptr> any_of;
187
+ std::vector<schema_node_ptr> one_of;
188
+ schema_node_ptr not_schema;
189
+
190
+ // Conditional keywords; then/else are only consulted when if_schema is set.
191
+ schema_node_ptr if_schema;
192
+ schema_node_ptr then_schema;
193
+ schema_node_ptr else_schema;
194
+
195
+ // $ref target; empty when absent. When non-empty, validate_node follows it and skips the other keywords.
196
+ std::string ref;
197
+
198
+ // Set only when the schema itself is the JSON literal true or false.
199
+ std::optional<bool> boolean_schema;
200
+ };
201
+
202
+ struct compiled_schema { // Result of compile(): the node tree plus the parsers and lookup table it depends on.
203
+ schema_node_ptr root; // node compiled from the top-level schema document
204
+ std::unordered_map<std::string, schema_node_ptr> defs; // $ref lookup: "#/$defs/<name>", "#/definitions/<name>" and every `$id` string seen
205
+ std::string raw_schema; // owned copy of the schema text handed to compile()
206
+ dom::parser parser; // parses raw_schema during compile()
207
+ dom::parser doc_parser; // reusable parser for document validation
208
+ };
209
+
210
+ // --- Schema compilation ---
211
+
212
+ static schema_node_ptr compile_node(dom::element el,
213
+ compiled_schema& ctx);
214
+
215
+ static schema_node_ptr compile_node(dom::element el,
216
+ compiled_schema& ctx) {
217
+ auto node = std::make_shared<schema_node>();
218
+
219
+ // Boolean schema
220
+ if (el.is<bool>()) {
221
+ node->boolean_schema = bool(el);
222
+ return node;
223
+ }
224
+
225
+ if (!el.is<dom::object>()) {
226
+ return node;
227
+ }
228
+
229
+ auto obj = dom::object(el);
230
+
231
+ // $ref
232
+ dom::element ref_el;
233
+ if (obj["$ref"].get(ref_el) == SUCCESS) {
234
+ std::string_view ref_sv;
235
+ if (ref_el.get(ref_sv) == SUCCESS) {
236
+ node->ref = std::string(ref_sv);
237
+ }
238
+ }
239
+
240
+ // type
241
+ dom::element type_el;
242
+ if (obj["type"].get(type_el) == SUCCESS) {
243
+ if (type_el.is<std::string_view>()) {
244
+ std::string_view sv;
245
+ type_el.get(sv);
246
+ node->types.emplace_back(sv);
247
+ } else if (type_el.is<dom::array>()) {
248
+ for (auto t : dom::array(type_el)) {
249
+ std::string_view sv;
250
+ if (t.get(sv) == SUCCESS) {
251
+ node->types.emplace_back(sv);
252
+ }
253
+ }
254
+ }
255
+ }
256
+
257
+ // numeric constraints
258
+ dom::element num_el;
259
+ if (obj["minimum"].get(num_el) == SUCCESS) {
260
+ double v;
261
+ if (num_el.get(v) == SUCCESS) node->minimum = v;
262
+ }
263
+ if (obj["maximum"].get(num_el) == SUCCESS) {
264
+ double v;
265
+ if (num_el.get(v) == SUCCESS) node->maximum = v;
266
+ }
267
+ if (obj["exclusiveMinimum"].get(num_el) == SUCCESS) {
268
+ double v;
269
+ if (num_el.get(v) == SUCCESS) node->exclusive_minimum = v;
270
+ }
271
+ if (obj["exclusiveMaximum"].get(num_el) == SUCCESS) {
272
+ double v;
273
+ if (num_el.get(v) == SUCCESS) node->exclusive_maximum = v;
274
+ }
275
+ if (obj["multipleOf"].get(num_el) == SUCCESS) {
276
+ double v;
277
+ if (num_el.get(v) == SUCCESS) node->multiple_of = v;
278
+ }
279
+
280
+ // string constraints
281
+ dom::element str_el;
282
+ if (obj["minLength"].get(str_el) == SUCCESS) {
283
+ uint64_t v;
284
+ if (str_el.get(v) == SUCCESS) node->min_length = v;
285
+ }
286
+ if (obj["maxLength"].get(str_el) == SUCCESS) {
287
+ uint64_t v;
288
+ if (str_el.get(v) == SUCCESS) node->max_length = v;
289
+ }
290
+ if (obj["pattern"].get(str_el) == SUCCESS) {
291
+ std::string_view sv;
292
+ if (str_el.get(sv) == SUCCESS) {
293
+ node->pattern = std::string(sv);
294
+ try {
295
+ node->compiled_pattern =
296
+ std::make_shared<std::regex>(node->pattern.value());
297
+ } catch (...) {
298
+ // Invalid regex — leave compiled_pattern null
299
+ }
300
+ }
301
+ }
302
+
303
+ // array constraints
304
+ if (obj["minItems"].get(str_el) == SUCCESS) {
305
+ uint64_t v;
306
+ if (str_el.get(v) == SUCCESS) node->min_items = v;
307
+ }
308
+ if (obj["maxItems"].get(str_el) == SUCCESS) {
309
+ uint64_t v;
310
+ if (str_el.get(v) == SUCCESS) node->max_items = v;
311
+ }
312
+ dom::element ui_el;
313
+ if (obj["uniqueItems"].get(ui_el) == SUCCESS) {
314
+ bool v;
315
+ if (ui_el.get(v) == SUCCESS) node->unique_items = v;
316
+ }
317
+ // prefixItems (Draft 2020-12)
318
+ dom::element pi_el;
319
+ if (obj["prefixItems"].get(pi_el) == SUCCESS && pi_el.is<dom::array>()) {
320
+ for (auto item : dom::array(pi_el)) {
321
+ node->prefix_items.push_back(compile_node(item, ctx));
322
+ }
323
+ }
324
+
325
+ dom::element items_el;
326
+ if (obj["items"].get(items_el) == SUCCESS) {
327
+ node->items_schema = compile_node(items_el, ctx);
328
+ }
329
+
330
+ // object constraints
331
+ dom::element props_el;
332
+ if (obj["properties"].get(props_el) == SUCCESS && props_el.is<dom::object>()) {
333
+ for (auto [key, val] : dom::object(props_el)) {
334
+ node->properties[std::string(key)] = compile_node(val, ctx);
335
+ }
336
+ }
337
+
338
+ dom::element req_el;
339
+ if (obj["required"].get(req_el) == SUCCESS && req_el.is<dom::array>()) {
340
+ for (auto r : dom::array(req_el)) {
341
+ std::string_view sv;
342
+ if (r.get(sv) == SUCCESS) {
343
+ node->required.emplace_back(sv);
344
+ }
345
+ }
346
+ }
347
+
348
+ dom::element ap_el;
349
+ if (obj["additionalProperties"].get(ap_el) == SUCCESS) {
350
+ if (ap_el.is<bool>()) {
351
+ node->additional_properties_bool = bool(ap_el);
352
+ } else {
353
+ node->additional_properties_schema = compile_node(ap_el, ctx);
354
+ }
355
+ }
356
+
357
+ if (obj["minProperties"].get(str_el) == SUCCESS) {
358
+ uint64_t v;
359
+ if (str_el.get(v) == SUCCESS) node->min_properties = v;
360
+ }
361
+ if (obj["maxProperties"].get(str_el) == SUCCESS) {
362
+ uint64_t v;
363
+ if (str_el.get(v) == SUCCESS) node->max_properties = v;
364
+ }
365
+
366
+ // patternProperties
367
+ dom::element pp_el;
368
+ if (obj["patternProperties"].get(pp_el) == SUCCESS &&
369
+ pp_el.is<dom::object>()) {
370
+ for (auto [key, val] : dom::object(pp_el)) {
371
+ node->pattern_properties.emplace_back(std::string(key),
372
+ compile_node(val, ctx));
373
+ }
374
+ }
375
+
376
+ // format
377
+ dom::element fmt_el;
378
+ if (obj["format"].get(fmt_el) == SUCCESS) {
379
+ std::string_view sv;
380
+ if (fmt_el.get(sv) == SUCCESS) node->format = std::string(sv);
381
+ }
382
+
383
+ // $id (register in defs for potential resolution)
384
+ dom::element id_el;
385
+ if (obj["$id"].get(id_el) == SUCCESS) {
386
+ std::string_view sv;
387
+ if (id_el.get(sv) == SUCCESS) {
388
+ ctx.defs[std::string(sv)] = node;
389
+ }
390
+ }
391
+
392
+ // enum — pre-minify each value at compile time
393
+ dom::element enum_el;
394
+ if (obj["enum"].get(enum_el) == SUCCESS) {
395
+ node->enum_values_raw = std::string(minify(enum_el));
396
+ if (enum_el.is<dom::array>()) {
397
+ for (auto e : dom::array(enum_el)) {
398
+ node->enum_values_minified.push_back(std::string(minify(e)));
399
+ }
400
+ }
401
+ }
402
+
403
+ // const
404
+ dom::element const_el;
405
+ if (obj["const"].get(const_el) == SUCCESS) {
406
+ node->const_value_raw = std::string(minify(const_el));
407
+ }
408
+
409
+ // composition
410
+ dom::element comp_el;
411
+ if (obj["allOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
412
+ for (auto s : dom::array(comp_el)) {
413
+ node->all_of.push_back(compile_node(s, ctx));
414
+ }
415
+ }
416
+ if (obj["anyOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
417
+ for (auto s : dom::array(comp_el)) {
418
+ node->any_of.push_back(compile_node(s, ctx));
419
+ }
420
+ }
421
+ if (obj["oneOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
422
+ for (auto s : dom::array(comp_el)) {
423
+ node->one_of.push_back(compile_node(s, ctx));
424
+ }
425
+ }
426
+ dom::element not_el;
427
+ if (obj["not"].get(not_el) == SUCCESS) {
428
+ node->not_schema = compile_node(not_el, ctx);
429
+ }
430
+
431
+ // conditional
432
+ dom::element if_el;
433
+ if (obj["if"].get(if_el) == SUCCESS) {
434
+ node->if_schema = compile_node(if_el, ctx);
435
+ }
436
+ dom::element then_el;
437
+ if (obj["then"].get(then_el) == SUCCESS) {
438
+ node->then_schema = compile_node(then_el, ctx);
439
+ }
440
+ dom::element else_el;
441
+ if (obj["else"].get(else_el) == SUCCESS) {
442
+ node->else_schema = compile_node(else_el, ctx);
443
+ }
444
+
445
+ // $defs / definitions
446
+ dom::element defs_el;
447
+ if (obj["$defs"].get(defs_el) == SUCCESS && defs_el.is<dom::object>()) {
448
+ for (auto [key, val] : dom::object(defs_el)) {
449
+ std::string def_path = "#/$defs/" + std::string(key);
450
+ ctx.defs[def_path] = compile_node(val, ctx);
451
+ }
452
+ }
453
+ if (obj["definitions"].get(defs_el) == SUCCESS &&
454
+ defs_el.is<dom::object>()) {
455
+ for (auto [key, val] : dom::object(defs_el)) {
456
+ std::string def_path = "#/definitions/" + std::string(key);
457
+ ctx.defs[def_path] = compile_node(val, ctx);
458
+ }
459
+ }
460
+
461
+ return node;
462
+ }
463
+
464
+ // --- Validation ---
465
+
466
+ static void validate_node(const schema_node_ptr& node, // schema to apply; a null node validates nothing
466
+ dom::element value, // instance value being checked
467
+ const std::string& path, // '/'-joined location of `value`, stored in each error
468
+ const compiled_schema& ctx, // supplies `root` and `defs` for $ref resolution
469
+ std::vector<validation_error>& errors, // output: failures are appended here
470
+ bool all_errors = true); // false enables the ATA_CHECK_EARLY bail-outs
471
+
472
+ // Early-termination helper for validate_node: returns from the enclosing function once an error has been recorded, unless all_errors is set.
473
+ #define ATA_CHECK_EARLY() if (!all_errors && !errors.empty()) return // expects `all_errors` and `errors` to be in scope at the expansion site
475
+
476
+ // Use string_view to avoid allocations in hot path
477
+ static std::string_view type_of_sv(dom::element el) {
478
+ switch (el.type()) {
479
+ case dom::element_type::STRING: return "string";
480
+ case dom::element_type::INT64:
481
+ case dom::element_type::UINT64: return "integer";
482
+ case dom::element_type::DOUBLE: return "number";
483
+ case dom::element_type::BOOL: return "boolean";
484
+ case dom::element_type::NULL_VALUE:return "null";
485
+ case dom::element_type::ARRAY: return "array";
486
+ case dom::element_type::OBJECT: return "object";
487
+ }
488
+ return "unknown";
489
+ }
490
+
491
+ static std::string type_of(dom::element el) {
492
+ return std::string(type_of_sv(el));
493
+ }
494
+
495
+ static bool type_matches(dom::element el, const std::string& type) {
496
+ auto actual = type_of_sv(el);
497
+ if (actual == type) return true;
498
+ if (type == "number" && (actual == "integer" || actual == "number"))
499
+ return true;
500
+ return false;
501
+ }
502
+
503
+ static double to_double(dom::element el) {
504
+ double v = 0;
505
+ if (el.get(v) == SUCCESS) return v;
506
+ int64_t i = 0;
507
+ if (el.get(i) == SUCCESS) return static_cast<double>(i);
508
+ uint64_t u = 0;
509
+ if (el.get(u) == SUCCESS) return static_cast<double>(u);
510
+ return 0;
511
+ }
512
+
513
// Returns the number of UTF-8 code points in `s`.
// Every code point contributes exactly one byte that is not a continuation
// byte (continuation bytes look like 10xxxxxx, i.e. 0x80-0xBF), so counting
// those bytes counts code points. The input is not checked for validity.
static uint64_t utf8_length(std::string_view s) {
  const auto starts_codepoint = [](char ch) {
    return (static_cast<unsigned char>(ch) & 0xC0) != 0x80;
  };
  return static_cast<uint64_t>(
      std::count_if(s.begin(), s.end(), starts_codepoint));
}
523
+
524
+ static void validate_node(const schema_node_ptr& node,
525
+ dom::element value,
526
+ const std::string& path,
527
+ const compiled_schema& ctx,
528
+ std::vector<validation_error>& errors,
529
+ bool all_errors) {
530
+ if (!node) return;
531
+
532
+ // Boolean schema
533
+ if (node->boolean_schema.has_value()) {
534
+ if (!node->boolean_schema.value()) {
535
+ errors.push_back({error_code::type_mismatch, path,
536
+ "schema is false, no value is valid"});
537
+ }
538
+ return;
539
+ }
540
+
541
+ // $ref
542
+ if (!node->ref.empty()) {
543
+ // First check defs map
544
+ auto it = ctx.defs.find(node->ref);
545
+ if (it != ctx.defs.end()) {
546
+ validate_node(it->second, value, path, ctx, errors, all_errors);
547
+ return;
548
+ }
549
+ // Try JSON Pointer resolution from root (e.g., "#/properties/foo")
550
+ if (node->ref.size() > 1 && node->ref[0] == '#' &&
551
+ node->ref[1] == '/') {
552
+ // Walk the schema tree following the pointer
553
+ std::string pointer = node->ref.substr(2);
554
+ schema_node_ptr current = ctx.root;
555
+ bool resolved = true;
556
+ size_t pos = 0;
557
+ while (pos < pointer.size() && current) {
558
+ size_t next = pointer.find('/', pos);
559
+ std::string segment =
560
+ pointer.substr(pos, next == std::string::npos ? next : next - pos);
561
+ // Unescape JSON Pointer: ~1 -> /, ~0 -> ~
562
+ std::string key;
563
+ for (size_t i = 0; i < segment.size(); ++i) {
564
+ if (segment[i] == '~' && i + 1 < segment.size()) {
565
+ if (segment[i + 1] == '1') { key += '/'; ++i; }
566
+ else if (segment[i + 1] == '0') { key += '~'; ++i; }
567
+ else key += segment[i];
568
+ } else {
569
+ key += segment[i];
570
+ }
571
+ }
572
+ // Navigate the compiled schema tree
573
+ if (key == "properties" && !current->properties.empty()) {
574
+ // Next segment is the property name
575
+ pos = (next == std::string::npos) ? pointer.size() : next + 1;
576
+ next = pointer.find('/', pos);
577
+ std::string prop_name = pointer.substr(
578
+ pos, next == std::string::npos ? next : next - pos);
579
+ auto pit = current->properties.find(prop_name);
580
+ if (pit != current->properties.end()) {
581
+ current = pit->second;
582
+ } else {
583
+ resolved = false; break;
584
+ }
585
+ } else if (key == "items" && current->items_schema) {
586
+ current = current->items_schema;
587
+ } else if (key == "$defs" || key == "definitions") {
588
+ // Next segment is the def name — already in ctx.defs
589
+ pos = (next == std::string::npos) ? pointer.size() : next + 1;
590
+ next = pointer.find('/', pos);
591
+ std::string def_name = pointer.substr(
592
+ pos, next == std::string::npos ? next : next - pos);
593
+ std::string full_ref = "#/" + key + "/" + def_name;
594
+ auto dit = ctx.defs.find(full_ref);
595
+ if (dit != ctx.defs.end()) {
596
+ current = dit->second;
597
+ } else {
598
+ resolved = false; break;
599
+ }
600
+ } else if (key == "allOf" || key == "anyOf" || key == "oneOf") {
601
+ pos = (next == std::string::npos) ? pointer.size() : next + 1;
602
+ next = pointer.find('/', pos);
603
+ std::string idx_str = pointer.substr(
604
+ pos, next == std::string::npos ? next : next - pos);
605
+ size_t idx = std::stoul(idx_str);
606
+ auto& vec = (key == "allOf") ? current->all_of
607
+ : (key == "anyOf") ? current->any_of
608
+ : current->one_of;
609
+ if (idx < vec.size()) {
610
+ current = vec[idx];
611
+ } else {
612
+ resolved = false; break;
613
+ }
614
+ } else if (key == "not" && current->not_schema) {
615
+ current = current->not_schema;
616
+ } else if (key == "if" && current->if_schema) {
617
+ current = current->if_schema;
618
+ } else if (key == "then" && current->then_schema) {
619
+ current = current->then_schema;
620
+ } else if (key == "else" && current->else_schema) {
621
+ current = current->else_schema;
622
+ } else if (key == "additionalProperties" &&
623
+ current->additional_properties_schema) {
624
+ current = current->additional_properties_schema;
625
+ } else if (key == "prefixItems") {
626
+ pos = (next == std::string::npos) ? pointer.size() : next + 1;
627
+ next = pointer.find('/', pos);
628
+ std::string idx_str = pointer.substr(
629
+ pos, next == std::string::npos ? next : next - pos);
630
+ size_t idx = std::stoul(idx_str);
631
+ if (idx < current->prefix_items.size()) {
632
+ current = current->prefix_items[idx];
633
+ } else {
634
+ resolved = false; break;
635
+ }
636
+ } else {
637
+ resolved = false; break;
638
+ }
639
+ pos = (next == std::string::npos) ? pointer.size() : next + 1;
640
+ }
641
+ if (resolved && current) {
642
+ validate_node(current, value, path, ctx, errors, all_errors);
643
+ return;
644
+ }
645
+ }
646
+ // Self-reference: "#"
647
+ if (node->ref == "#" && ctx.root) {
648
+ validate_node(ctx.root, value, path, ctx, errors, all_errors);
649
+ return;
650
+ }
651
+ errors.push_back({error_code::ref_not_found, path,
652
+ "cannot resolve $ref: " + node->ref});
653
+ return;
654
+ }
655
+
656
+ // type
657
+ if (!node->types.empty()) {
658
+ bool match = false;
659
+ for (const auto& t : node->types) {
660
+ if (type_matches(value, t)) {
661
+ match = true;
662
+ break;
663
+ }
664
+ }
665
+ if (!match) {
666
+ std::string expected;
667
+ for (size_t i = 0; i < node->types.size(); ++i) {
668
+ if (i > 0) expected += ", ";
669
+ expected += node->types[i];
670
+ }
671
+ errors.push_back({error_code::type_mismatch, path,
672
+ "expected type " + expected + ", got " + type_of(value)});
673
+ ATA_CHECK_EARLY();
674
+ }
675
+ }
676
+
677
+ // enum — use pre-minified values (no re-parsing)
678
+ if (!node->enum_values_minified.empty()) {
679
+ std::string val_str = std::string(minify(value));
680
+ bool found = false;
681
+ for (const auto& ev : node->enum_values_minified) {
682
+ if (ev == val_str) {
683
+ found = true;
684
+ break;
685
+ }
686
+ }
687
+ if (!found) {
688
+ errors.push_back({error_code::enum_mismatch, path,
689
+ "value not in enum"});
690
+ }
691
+ }
692
+
693
+ // const
694
+ if (node->const_value_raw.has_value()) {
695
+ std::string val_str = std::string(minify(value));
696
+ if (val_str != node->const_value_raw.value()) {
697
+ errors.push_back({error_code::const_mismatch, path,
698
+ "value does not match const"});
699
+ ATA_CHECK_EARLY();
700
+ }
701
+ }
702
+
703
+ ATA_CHECK_EARLY();
704
+ // Numeric validations
705
+ auto actual_type = type_of(value);
706
+ if (actual_type == "integer" || actual_type == "number") {
707
+ double v = to_double(value);
708
+ if (node->minimum.has_value() && v < node->minimum.value()) {
709
+ errors.push_back({error_code::minimum_violation, path,
710
+ "value " + std::to_string(v) + " < minimum " +
711
+ std::to_string(node->minimum.value())});
712
+ }
713
+ if (node->maximum.has_value() && v > node->maximum.value()) {
714
+ errors.push_back({error_code::maximum_violation, path,
715
+ "value " + std::to_string(v) + " > maximum " +
716
+ std::to_string(node->maximum.value())});
717
+ }
718
+ if (node->exclusive_minimum.has_value() &&
719
+ v <= node->exclusive_minimum.value()) {
720
+ errors.push_back({error_code::exclusive_minimum_violation, path,
721
+ "value must be > " +
722
+ std::to_string(node->exclusive_minimum.value())});
723
+ }
724
+ if (node->exclusive_maximum.has_value() &&
725
+ v >= node->exclusive_maximum.value()) {
726
+ errors.push_back({error_code::exclusive_maximum_violation, path,
727
+ "value must be < " +
728
+ std::to_string(node->exclusive_maximum.value())});
729
+ }
730
+ if (node->multiple_of.has_value()) {
731
+ double divisor = node->multiple_of.value();
732
+ double rem = std::fmod(v, divisor);
733
+ // Use relative tolerance for floating point comparison
734
+ if (std::abs(rem) > 1e-8 && std::abs(rem - divisor) > 1e-8) {
735
+ errors.push_back({error_code::multiple_of_violation, path,
736
+ "value not a multiple of " +
737
+ std::to_string(node->multiple_of.value())});
738
+ }
739
+ }
740
+ }
741
+
742
+ // String validations
743
+ if (actual_type == "string") {
744
+ std::string_view sv;
745
+ value.get(sv);
746
+ uint64_t len = utf8_length(sv);
747
+
748
+ if (node->min_length.has_value() && len < node->min_length.value()) {
749
+ errors.push_back({error_code::min_length_violation, path,
750
+ "string length " + std::to_string(len) +
751
+ " < minLength " +
752
+ std::to_string(node->min_length.value())});
753
+ }
754
+ if (node->max_length.has_value() && len > node->max_length.value()) {
755
+ errors.push_back({error_code::max_length_violation, path,
756
+ "string length " + std::to_string(len) +
757
+ " > maxLength " +
758
+ std::to_string(node->max_length.value())});
759
+ }
760
+ if (node->compiled_pattern) {
761
+ if (!std::regex_search(sv.begin(), sv.end(), *node->compiled_pattern)) {
762
+ errors.push_back({error_code::pattern_mismatch, path,
763
+ "string does not match pattern: " +
764
+ node->pattern.value()});
765
+ }
766
+ }
767
+
768
+ if (node->format.has_value()) {
769
+ if (!check_format(sv, node->format.value())) {
770
+ errors.push_back({error_code::format_mismatch, path,
771
+ "string does not match format: " +
772
+ node->format.value()});
773
+ }
774
+ }
775
+ }
776
+
777
+ // Array validations
778
+ if (actual_type == "array" && value.is<dom::array>()) {
779
+ auto arr = dom::array(value);
780
+ uint64_t arr_size = 0;
781
+ for ([[maybe_unused]] auto _ : arr) ++arr_size;
782
+
783
+ if (node->min_items.has_value() && arr_size < node->min_items.value()) {
784
+ errors.push_back({error_code::min_items_violation, path,
785
+ "array has " + std::to_string(arr_size) +
786
+ " items, minimum " +
787
+ std::to_string(node->min_items.value())});
788
+ }
789
+ if (node->max_items.has_value() && arr_size > node->max_items.value()) {
790
+ errors.push_back({error_code::max_items_violation, path,
791
+ "array has " + std::to_string(arr_size) +
792
+ " items, maximum " +
793
+ std::to_string(node->max_items.value())});
794
+ }
795
+
796
+ if (node->unique_items) {
797
+ std::set<std::string> seen;
798
+ bool has_dup = false;
799
+ for (auto item : arr) {
800
+ auto s = std::string(minify(item));
801
+ if (!seen.insert(s).second) {
802
+ has_dup = true;
803
+ break;
804
+ }
805
+ }
806
+ if (has_dup) {
807
+ errors.push_back({error_code::unique_items_violation, path,
808
+ "array contains duplicate items"});
809
+ }
810
+ }
811
+
812
+ // prefixItems + items (Draft 2020-12 semantics)
813
+ {
814
+ uint64_t idx = 0;
815
+ for (auto item : arr) {
816
+ if (idx < node->prefix_items.size()) {
817
+ validate_node(node->prefix_items[idx], item,
818
+ path + "/" + std::to_string(idx), ctx, errors);
819
+ } else if (node->items_schema) {
820
+ validate_node(node->items_schema, item,
821
+ path + "/" + std::to_string(idx), ctx, errors);
822
+ }
823
+ ++idx;
824
+ }
825
+ }
826
+ }
827
+
828
+ // Object validations
829
+ if (actual_type == "object" && value.is<dom::object>()) {
830
+ auto obj = dom::object(value);
831
+ uint64_t prop_count = 0;
832
+ for ([[maybe_unused]] auto _ : obj) ++prop_count;
833
+
834
+ if (node->min_properties.has_value() &&
835
+ prop_count < node->min_properties.value()) {
836
+ errors.push_back({error_code::min_properties_violation, path,
837
+ "object has " + std::to_string(prop_count) +
838
+ " properties, minimum " +
839
+ std::to_string(node->min_properties.value())});
840
+ }
841
+ if (node->max_properties.has_value() &&
842
+ prop_count > node->max_properties.value()) {
843
+ errors.push_back({error_code::max_properties_violation, path,
844
+ "object has " + std::to_string(prop_count) +
845
+ " properties, maximum " +
846
+ std::to_string(node->max_properties.value())});
847
+ }
848
+
849
+ // required
850
+ for (const auto& req : node->required) {
851
+ dom::element dummy;
852
+ if (obj[req].get(dummy) != SUCCESS) {
853
+ errors.push_back({error_code::required_property_missing, path,
854
+ "missing required property: " + req});
855
+ }
856
+ }
857
+
858
+ // properties + patternProperties + additionalProperties
859
+ for (auto [key, val] : obj) {
860
+ std::string key_str(key);
861
+ bool matched = false;
862
+
863
+ // Check properties
864
+ auto it = node->properties.find(key_str);
865
+ if (it != node->properties.end()) {
866
+ validate_node(it->second, val, path + "/" + key_str, ctx, errors, all_errors);
867
+ matched = true;
868
+ }
869
+
870
+ // Check patternProperties
871
+ for (const auto& [pat, pat_schema] : node->pattern_properties) {
872
+ try {
873
+ std::regex re(pat);
874
+ if (std::regex_search(key_str, re)) {
875
+ validate_node(pat_schema, val, path + "/" + key_str, ctx, errors, all_errors);
876
+ matched = true;
877
+ }
878
+ } catch (...) {
879
+ }
880
+ }
881
+
882
+ // additionalProperties (only if not matched by properties or patternProperties)
883
+ if (!matched) {
884
+ if (node->additional_properties_bool.has_value() &&
885
+ !node->additional_properties_bool.value()) {
886
+ errors.push_back(
887
+ {error_code::additional_property_not_allowed, path,
888
+ "additional property not allowed: " + key_str});
889
+ } else if (node->additional_properties_schema) {
890
+ validate_node(node->additional_properties_schema, val,
891
+ path + "/" + key_str, ctx, errors);
892
+ }
893
+ }
894
+ }
895
+ }
896
+
897
+ // allOf
898
+ if (!node->all_of.empty()) {
899
+ for (const auto& sub : node->all_of) {
900
+ std::vector<validation_error> sub_errors;
901
+ validate_node(sub, value, path, ctx, sub_errors, all_errors);
902
+ if (!sub_errors.empty()) {
903
+ errors.push_back({error_code::all_of_failed, path,
904
+ "allOf subschema failed"});
905
+ errors.insert(errors.end(), sub_errors.begin(), sub_errors.end());
906
+ }
907
+ }
908
+ }
909
+
910
+ // anyOf
911
+ if (!node->any_of.empty()) {
912
+ bool any_valid = false;
913
+ for (const auto& sub : node->any_of) {
914
+ std::vector<validation_error> sub_errors;
915
+ validate_node(sub, value, path, ctx, sub_errors, all_errors);
916
+ if (sub_errors.empty()) {
917
+ any_valid = true;
918
+ break;
919
+ }
920
+ }
921
+ if (!any_valid) {
922
+ errors.push_back({error_code::any_of_failed, path,
923
+ "no anyOf subschema matched"});
924
+ }
925
+ }
926
+
927
+ // oneOf
928
+ if (!node->one_of.empty()) {
929
+ int match_count = 0;
930
+ for (const auto& sub : node->one_of) {
931
+ std::vector<validation_error> sub_errors;
932
+ validate_node(sub, value, path, ctx, sub_errors, all_errors);
933
+ if (sub_errors.empty()) ++match_count;
934
+ }
935
+ if (match_count != 1) {
936
+ errors.push_back({error_code::one_of_failed, path,
937
+ "expected exactly one oneOf match, got " +
938
+ std::to_string(match_count)});
939
+ }
940
+ }
941
+
942
+ // not
943
+ if (node->not_schema) {
944
+ std::vector<validation_error> sub_errors;
945
+ validate_node(node->not_schema, value, path, ctx, sub_errors, all_errors);
946
+ if (sub_errors.empty()) {
947
+ errors.push_back({error_code::not_failed, path,
948
+ "value should not match 'not' schema"});
949
+ }
950
+ }
951
+
952
+ // if/then/else
953
+ if (node->if_schema) {
954
+ std::vector<validation_error> if_errors;
955
+ validate_node(node->if_schema, value, path, ctx, if_errors, all_errors);
956
+ if (if_errors.empty()) {
957
+ // if passed → validate then
958
+ if (node->then_schema) {
959
+ validate_node(node->then_schema, value, path, ctx, errors, all_errors);
960
+ }
961
+ } else {
962
+ // if failed → validate else
963
+ if (node->else_schema) {
964
+ validate_node(node->else_schema, value, path, ctx, errors, all_errors);
965
+ }
966
+ }
967
+ }
968
+ }
969
+
970
+ schema_ref compile(std::string_view schema_json) {
971
+ auto ctx = std::make_shared<compiled_schema>();
972
+ ctx->raw_schema = std::string(schema_json);
973
+
974
+ dom::element doc;
975
+ auto result = ctx->parser.parse(ctx->raw_schema);
976
+ if (result.error()) {
977
+ return schema_ref{nullptr};
978
+ }
979
+ doc = result.value();
980
+
981
+ ctx->root = compile_node(doc, *ctx);
982
+
983
+ schema_ref ref;
984
+ ref.impl = ctx;
985
+ return ref;
986
+ }
987
+
988
+ validation_result validate(const schema_ref& schema, std::string_view json,
989
+ const validate_options& opts) {
990
+ if (!schema.impl || !schema.impl->root) {
991
+ return {false, {{error_code::invalid_schema, "", "schema not compiled"}}};
992
+ }
993
+
994
+ auto padded = simdjson::padded_string(json);
995
+ auto result = schema.impl->doc_parser.parse(padded);
996
+ if (result.error()) {
997
+ return {false, {{error_code::invalid_json, "", "invalid JSON document"}}};
998
+ }
999
+
1000
+ std::vector<validation_error> errors;
1001
+ validate_node(schema.impl->root, result.value(), "", *schema.impl, errors,
1002
+ opts.all_errors);
1003
+
1004
+ return {errors.empty(), std::move(errors)};
1005
+ }
1006
+
1007
+ validation_result validate(std::string_view schema_json,
1008
+ std::string_view json,
1009
+ const validate_options& opts) {
1010
+ auto s = compile(schema_json);
1011
+ if (!s) {
1012
+ return {false, {{error_code::invalid_schema, "", "failed to compile schema"}}};
1013
+ }
1014
+ return validate(s, json, opts);
1015
+ }
1016
+
1017
+ } // namespace ata