ata-validator 0.1.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/ata.cpp CHANGED
@@ -2,10 +2,17 @@
2
2
 
3
3
  #include <algorithm>
4
4
  #include <cmath>
5
- #include <regex>
5
+ #include <re2/re2.h>
6
6
  #include <set>
7
7
  #include <unordered_map>
8
8
 
9
+ #ifdef _WIN32
10
+ #include <windows.h>
11
+ #include <sysinfoapi.h>
12
+ #else
13
+ #include <unistd.h>
14
+ #endif
15
+
9
16
  #include "simdjson.h"
10
17
 
11
18
  // --- Fast format validators (no std::regex) ---
@@ -133,6 +140,43 @@ namespace ata {
133
140
 
134
141
  using namespace simdjson;
135
142
 
143
+ // Canonical JSON: sort object keys for semantic equality comparison
144
+ static std::string canonical_json(dom::element el) {
145
+ switch (el.type()) {
146
+ case dom::element_type::OBJECT: {
147
+ dom::object obj; el.get(obj);
148
+ std::vector<std::pair<std::string_view, dom::element>> entries;
149
+ for (auto [k, v] : obj) entries.push_back({k, v});
150
+ std::sort(entries.begin(), entries.end(),
151
+ [](const auto& a, const auto& b) { return a.first < b.first; });
152
+ std::string r = "{";
153
+ for (size_t i = 0; i < entries.size(); ++i) {
154
+ if (i) r += ',';
155
+ r += '"';
156
+ r += entries[i].first;
157
+ r += "\":";
158
+ r += canonical_json(entries[i].second);
159
+ }
160
+ r += '}';
161
+ return r;
162
+ }
163
+ case dom::element_type::ARRAY: {
164
+ dom::array arr; el.get(arr);
165
+ std::string r = "[";
166
+ bool first = true;
167
+ for (auto v : arr) {
168
+ if (!first) r += ',';
169
+ first = false;
170
+ r += canonical_json(v);
171
+ }
172
+ r += ']';
173
+ return r;
174
+ }
175
+ default:
176
+ return std::string(minify(el));
177
+ }
178
+ }
179
+
136
180
  // Forward declarations
137
181
  struct schema_node;
138
182
  using schema_node_ptr = std::shared_ptr<schema_node>;
@@ -153,7 +197,7 @@ struct schema_node {
153
197
  std::optional<uint64_t> min_length;
154
198
  std::optional<uint64_t> max_length;
155
199
  std::optional<std::string> pattern;
156
- std::shared_ptr<std::regex> compiled_pattern; // cached compiled regex
200
+ std::shared_ptr<re2::RE2> compiled_pattern; // cached compiled regex (RE2)
157
201
 
158
202
  // array
159
203
  std::optional<uint64_t> min_items;
@@ -161,6 +205,9 @@ struct schema_node {
161
205
  bool unique_items = false;
162
206
  schema_node_ptr items_schema;
163
207
  std::vector<schema_node_ptr> prefix_items;
208
+ schema_node_ptr contains_schema;
209
+ std::optional<uint64_t> min_contains;
210
+ std::optional<uint64_t> max_contains;
164
211
 
165
212
  // object
166
213
  std::unordered_map<std::string, schema_node_ptr> properties;
@@ -169,9 +216,17 @@ struct schema_node {
169
216
  schema_node_ptr additional_properties_schema;
170
217
  std::optional<uint64_t> min_properties;
171
218
  std::optional<uint64_t> max_properties;
172
-
173
- // patternProperties
174
- std::vector<std::pair<std::string, schema_node_ptr>> pattern_properties;
219
+ schema_node_ptr property_names_schema;
220
+ std::unordered_map<std::string, std::vector<std::string>> dependent_required;
221
+ std::unordered_map<std::string, schema_node_ptr> dependent_schemas;
222
+
223
+ // patternProperties — each entry: (pattern_string, schema, compiled_regex)
224
+ struct pattern_prop {
225
+ std::string pattern;
226
+ schema_node_ptr schema;
227
+ std::shared_ptr<re2::RE2> compiled;
228
+ };
229
+ std::vector<pattern_prop> pattern_properties;
175
230
 
176
231
  // enum / const
177
232
  std::optional<std::string> enum_values_raw; // raw JSON array string
@@ -195,16 +250,56 @@ struct schema_node {
195
250
  // $ref
196
251
  std::string ref;
197
252
 
253
+ // $defs — stored on node for pointer navigation
254
+ std::unordered_map<std::string, schema_node_ptr> defs;
255
+
198
256
  // boolean schema
199
257
  std::optional<bool> boolean_schema;
200
258
  };
201
259
 
260
+ // --- Codegen: flat bytecode plan ---
261
+ namespace cg {
262
+ enum class op : uint8_t {
263
+ END=0, EXPECT_OBJECT, EXPECT_ARRAY, EXPECT_STRING, EXPECT_NUMBER,
264
+ EXPECT_INTEGER, EXPECT_BOOLEAN, EXPECT_NULL, EXPECT_TYPE_MULTI,
265
+ CHECK_MINIMUM, CHECK_MAXIMUM, CHECK_EX_MINIMUM, CHECK_EX_MAXIMUM,
266
+ CHECK_MULTIPLE_OF, CHECK_MIN_LENGTH, CHECK_MAX_LENGTH, CHECK_PATTERN,
267
+ CHECK_FORMAT, CHECK_MIN_ITEMS, CHECK_MAX_ITEMS, CHECK_UNIQUE_ITEMS,
268
+ ARRAY_ITEMS, CHECK_REQUIRED, CHECK_MIN_PROPS, CHECK_MAX_PROPS,
269
+ OBJ_PROPS_START, OBJ_PROP, OBJ_PROPS_END, CHECK_NO_ADDITIONAL,
270
+ CHECK_ENUM_STR, CHECK_ENUM, CHECK_CONST, COMPOSITION,
271
+ };
272
+ struct ins { op o; uint32_t a=0, b=0; };
273
+ struct plan {
274
+ std::vector<ins> code;
275
+ std::vector<double> doubles;
276
+ std::vector<std::string> strings;
277
+ std::vector<std::shared_ptr<re2::RE2>> regexes;
278
+ std::vector<std::vector<std::string>> enum_sets;
279
+ std::vector<std::vector<std::string>> type_sets;
280
+ std::vector<uint8_t> format_ids;
281
+ std::vector<std::vector<ins>> subs;
282
+ };
283
+ } // namespace cg
284
+
202
285
  struct compiled_schema {
203
286
  schema_node_ptr root;
204
287
  std::unordered_map<std::string, schema_node_ptr> defs;
205
288
  std::string raw_schema;
206
- dom::parser parser;
207
- dom::parser doc_parser; // reusable parser for document validation
289
+ dom::parser parser; // used only at compile time
290
+ cg::plan gen_plan; // codegen validation plan
291
+ bool use_ondemand = false; // true if codegen plan supports On Demand
292
+ };
293
+
294
+ // Thread-local persistent parsers — reused across all validate calls on the
295
+ // same thread. Keeps internal buffers hot in cache and avoids re-allocation.
296
+ static dom::parser& tl_dom_parser() {
297
+ thread_local dom::parser p;
298
+ return p;
299
+ }
300
+ static simdjson::ondemand::parser& tl_od_parser() {
301
+ thread_local simdjson::ondemand::parser p;
302
+ return p;
208
303
  };
209
304
 
210
305
  // --- Schema compilation ---
@@ -218,7 +313,9 @@ static schema_node_ptr compile_node(dom::element el,
218
313
 
219
314
  // Boolean schema
220
315
  if (el.is<bool>()) {
221
- node->boolean_schema = bool(el);
316
+ bool bval;
317
+ el.get(bval);
318
+ node->boolean_schema = bval;
222
319
  return node;
223
320
  }
224
321
 
@@ -226,7 +323,8 @@ static schema_node_ptr compile_node(dom::element el,
226
323
  return node;
227
324
  }
228
325
 
229
- auto obj = dom::object(el);
326
+ dom::object obj;
327
+ el.get(obj);
230
328
 
231
329
  // $ref
232
330
  dom::element ref_el;
@@ -245,7 +343,7 @@ static schema_node_ptr compile_node(dom::element el,
245
343
  type_el.get(sv);
246
344
  node->types.emplace_back(sv);
247
345
  } else if (type_el.is<dom::array>()) {
248
- for (auto t : dom::array(type_el)) {
346
+ dom::array type_arr; type_el.get(type_arr); for (auto t : type_arr) {
249
347
  std::string_view sv;
250
348
  if (t.get(sv) == SUCCESS) {
251
349
  node->types.emplace_back(sv);
@@ -291,11 +389,9 @@ static schema_node_ptr compile_node(dom::element el,
291
389
  std::string_view sv;
292
390
  if (str_el.get(sv) == SUCCESS) {
293
391
  node->pattern = std::string(sv);
294
- try {
295
- node->compiled_pattern =
296
- std::make_shared<std::regex>(node->pattern.value());
297
- } catch (...) {
298
- // Invalid regex — leave compiled_pattern null
392
+ auto re = std::make_shared<re2::RE2>(node->pattern.value());
393
+ if (re->ok()) {
394
+ node->compiled_pattern = std::move(re);
299
395
  }
300
396
  }
301
397
  }
@@ -317,7 +413,7 @@ static schema_node_ptr compile_node(dom::element el,
317
413
  // prefixItems (Draft 2020-12)
318
414
  dom::element pi_el;
319
415
  if (obj["prefixItems"].get(pi_el) == SUCCESS && pi_el.is<dom::array>()) {
320
- for (auto item : dom::array(pi_el)) {
416
+ dom::array pi_arr; pi_el.get(pi_arr); for (auto item : pi_arr) {
321
417
  node->prefix_items.push_back(compile_node(item, ctx));
322
418
  }
323
419
  }
@@ -327,17 +423,32 @@ static schema_node_ptr compile_node(dom::element el,
327
423
  node->items_schema = compile_node(items_el, ctx);
328
424
  }
329
425
 
426
+ // contains
427
+ dom::element contains_el;
428
+ if (obj["contains"].get(contains_el) == SUCCESS) {
429
+ node->contains_schema = compile_node(contains_el, ctx);
430
+ }
431
+ dom::element mc_el;
432
+ if (obj["minContains"].get(mc_el) == SUCCESS) {
433
+ uint64_t v;
434
+ if (mc_el.get(v) == SUCCESS) node->min_contains = v;
435
+ }
436
+ if (obj["maxContains"].get(mc_el) == SUCCESS) {
437
+ uint64_t v;
438
+ if (mc_el.get(v) == SUCCESS) node->max_contains = v;
439
+ }
440
+
330
441
  // object constraints
331
442
  dom::element props_el;
332
443
  if (obj["properties"].get(props_el) == SUCCESS && props_el.is<dom::object>()) {
333
- for (auto [key, val] : dom::object(props_el)) {
444
+ dom::object props_obj; props_el.get(props_obj); for (auto [key, val] : props_obj) {
334
445
  node->properties[std::string(key)] = compile_node(val, ctx);
335
446
  }
336
447
  }
337
448
 
338
449
  dom::element req_el;
339
450
  if (obj["required"].get(req_el) == SUCCESS && req_el.is<dom::array>()) {
340
- for (auto r : dom::array(req_el)) {
451
+ dom::array req_arr; req_el.get(req_arr); for (auto r : req_arr) {
341
452
  std::string_view sv;
342
453
  if (r.get(sv) == SUCCESS) {
343
454
  node->required.emplace_back(sv);
@@ -348,7 +459,7 @@ static schema_node_ptr compile_node(dom::element el,
348
459
  dom::element ap_el;
349
460
  if (obj["additionalProperties"].get(ap_el) == SUCCESS) {
350
461
  if (ap_el.is<bool>()) {
351
- node->additional_properties_bool = bool(ap_el);
462
+ bool ap_bool; ap_el.get(ap_bool); node->additional_properties_bool = ap_bool;
352
463
  } else {
353
464
  node->additional_properties_schema = compile_node(ap_el, ctx);
354
465
  }
@@ -363,13 +474,51 @@ static schema_node_ptr compile_node(dom::element el,
363
474
  if (str_el.get(v) == SUCCESS) node->max_properties = v;
364
475
  }
365
476
 
366
- // patternProperties
477
+ // propertyNames
478
+ dom::element pn_el;
479
+ if (obj["propertyNames"].get(pn_el) == SUCCESS) {
480
+ node->property_names_schema = compile_node(pn_el, ctx);
481
+ }
482
+
483
+ // dependentRequired
484
+ dom::element dr_el;
485
+ if (obj["dependentRequired"].get(dr_el) == SUCCESS &&
486
+ dr_el.is<dom::object>()) {
487
+ dom::object dr_obj; dr_el.get(dr_obj); for (auto [key, val] : dr_obj) {
488
+ std::vector<std::string> deps;
489
+ if (val.is<dom::array>()) {
490
+ dom::array val_arr; val.get(val_arr); for (auto d : val_arr) {
491
+ std::string_view sv;
492
+ if (d.get(sv) == SUCCESS) deps.emplace_back(sv);
493
+ }
494
+ }
495
+ node->dependent_required[std::string(key)] = std::move(deps);
496
+ }
497
+ }
498
+
499
+ // dependentSchemas
500
+ dom::element ds_el;
501
+ if (obj["dependentSchemas"].get(ds_el) == SUCCESS &&
502
+ ds_el.is<dom::object>()) {
503
+ dom::object ds_obj; ds_el.get(ds_obj); for (auto [key, val] : ds_obj) {
504
+ node->dependent_schemas[std::string(key)] = compile_node(val, ctx);
505
+ }
506
+ }
507
+
508
+ // patternProperties — compile regex at schema compile time
367
509
  dom::element pp_el;
368
510
  if (obj["patternProperties"].get(pp_el) == SUCCESS &&
369
511
  pp_el.is<dom::object>()) {
370
- for (auto [key, val] : dom::object(pp_el)) {
371
- node->pattern_properties.emplace_back(std::string(key),
372
- compile_node(val, ctx));
512
+ dom::object pp_obj; pp_el.get(pp_obj);
513
+ for (auto [key, val] : pp_obj) {
514
+ schema_node::pattern_prop pp;
515
+ pp.pattern = std::string(key);
516
+ pp.schema = compile_node(val, ctx);
517
+ auto re = std::make_shared<re2::RE2>(pp.pattern);
518
+ if (re->ok()) {
519
+ pp.compiled = std::move(re);
520
+ }
521
+ node->pattern_properties.push_back(std::move(pp));
373
522
  }
374
523
  }
375
524
 
@@ -392,10 +541,10 @@ static schema_node_ptr compile_node(dom::element el,
392
541
  // enum — pre-minify each value at compile time
393
542
  dom::element enum_el;
394
543
  if (obj["enum"].get(enum_el) == SUCCESS) {
395
- node->enum_values_raw = std::string(minify(enum_el));
544
+ node->enum_values_raw = canonical_json(enum_el);
396
545
  if (enum_el.is<dom::array>()) {
397
- for (auto e : dom::array(enum_el)) {
398
- node->enum_values_minified.push_back(std::string(minify(e)));
546
+ dom::array enum_arr; enum_el.get(enum_arr); for (auto e : enum_arr) {
547
+ node->enum_values_minified.push_back(canonical_json(e));
399
548
  }
400
549
  }
401
550
  }
@@ -403,23 +552,26 @@ static schema_node_ptr compile_node(dom::element el,
403
552
  // const
404
553
  dom::element const_el;
405
554
  if (obj["const"].get(const_el) == SUCCESS) {
406
- node->const_value_raw = std::string(minify(const_el));
555
+ node->const_value_raw = canonical_json(const_el);
407
556
  }
408
557
 
409
558
  // composition
410
559
  dom::element comp_el;
411
560
  if (obj["allOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
412
- for (auto s : dom::array(comp_el)) {
561
+ dom::array comp_arr; comp_el.get(comp_arr);
562
+ for (auto s : comp_arr) {
413
563
  node->all_of.push_back(compile_node(s, ctx));
414
564
  }
415
565
  }
416
566
  if (obj["anyOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
417
- for (auto s : dom::array(comp_el)) {
567
+ dom::array comp_arr2; comp_el.get(comp_arr2);
568
+ for (auto s : comp_arr2) {
418
569
  node->any_of.push_back(compile_node(s, ctx));
419
570
  }
420
571
  }
421
572
  if (obj["oneOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
422
- for (auto s : dom::array(comp_el)) {
573
+ dom::array comp_arr3; comp_el.get(comp_arr3);
574
+ for (auto s : comp_arr3) {
423
575
  node->one_of.push_back(compile_node(s, ctx));
424
576
  }
425
577
  }
@@ -445,16 +597,20 @@ static schema_node_ptr compile_node(dom::element el,
445
597
  // $defs / definitions
446
598
  dom::element defs_el;
447
599
  if (obj["$defs"].get(defs_el) == SUCCESS && defs_el.is<dom::object>()) {
448
- for (auto [key, val] : dom::object(defs_el)) {
600
+ dom::object defs_obj; defs_el.get(defs_obj); for (auto [key, val] : defs_obj) {
449
601
  std::string def_path = "#/$defs/" + std::string(key);
450
- ctx.defs[def_path] = compile_node(val, ctx);
602
+ auto compiled = compile_node(val, ctx);
603
+ ctx.defs[def_path] = compiled;
604
+ node->defs[std::string(key)] = compiled;
451
605
  }
452
606
  }
453
607
  if (obj["definitions"].get(defs_el) == SUCCESS &&
454
608
  defs_el.is<dom::object>()) {
455
- for (auto [key, val] : dom::object(defs_el)) {
609
+ dom::object defs_obj; defs_el.get(defs_obj); for (auto [key, val] : defs_obj) {
456
610
  std::string def_path = "#/definitions/" + std::string(key);
457
- ctx.defs[def_path] = compile_node(val, ctx);
611
+ auto compiled = compile_node(val, ctx);
612
+ ctx.defs[def_path] = compiled;
613
+ node->defs[std::string(key)] = compiled;
458
614
  }
459
615
  }
460
616
 
@@ -538,79 +694,106 @@ static void validate_node(const schema_node_ptr& node,
538
694
  return;
539
695
  }
540
696
 
541
- // $ref
697
+ // $ref — Draft 2020-12: $ref is not a short-circuit, sibling keywords still apply
698
+ bool ref_resolved = false;
542
699
  if (!node->ref.empty()) {
543
700
  // First check defs map
544
701
  auto it = ctx.defs.find(node->ref);
545
702
  if (it != ctx.defs.end()) {
546
703
  validate_node(it->second, value, path, ctx, errors, all_errors);
547
- return;
704
+ ref_resolved = true;
548
705
  }
549
706
  // Try JSON Pointer resolution from root (e.g., "#/properties/foo")
550
707
  if (node->ref.size() > 1 && node->ref[0] == '#' &&
551
708
  node->ref[1] == '/') {
552
- // Walk the schema tree following the pointer
553
- std::string pointer = node->ref.substr(2);
554
- schema_node_ptr current = ctx.root;
555
- bool resolved = true;
556
- size_t pos = 0;
557
- while (pos < pointer.size() && current) {
558
- size_t next = pointer.find('/', pos);
559
- std::string segment =
560
- pointer.substr(pos, next == std::string::npos ? next : next - pos);
561
- // Unescape JSON Pointer: ~1 -> /, ~0 -> ~
562
- std::string key;
563
- for (size_t i = 0; i < segment.size(); ++i) {
564
- if (segment[i] == '~' && i + 1 < segment.size()) {
565
- if (segment[i + 1] == '1') { key += '/'; ++i; }
566
- else if (segment[i + 1] == '0') { key += '~'; ++i; }
567
- else key += segment[i];
709
+ // Decode JSON Pointer segments
710
+ auto decode_pointer_segment = [](const std::string& seg) -> std::string {
711
+ // Percent-decode first
712
+ std::string pct;
713
+ for (size_t i = 0; i < seg.size(); ++i) {
714
+ if (seg[i] == '%' && i + 2 < seg.size()) {
715
+ char h = seg[i+1], l = seg[i+2];
716
+ auto hex = [](char c) -> int {
717
+ if (c >= '0' && c <= '9') return c - '0';
718
+ if (c >= 'a' && c <= 'f') return 10 + c - 'a';
719
+ if (c >= 'A' && c <= 'F') return 10 + c - 'A';
720
+ return -1;
721
+ };
722
+ int hv = hex(h), lv = hex(l);
723
+ if (hv >= 0 && lv >= 0) {
724
+ pct += static_cast<char>(hv * 16 + lv);
725
+ i += 2;
726
+ } else {
727
+ pct += seg[i];
728
+ }
568
729
  } else {
569
- key += segment[i];
730
+ pct += seg[i];
570
731
  }
571
732
  }
572
- // Navigate the compiled schema tree
573
- if (key == "properties" && !current->properties.empty()) {
574
- // Next segment is the property name
575
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
576
- next = pointer.find('/', pos);
577
- std::string prop_name = pointer.substr(
578
- pos, next == std::string::npos ? next : next - pos);
733
+ // Then JSON Pointer unescape: ~1 -> /, ~0 -> ~
734
+ std::string out;
735
+ for (size_t i = 0; i < pct.size(); ++i) {
736
+ if (pct[i] == '~' && i + 1 < pct.size()) {
737
+ if (pct[i + 1] == '1') { out += '/'; ++i; }
738
+ else if (pct[i + 1] == '0') { out += '~'; ++i; }
739
+ else out += pct[i];
740
+ } else {
741
+ out += pct[i];
742
+ }
743
+ }
744
+ return out;
745
+ };
746
+
747
+ // Split pointer into segments
748
+ std::string pointer = node->ref.substr(2);
749
+ std::vector<std::string> segments;
750
+ size_t spos = 0;
751
+ while (spos < pointer.size()) {
752
+ size_t snext = pointer.find('/', spos);
753
+ segments.push_back(decode_pointer_segment(
754
+ pointer.substr(spos, snext == std::string::npos ? snext : snext - spos)));
755
+ spos = (snext == std::string::npos) ? pointer.size() : snext + 1;
756
+ }
757
+
758
+ // Walk the schema tree
759
+ schema_node_ptr current = ctx.root;
760
+ bool resolved = true;
761
+ for (size_t si = 0; si < segments.size() && current; ++si) {
762
+ const auto& key = segments[si];
763
+
764
+ if (key == "properties" && si + 1 < segments.size()) {
765
+ const auto& prop_name = segments[++si];
579
766
  auto pit = current->properties.find(prop_name);
580
767
  if (pit != current->properties.end()) {
581
768
  current = pit->second;
582
- } else {
583
- resolved = false; break;
584
- }
769
+ } else { resolved = false; break; }
585
770
  } else if (key == "items" && current->items_schema) {
586
771
  current = current->items_schema;
587
772
  } else if (key == "$defs" || key == "definitions") {
588
- // Next segment is the def name — already in ctx.defs
589
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
590
- next = pointer.find('/', pos);
591
- std::string def_name = pointer.substr(
592
- pos, next == std::string::npos ? next : next - pos);
593
- std::string full_ref = "#/" + key + "/" + def_name;
594
- auto dit = ctx.defs.find(full_ref);
595
- if (dit != ctx.defs.end()) {
596
- current = dit->second;
597
- } else {
598
- resolved = false; break;
599
- }
773
+ if (si + 1 < segments.size()) {
774
+ const auto& def_name = segments[++si];
775
+ // Navigate into node's defs map
776
+ auto dit = current->defs.find(def_name);
777
+ if (dit != current->defs.end()) {
778
+ current = dit->second;
779
+ } else {
780
+ // Fallback: try ctx.defs with full path
781
+ std::string full_ref = "#/" + key + "/" + def_name;
782
+ auto cit = ctx.defs.find(full_ref);
783
+ if (cit != ctx.defs.end()) {
784
+ current = cit->second;
785
+ } else { resolved = false; break; }
786
+ }
787
+ } else { resolved = false; break; }
600
788
  } else if (key == "allOf" || key == "anyOf" || key == "oneOf") {
601
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
602
- next = pointer.find('/', pos);
603
- std::string idx_str = pointer.substr(
604
- pos, next == std::string::npos ? next : next - pos);
605
- size_t idx = std::stoul(idx_str);
606
- auto& vec = (key == "allOf") ? current->all_of
607
- : (key == "anyOf") ? current->any_of
608
- : current->one_of;
609
- if (idx < vec.size()) {
610
- current = vec[idx];
611
- } else {
612
- resolved = false; break;
613
- }
789
+ if (si + 1 < segments.size()) {
790
+ size_t idx = std::stoul(segments[++si]);
791
+ auto& vec = (key == "allOf") ? current->all_of
792
+ : (key == "anyOf") ? current->any_of
793
+ : current->one_of;
794
+ if (idx < vec.size()) { current = vec[idx]; }
795
+ else { resolved = false; break; }
796
+ } else { resolved = false; break; }
614
797
  } else if (key == "not" && current->not_schema) {
615
798
  current = current->not_schema;
616
799
  } else if (key == "if" && current->if_schema) {
@@ -623,34 +806,29 @@ static void validate_node(const schema_node_ptr& node,
623
806
  current->additional_properties_schema) {
624
807
  current = current->additional_properties_schema;
625
808
  } else if (key == "prefixItems") {
626
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
627
- next = pointer.find('/', pos);
628
- std::string idx_str = pointer.substr(
629
- pos, next == std::string::npos ? next : next - pos);
630
- size_t idx = std::stoul(idx_str);
631
- if (idx < current->prefix_items.size()) {
632
- current = current->prefix_items[idx];
633
- } else {
634
- resolved = false; break;
635
- }
809
+ if (si + 1 < segments.size()) {
810
+ size_t idx = std::stoul(segments[++si]);
811
+ if (idx < current->prefix_items.size()) { current = current->prefix_items[idx]; }
812
+ else { resolved = false; break; }
813
+ } else { resolved = false; break; }
636
814
  } else {
637
815
  resolved = false; break;
638
816
  }
639
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
640
817
  }
641
818
  if (resolved && current) {
642
819
  validate_node(current, value, path, ctx, errors, all_errors);
643
- return;
820
+ ref_resolved = true;
644
821
  }
645
822
  }
646
823
  // Self-reference: "#"
647
- if (node->ref == "#" && ctx.root) {
824
+ if (!ref_resolved && node->ref == "#" && ctx.root) {
648
825
  validate_node(ctx.root, value, path, ctx, errors, all_errors);
649
- return;
826
+ ref_resolved = true;
827
+ }
828
+ if (!ref_resolved) {
829
+ errors.push_back({error_code::ref_not_found, path,
830
+ "cannot resolve $ref: " + node->ref});
650
831
  }
651
- errors.push_back({error_code::ref_not_found, path,
652
- "cannot resolve $ref: " + node->ref});
653
- return;
654
832
  }
655
833
 
656
834
  // type
@@ -676,7 +854,7 @@ static void validate_node(const schema_node_ptr& node,
676
854
 
677
855
  // enum — use pre-minified values (no re-parsing)
678
856
  if (!node->enum_values_minified.empty()) {
679
- std::string val_str = std::string(minify(value));
857
+ std::string val_str = canonical_json(value);
680
858
  bool found = false;
681
859
  for (const auto& ev : node->enum_values_minified) {
682
860
  if (ev == val_str) {
@@ -692,7 +870,7 @@ static void validate_node(const schema_node_ptr& node,
692
870
 
693
871
  // const
694
872
  if (node->const_value_raw.has_value()) {
695
- std::string val_str = std::string(minify(value));
873
+ std::string val_str = canonical_json(value);
696
874
  if (val_str != node->const_value_raw.value()) {
697
875
  errors.push_back({error_code::const_mismatch, path,
698
876
  "value does not match const"});
@@ -758,7 +936,7 @@ static void validate_node(const schema_node_ptr& node,
758
936
  std::to_string(node->max_length.value())});
759
937
  }
760
938
  if (node->compiled_pattern) {
761
- if (!std::regex_search(sv.begin(), sv.end(), *node->compiled_pattern)) {
939
+ if (!re2::RE2::PartialMatch(re2::StringPiece(sv.data(), sv.size()), *node->compiled_pattern)) {
762
940
  errors.push_back({error_code::pattern_mismatch, path,
763
941
  "string does not match pattern: " +
764
942
  node->pattern.value()});
@@ -776,7 +954,7 @@ static void validate_node(const schema_node_ptr& node,
776
954
 
777
955
  // Array validations
778
956
  if (actual_type == "array" && value.is<dom::array>()) {
779
- auto arr = dom::array(value);
957
+ dom::array arr; value.get(arr);
780
958
  uint64_t arr_size = 0;
781
959
  for ([[maybe_unused]] auto _ : arr) ++arr_size;
782
960
 
@@ -797,7 +975,7 @@ static void validate_node(const schema_node_ptr& node,
797
975
  std::set<std::string> seen;
798
976
  bool has_dup = false;
799
977
  for (auto item : arr) {
800
- auto s = std::string(minify(item));
978
+ auto s = canonical_json(item);
801
979
  if (!seen.insert(s).second) {
802
980
  has_dup = true;
803
981
  break;
@@ -815,19 +993,41 @@ static void validate_node(const schema_node_ptr& node,
815
993
  for (auto item : arr) {
816
994
  if (idx < node->prefix_items.size()) {
817
995
  validate_node(node->prefix_items[idx], item,
818
- path + "/" + std::to_string(idx), ctx, errors);
996
+ path + "/" + std::to_string(idx), ctx, errors, all_errors);
819
997
  } else if (node->items_schema) {
820
998
  validate_node(node->items_schema, item,
821
- path + "/" + std::to_string(idx), ctx, errors);
999
+ path + "/" + std::to_string(idx), ctx, errors, all_errors);
822
1000
  }
823
1001
  ++idx;
824
1002
  }
825
1003
  }
1004
+
1005
+ // contains / minContains / maxContains
1006
+ if (node->contains_schema) {
1007
+ uint64_t match_count = 0;
1008
+ for (auto item : arr) {
1009
+ std::vector<validation_error> tmp;
1010
+ validate_node(node->contains_schema, item, path, ctx, tmp, false);
1011
+ if (tmp.empty()) ++match_count;
1012
+ }
1013
+ uint64_t min_c = node->min_contains.value_or(1);
1014
+ uint64_t max_c = node->max_contains.value_or(arr_size);
1015
+ if (match_count < min_c) {
1016
+ errors.push_back({error_code::min_items_violation, path,
1017
+ "contains: " + std::to_string(match_count) +
1018
+ " matches, minimum " + std::to_string(min_c)});
1019
+ }
1020
+ if (match_count > max_c) {
1021
+ errors.push_back({error_code::max_items_violation, path,
1022
+ "contains: " + std::to_string(match_count) +
1023
+ " matches, maximum " + std::to_string(max_c)});
1024
+ }
1025
+ }
826
1026
  }
827
1027
 
828
1028
  // Object validations
829
1029
  if (actual_type == "object" && value.is<dom::object>()) {
830
- auto obj = dom::object(value);
1030
+ dom::object obj; value.get(obj);
831
1031
  uint64_t prop_count = 0;
832
1032
  for ([[maybe_unused]] auto _ : obj) ++prop_count;
833
1033
 
@@ -867,15 +1067,11 @@ static void validate_node(const schema_node_ptr& node,
867
1067
  matched = true;
868
1068
  }
869
1069
 
870
- // Check patternProperties
871
- for (const auto& [pat, pat_schema] : node->pattern_properties) {
872
- try {
873
- std::regex re(pat);
874
- if (std::regex_search(key_str, re)) {
875
- validate_node(pat_schema, val, path + "/" + key_str, ctx, errors, all_errors);
876
- matched = true;
877
- }
878
- } catch (...) {
1070
+ // Check patternProperties (use cached compiled regex)
1071
+ for (const auto& pp : node->pattern_properties) {
1072
+ if (pp.compiled && re2::RE2::PartialMatch(key_str, *pp.compiled)) {
1073
+ validate_node(pp.schema, val, path + "/" + key_str, ctx, errors, all_errors);
1074
+ matched = true;
879
1075
  }
880
1076
  }
881
1077
 
@@ -892,6 +1088,43 @@ static void validate_node(const schema_node_ptr& node,
892
1088
  }
893
1089
  }
894
1090
  }
1091
+
1092
+ // propertyNames
1093
+ if (node->property_names_schema) {
1094
+ for (auto [key, val] : obj) {
1095
+ // Create a string element to validate the key
1096
+ std::string key_json = "\"" + std::string(key) + "\"";
1097
+ dom::parser key_parser;
1098
+ auto key_result = key_parser.parse(key_json);
1099
+ if (!key_result.error()) {
1100
+ validate_node(node->property_names_schema, key_result.value(),
1101
+ path, ctx, errors, all_errors);
1102
+ }
1103
+ }
1104
+ }
1105
+
1106
+ // dependentRequired
1107
+ for (const auto& [prop, deps] : node->dependent_required) {
1108
+ dom::element dummy;
1109
+ if (obj[prop].get(dummy) == SUCCESS) {
1110
+ for (const auto& dep : deps) {
1111
+ dom::element dep_dummy;
1112
+ if (obj[dep].get(dep_dummy) != SUCCESS) {
1113
+ errors.push_back({error_code::required_property_missing, path,
1114
+ "property '" + prop + "' requires '" + dep +
1115
+ "' to be present"});
1116
+ }
1117
+ }
1118
+ }
1119
+ }
1120
+
1121
+ // dependentSchemas
1122
+ for (const auto& [prop, schema] : node->dependent_schemas) {
1123
+ dom::element dummy;
1124
+ if (obj[prop].get(dummy) == SUCCESS) {
1125
+ validate_node(schema, value, path, ctx, errors, all_errors);
1126
+ }
1127
+ }
895
1128
  }
896
1129
 
897
1130
  // allOf
@@ -967,6 +1200,369 @@ static void validate_node(const schema_node_ptr& node,
967
1200
  }
968
1201
  }
969
1202
 
1203
+ // --- Codegen compiler ---
1204
+ static void cg_compile(const schema_node* n, cg::plan& p,
1205
+ std::vector<cg::ins>& out) {
1206
+ if (!n) return;
1207
+ if (n->boolean_schema.has_value()) {
1208
+ if (!*n->boolean_schema) out.push_back({cg::op::EXPECT_NULL});
1209
+ return;
1210
+ }
1211
+ // Composition fallback
1212
+ if (!n->ref.empty() || !n->all_of.empty() || !n->any_of.empty() ||
1213
+ !n->one_of.empty() || n->not_schema || n->if_schema) {
1214
+ uintptr_t ptr = reinterpret_cast<uintptr_t>(n);
1215
+ out.push_back({cg::op::COMPOSITION, (uint32_t)(ptr & 0xFFFFFFFF),
1216
+ (uint32_t)((ptr >> 32) & 0xFFFFFFFF)});
1217
+ return;
1218
+ }
1219
+ // Type
1220
+ if (!n->types.empty()) {
1221
+ if (n->types.size() == 1) {
1222
+ auto& t = n->types[0];
1223
+ if (t=="object") out.push_back({cg::op::EXPECT_OBJECT});
1224
+ else if (t=="array") out.push_back({cg::op::EXPECT_ARRAY});
1225
+ else if (t=="string") out.push_back({cg::op::EXPECT_STRING});
1226
+ else if (t=="number") out.push_back({cg::op::EXPECT_NUMBER});
1227
+ else if (t=="integer") out.push_back({cg::op::EXPECT_INTEGER});
1228
+ else if (t=="boolean") out.push_back({cg::op::EXPECT_BOOLEAN});
1229
+ else if (t=="null") out.push_back({cg::op::EXPECT_NULL});
1230
+ } else {
1231
+ uint32_t i = (uint32_t)p.type_sets.size();
1232
+ p.type_sets.push_back(n->types);
1233
+ out.push_back({cg::op::EXPECT_TYPE_MULTI, i});
1234
+ }
1235
+ }
1236
+ // Enum
1237
+ if (!n->enum_values_minified.empty()) {
1238
+ bool all_str = true;
1239
+ for (auto& e : n->enum_values_minified)
1240
+ if (e.empty() || e[0]!='"') { all_str=false; break; }
1241
+ uint32_t i = (uint32_t)p.enum_sets.size();
1242
+ p.enum_sets.push_back(n->enum_values_minified);
1243
+ out.push_back({all_str ? cg::op::CHECK_ENUM_STR : cg::op::CHECK_ENUM, i});
1244
+ }
1245
+ if (n->const_value_raw.has_value()) {
1246
+ uint32_t i=(uint32_t)p.strings.size();
1247
+ p.strings.push_back(*n->const_value_raw);
1248
+ out.push_back({cg::op::CHECK_CONST, i});
1249
+ }
1250
+ // Numeric
1251
+ if (n->minimum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->minimum); out.push_back({cg::op::CHECK_MINIMUM,i}); }
1252
+ if (n->maximum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->maximum); out.push_back({cg::op::CHECK_MAXIMUM,i}); }
1253
+ if (n->exclusive_minimum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->exclusive_minimum); out.push_back({cg::op::CHECK_EX_MINIMUM,i}); }
1254
+ if (n->exclusive_maximum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->exclusive_maximum); out.push_back({cg::op::CHECK_EX_MAXIMUM,i}); }
1255
+ if (n->multiple_of.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->multiple_of); out.push_back({cg::op::CHECK_MULTIPLE_OF,i}); }
1256
+ // String
1257
+ if (n->min_length.has_value()) out.push_back({cg::op::CHECK_MIN_LENGTH,(uint32_t)*n->min_length});
1258
+ if (n->max_length.has_value()) out.push_back({cg::op::CHECK_MAX_LENGTH,(uint32_t)*n->max_length});
1259
+ if (n->compiled_pattern) { uint32_t i=(uint32_t)p.regexes.size(); p.regexes.push_back(n->compiled_pattern); out.push_back({cg::op::CHECK_PATTERN,i}); }
1260
+ if (n->format.has_value()) {
1261
+ uint32_t i=(uint32_t)p.format_ids.size();
1262
+ uint8_t fid=255;
1263
+ auto& f=*n->format;
1264
+ if(f=="email")fid=0;else if(f=="date")fid=1;else if(f=="date-time")fid=2;
1265
+ else if(f=="time")fid=3;else if(f=="ipv4")fid=4;else if(f=="ipv6")fid=5;
1266
+ else if(f=="uri"||f=="uri-reference")fid=6;else if(f=="uuid")fid=7;
1267
+ else if(f=="hostname")fid=8;
1268
+ p.format_ids.push_back(fid);
1269
+ out.push_back({cg::op::CHECK_FORMAT,i});
1270
+ }
1271
+ // Array
1272
+ if (n->min_items.has_value()) out.push_back({cg::op::CHECK_MIN_ITEMS,(uint32_t)*n->min_items});
1273
+ if (n->max_items.has_value()) out.push_back({cg::op::CHECK_MAX_ITEMS,(uint32_t)*n->max_items});
1274
+ if (n->unique_items) out.push_back({cg::op::CHECK_UNIQUE_ITEMS});
1275
+ if (n->items_schema) {
1276
+ uint32_t si=(uint32_t)p.subs.size();
1277
+ p.subs.emplace_back();
1278
+ std::vector<cg::ins> sub_code;
1279
+ cg_compile(n->items_schema.get(), p, sub_code);
1280
+ sub_code.push_back({cg::op::END});
1281
+ p.subs[si] = std::move(sub_code);
1282
+ out.push_back({cg::op::ARRAY_ITEMS, si});
1283
+ }
1284
+ // Object
1285
+ for (auto& r : n->required) { uint32_t i=(uint32_t)p.strings.size(); p.strings.push_back(r); out.push_back({cg::op::CHECK_REQUIRED,i}); }
1286
+ if (n->min_properties.has_value()) out.push_back({cg::op::CHECK_MIN_PROPS,(uint32_t)*n->min_properties});
1287
+ if (n->max_properties.has_value()) out.push_back({cg::op::CHECK_MAX_PROPS,(uint32_t)*n->max_properties});
1288
+ // additional_properties_schema requires tree walker — bail out to COMPOSITION
1289
+ if (n->additional_properties_schema) {
1290
+ out.push_back({cg::op::COMPOSITION, 0, 0});
1291
+ return;
1292
+ }
1293
+ if (!n->properties.empty() || (n->additional_properties_bool.has_value() && !*n->additional_properties_bool)) {
1294
+ out.push_back({cg::op::OBJ_PROPS_START});
1295
+ if (n->additional_properties_bool.has_value() && !*n->additional_properties_bool)
1296
+ out.push_back({cg::op::CHECK_NO_ADDITIONAL});
1297
+ for (auto& [name, schema] : n->properties) {
1298
+ uint32_t ni=(uint32_t)p.strings.size(); p.strings.push_back(name);
1299
+ uint32_t si=(uint32_t)p.subs.size();
1300
+ p.subs.emplace_back();
1301
+ std::vector<cg::ins> sub_code;
1302
+ cg_compile(schema.get(), p, sub_code);
1303
+ sub_code.push_back({cg::op::END});
1304
+ p.subs[si] = std::move(sub_code);
1305
+ out.push_back({cg::op::OBJ_PROP, ni, si});
1306
+ }
1307
+ out.push_back({cg::op::OBJ_PROPS_END});
1308
+ }
1309
+ }
1310
+
1311
+ // --- Codegen executor ---
1312
+ static const char* fmt_names[]={"email","date","date-time","time","ipv4","ipv6","uri","uuid","hostname"};
1313
+
1314
+ static bool cg_exec(const cg::plan& p, const std::vector<cg::ins>& code,
1315
+ dom::element value) {
1316
+ auto t = type_of_sv(value);
1317
+ for (size_t i=0; i<code.size(); ++i) {
1318
+ auto& c = code[i];
1319
+ switch(c.o) {
1320
+ case cg::op::END: return true;
1321
+ case cg::op::EXPECT_OBJECT: if(t!="object") return false; break;
1322
+ case cg::op::EXPECT_ARRAY: if(t!="array") return false; break;
1323
+ case cg::op::EXPECT_STRING: if(t!="string") return false; break;
1324
+ case cg::op::EXPECT_NUMBER: if(t!="number"&&t!="integer") return false; break;
1325
+ case cg::op::EXPECT_INTEGER: if(t!="integer") return false; break;
1326
+ case cg::op::EXPECT_BOOLEAN: if(t!="boolean") return false; break;
1327
+ case cg::op::EXPECT_NULL: if(t!="null") return false; break;
1328
+ case cg::op::EXPECT_TYPE_MULTI: {
1329
+ auto& ts=p.type_sets[c.a]; bool m=false;
1330
+ for(auto& ty:ts){if(t==ty||(ty=="number"&&(t=="integer"||t=="number"))){m=true;break;}}
1331
+ if(!m) return false; break;
1332
+ }
1333
+ case cg::op::CHECK_MINIMUM: if(t=="integer"||t=="number"){if(to_double(value)<p.doubles[c.a])return false;} break;
1334
+ case cg::op::CHECK_MAXIMUM: if(t=="integer"||t=="number"){if(to_double(value)>p.doubles[c.a])return false;} break;
1335
+ case cg::op::CHECK_EX_MINIMUM: if(t=="integer"||t=="number"){if(to_double(value)<=p.doubles[c.a])return false;} break;
1336
+ case cg::op::CHECK_EX_MAXIMUM: if(t=="integer"||t=="number"){if(to_double(value)>=p.doubles[c.a])return false;} break;
1337
+ case cg::op::CHECK_MULTIPLE_OF: if(t=="integer"||t=="number"){double v=to_double(value),d=p.doubles[c.a],r=std::fmod(v,d);if(std::abs(r)>1e-8&&std::abs(r-d)>1e-8)return false;} break;
1338
+ case cg::op::CHECK_MIN_LENGTH: if(t=="string"){std::string_view sv;value.get(sv);if(utf8_length(sv)<c.a)return false;} break;
1339
+ case cg::op::CHECK_MAX_LENGTH: if(t=="string"){std::string_view sv;value.get(sv);if(utf8_length(sv)>c.a)return false;} break;
1340
+ case cg::op::CHECK_PATTERN: if(t=="string"){std::string_view sv;value.get(sv);if(!re2::RE2::PartialMatch(re2::StringPiece(sv.data(),sv.size()),*p.regexes[c.a]))return false;} break;
1341
+ case cg::op::CHECK_FORMAT: if(t=="string"){std::string_view sv;value.get(sv);uint8_t f=p.format_ids[c.a];if(f<9&&!check_format(sv,fmt_names[f]))return false;} break;
1342
+ case cg::op::CHECK_MIN_ITEMS: if(t=="array"){dom::array a;value.get(a);uint64_t s=0;for([[maybe_unused]]auto _:a)++s;if(s<c.a)return false;} break;
1343
+ case cg::op::CHECK_MAX_ITEMS: if(t=="array"){dom::array a;value.get(a);uint64_t s=0;for([[maybe_unused]]auto _:a)++s;if(s>c.a)return false;} break;
1344
+ case cg::op::CHECK_UNIQUE_ITEMS: if(t=="array"){dom::array a;value.get(a);std::set<std::string> seen;for(auto x:a)if(!seen.insert(canonical_json(x)).second)return false;} break;
1345
+ case cg::op::ARRAY_ITEMS: if(t=="array"){dom::array a;value.get(a);for(auto x:a)if(!cg_exec(p,p.subs[c.a],x))return false;} break;
1346
+ case cg::op::CHECK_REQUIRED: if(t=="object"){dom::object o;value.get(o);dom::element d;if(o[p.strings[c.a]].get(d)!=SUCCESS)return false;} break;
1347
+ case cg::op::CHECK_MIN_PROPS: if(t=="object"){dom::object o;value.get(o);uint64_t n=0;for([[maybe_unused]]auto _:o)++n;if(n<c.a)return false;} break;
1348
+ case cg::op::CHECK_MAX_PROPS: if(t=="object"){dom::object o;value.get(o);uint64_t n=0;for([[maybe_unused]]auto _:o)++n;if(n>c.a)return false;} break;
1349
+ case cg::op::OBJ_PROPS_START: if(t=="object"){
1350
+ dom::object o; value.get(o);
1351
+ // collect prop defs
1352
+ struct pd{std::string_view nm;uint32_t si;};
1353
+ std::vector<pd> props; bool no_add=false;
1354
+ size_t j=i+1;
1355
+ for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j){
1356
+ if(code[j].o==cg::op::OBJ_PROP) props.push_back({p.strings[code[j].a],code[j].b});
1357
+ else if(code[j].o==cg::op::CHECK_NO_ADDITIONAL) no_add=true;
1358
+ }
1359
+ for(auto [key,val]:o){
1360
+ bool matched=false;
1361
+ for(auto& pp:props){if(key==pp.nm){if(!cg_exec(p,p.subs[pp.si],val))return false;matched=true;break;}}
1362
+ if(!matched&&no_add)return false;
1363
+ }
1364
+ i=j; break;
1365
+ } else { /* skip to OBJ_PROPS_END */ size_t j=i+1; for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j); i=j; } break;
1366
+ case cg::op::OBJ_PROP: case cg::op::OBJ_PROPS_END: case cg::op::CHECK_NO_ADDITIONAL: break;
1367
+ case cg::op::CHECK_ENUM_STR: {
1368
+ auto& es=p.enum_sets[c.a]; bool f=false;
1369
+ if(t=="string"){std::string_view sv;value.get(sv);for(auto& e:es)if(e.size()==sv.size()+2&&e[0]=='"'&&e.back()=='"'&&e.compare(1,sv.size(),sv)==0){f=true;break;}}
1370
+ if(!f){std::string v=canonical_json(value);for(auto& e:es)if(e==v){f=true;break;}}
1371
+ if(!f)return false; break;
1372
+ }
1373
+ case cg::op::CHECK_ENUM: {
1374
+ auto& es=p.enum_sets[c.a]; bool f=false;
1375
+ if(t=="string"){std::string_view sv;value.get(sv);for(auto& e:es)if(e.size()==sv.size()+2&&e[0]=='"'&&e.back()=='"'&&e.compare(1,sv.size(),sv)==0){f=true;break;}}
1376
+ if(!f&&value.is<int64_t>()){int64_t v;value.get(v);auto s=std::to_string(v);for(auto& e:es)if(e==s){f=true;break;}}
1377
+ if(!f){std::string v=canonical_json(value);for(auto& e:es)if(e==v){f=true;break;}}
1378
+ if(!f)return false; break;
1379
+ }
1380
+ case cg::op::CHECK_CONST: if(canonical_json(value)!=p.strings[c.a])return false; break;
1381
+ case cg::op::COMPOSITION: return false; // fallback to tree walker
1382
+ }
1383
+ }
1384
+ return true;
1385
+ }
1386
+
1387
+ // --- On Demand fast path executor ---
1388
+ // Uses simdjson On Demand API to avoid materializing the full DOM tree.
1389
+ // Returns: true = valid, false = invalid OR unsupported (fallback to DOM).
1390
+
1391
+ static std::string_view od_type(simdjson::ondemand::value& v) {
1392
+ switch (v.type()) {
1393
+ case simdjson::ondemand::json_type::object: return "object";
1394
+ case simdjson::ondemand::json_type::array: return "array";
1395
+ case simdjson::ondemand::json_type::string: return "string";
1396
+ case simdjson::ondemand::json_type::boolean: return "boolean";
1397
+ case simdjson::ondemand::json_type::null: return "null";
1398
+ case simdjson::ondemand::json_type::number: {
1399
+ simdjson::ondemand::number_type nt;
1400
+ if (v.get_number_type().get(nt) == SUCCESS &&
1401
+ nt == simdjson::ondemand::number_type::floating_point_number)
1402
+ return "number";
1403
+ return "integer";
1404
+ }
1405
+ }
1406
+ return "unknown";
1407
+ }
1408
+
1409
+ static bool od_exec(const cg::plan& p, const std::vector<cg::ins>& code,
1410
+ simdjson::ondemand::value value) {
1411
+ auto t = od_type(value);
1412
+ for (size_t i = 0; i < code.size(); ++i) {
1413
+ auto& c = code[i];
1414
+ switch (c.o) {
1415
+ case cg::op::END: return true;
1416
+ case cg::op::EXPECT_OBJECT: if(t!="object") return false; break;
1417
+ case cg::op::EXPECT_ARRAY: if(t!="array") return false; break;
1418
+ case cg::op::EXPECT_STRING: if(t!="string") return false; break;
1419
+ case cg::op::EXPECT_NUMBER: if(t!="number"&&t!="integer") return false; break;
1420
+ case cg::op::EXPECT_INTEGER: if(t!="integer") return false; break;
1421
+ case cg::op::EXPECT_BOOLEAN: if(t!="boolean") return false; break;
1422
+ case cg::op::EXPECT_NULL: if(t!="null") return false; break;
1423
+ case cg::op::EXPECT_TYPE_MULTI: {
1424
+ auto& ts=p.type_sets[c.a]; bool m=false;
1425
+ for(auto& ty:ts){if(t==ty||(ty=="number"&&(t=="integer"||t=="number"))){m=true;break;}}
1426
+ if(!m) return false; break;
1427
+ }
1428
+ case cg::op::CHECK_MINIMUM:
1429
+ case cg::op::CHECK_MAXIMUM:
1430
+ case cg::op::CHECK_EX_MINIMUM:
1431
+ case cg::op::CHECK_EX_MAXIMUM:
1432
+ case cg::op::CHECK_MULTIPLE_OF: {
1433
+ if (t=="integer"||t=="number") {
1434
+ double v;
1435
+ if (t=="integer") { int64_t iv; if(value.get(iv)!=SUCCESS) return false; v=(double)iv; }
1436
+ else { if(value.get(v)!=SUCCESS) return false; }
1437
+ double d=p.doubles[c.a];
1438
+ if(c.o==cg::op::CHECK_MINIMUM && v<d) return false;
1439
+ if(c.o==cg::op::CHECK_MAXIMUM && v>d) return false;
1440
+ if(c.o==cg::op::CHECK_EX_MINIMUM && v<=d) return false;
1441
+ if(c.o==cg::op::CHECK_EX_MAXIMUM && v>=d) return false;
1442
+ if(c.o==cg::op::CHECK_MULTIPLE_OF){double r=std::fmod(v,d);if(std::abs(r)>1e-8&&std::abs(r-d)>1e-8)return false;}
1443
+ }
1444
+ break;
1445
+ }
1446
+ case cg::op::CHECK_MIN_LENGTH: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(utf8_length(sv)<c.a) return false;} break;
1447
+ case cg::op::CHECK_MAX_LENGTH: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(utf8_length(sv)>c.a) return false;} break;
1448
+ case cg::op::CHECK_PATTERN: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(!re2::RE2::PartialMatch(re2::StringPiece(sv.data(),sv.size()),*p.regexes[c.a]))return false;} break;
1449
+ case cg::op::CHECK_FORMAT: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; uint8_t f=p.format_ids[c.a]; if(f<9&&!check_format(sv,fmt_names[f]))return false;} break;
1450
+ case cg::op::CHECK_MIN_ITEMS: if(t=="array"){
1451
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1452
+ uint64_t s=0; for(auto x:a){(void)x;++s;} if(s<c.a) return false;
1453
+ } break;
1454
+ case cg::op::CHECK_MAX_ITEMS: if(t=="array"){
1455
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1456
+ uint64_t s=0; for(auto x:a){(void)x;++s;} if(s>c.a) return false;
1457
+ } break;
1458
+ case cg::op::ARRAY_ITEMS: if(t=="array"){
1459
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1460
+ for(auto elem:a){
1461
+ simdjson::ondemand::value v; if(elem.get(v)!=SUCCESS) return false;
1462
+ if(!od_exec(p,p.subs[c.a],v)) return false;
1463
+ }
1464
+ } break;
1465
+ case cg::op::CHECK_REQUIRED: if(t=="object"){
1466
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1467
+ auto f = o.find_field_unordered(p.strings[c.a]);
1468
+ if(f.error()) return false;
1469
+ } break;
1470
+ case cg::op::CHECK_MIN_PROPS: if(t=="object"){
1471
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1472
+ uint64_t n=0; for(auto f:o){(void)f;++n;} if(n<c.a) return false;
1473
+ } break;
1474
+ case cg::op::CHECK_MAX_PROPS: if(t=="object"){
1475
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1476
+ uint64_t n=0; for(auto f:o){(void)f;++n;} if(n>c.a) return false;
1477
+ } break;
1478
+ case cg::op::OBJ_PROPS_START: if(t=="object"){
1479
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1480
+ struct pd{std::string_view nm;uint32_t si;};
1481
+ std::vector<pd> props; bool no_add=false;
1482
+ size_t j=i+1;
1483
+ for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j){
1484
+ if(code[j].o==cg::op::OBJ_PROP) props.push_back({p.strings[code[j].a],code[j].b});
1485
+ else if(code[j].o==cg::op::CHECK_NO_ADDITIONAL) no_add=true;
1486
+ }
1487
+ for(auto field:o){
1488
+ simdjson::ondemand::raw_json_string rk; if(field.key().get(rk)!=SUCCESS) return false;
1489
+ std::string_view key = field.unescaped_key();
1490
+ bool matched=false;
1491
+ for(auto& pp:props){
1492
+ if(key==pp.nm){
1493
+ simdjson::ondemand::value fv; if(field.value().get(fv)!=SUCCESS) return false;
1494
+ if(!od_exec(p,p.subs[pp.si],fv)) return false;
1495
+ matched=true; break;
1496
+ }
1497
+ }
1498
+ if(!matched&&no_add) return false;
1499
+ }
1500
+ i=j; break;
1501
+ } else { size_t j=i+1; for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j); i=j; } break;
1502
+ case cg::op::OBJ_PROP: case cg::op::OBJ_PROPS_END: case cg::op::CHECK_NO_ADDITIONAL: break;
1503
+
1504
+ // These require full materialization — bail to DOM path
1505
+ case cg::op::CHECK_UNIQUE_ITEMS:
1506
+ case cg::op::CHECK_ENUM_STR:
1507
+ case cg::op::CHECK_ENUM:
1508
+ case cg::op::CHECK_CONST:
1509
+ case cg::op::COMPOSITION:
1510
+ return false;
1511
+ }
1512
+ }
1513
+ return true;
1514
+ }
1515
+
1516
+ // Determine if a codegen plan can use On Demand (no enum/const/uniqueItems)
1517
+ static bool plan_supports_ondemand(const cg::plan& p) {
1518
+ for (auto& c : p.code) {
1519
+ if (c.o == cg::op::CHECK_UNIQUE_ITEMS || c.o == cg::op::CHECK_ENUM_STR ||
1520
+ c.o == cg::op::CHECK_ENUM || c.o == cg::op::CHECK_CONST ||
1521
+ c.o == cg::op::COMPOSITION)
1522
+ return false;
1523
+ }
1524
+ // Also check sub-plans
1525
+ for (auto& sub : p.subs) {
1526
+ for (auto& c : sub) {
1527
+ if (c.o == cg::op::CHECK_UNIQUE_ITEMS || c.o == cg::op::CHECK_ENUM_STR ||
1528
+ c.o == cg::op::CHECK_ENUM || c.o == cg::op::CHECK_CONST ||
1529
+ c.o == cg::op::COMPOSITION)
1530
+ return false;
1531
+ }
1532
+ }
1533
+ return true;
1534
+ }
1535
+
1536
+ // Free padding: check if buffer is near a page boundary
1537
+ // On modern systems, pages are at least 4096 bytes. If we're far enough
1538
+ // from the end of a page, we can read 64 bytes beyond without a fault.
1539
// Return the memory page size in bytes, queried once and cached.
//
// If the OS query fails or reports a non-positive size, 4096 is returned
// instead. A raw -1 from sysconf would otherwise become a huge modulus in the
// page-boundary check and make it never report a boundary; 4096 errs on the
// side of copying, because every boundary of a larger power-of-two page is
// also a 4096-byte boundary.
static long get_page_size() {
#ifdef _WIN32
  static const long ps = [] {
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    return si.dwPageSize > 0 ? static_cast<long>(si.dwPageSize) : 4096L;
  }();
#else
  static const long ps = [] {
    const long queried = sysconf(_SC_PAGESIZE);
    return queried > 0 ? queried : 4096L;
  }();
#endif
  return ps;
}
1547
+
1548
+ static bool near_page_boundary(const char* buf, size_t len) {
1549
+ return ((reinterpret_cast<uintptr_t>(buf + len - 1) % get_page_size())
1550
+ + REQUIRED_PADDING >= static_cast<uintptr_t>(get_page_size()));
1551
+ }
1552
+
1553
+ // Zero-copy validate with free padding (Lemire's trick).
1554
+ // Almost never allocates — only if buffer is near a page boundary.
1555
+ static simdjson::padded_string_view get_free_padded_view(
1556
+ const char* data, size_t length, simdjson::padded_string& fallback) {
1557
+ if (near_page_boundary(data, length)) {
1558
+ // Rare: near page boundary, must copy
1559
+ fallback = simdjson::padded_string(data, length);
1560
+ return fallback;
1561
+ }
1562
+ // Common: free padding available, zero-copy
1563
+ return simdjson::padded_string_view(data, length, length + REQUIRED_PADDING);
1564
+ }
1565
+
970
1566
  schema_ref compile(std::string_view schema_json) {
971
1567
  auto ctx = std::make_shared<compiled_schema>();
972
1568
  ctx->raw_schema = std::string(schema_json);
@@ -980,6 +1576,11 @@ schema_ref compile(std::string_view schema_json) {
980
1576
 
981
1577
  ctx->root = compile_node(doc, *ctx);
982
1578
 
1579
+ // Generate codegen plan
1580
+ cg_compile(ctx->root.get(), ctx->gen_plan, ctx->gen_plan.code);
1581
+ ctx->gen_plan.code.push_back({cg::op::END});
1582
+ ctx->use_ondemand = plan_supports_ondemand(ctx->gen_plan);
1583
+
983
1584
  schema_ref ref;
984
1585
  ref.impl = ctx;
985
1586
  return ref;
@@ -991,14 +1592,46 @@ validation_result validate(const schema_ref& schema, std::string_view json,
991
1592
  return {false, {{error_code::invalid_schema, "", "schema not compiled"}}};
992
1593
  }
993
1594
 
994
- auto padded = simdjson::padded_string(json);
995
- auto result = schema.impl->doc_parser.parse(padded);
1595
+ // Free padding trick: avoid padded_string copy when possible
1596
+ simdjson::padded_string fallback;
1597
+ auto psv = get_free_padded_view(json.data(), json.size(), fallback);
1598
+
1599
+ // Ultra-fast path: On Demand (no DOM materialization)
1600
+ static constexpr size_t OD_THRESHOLD = 32;
1601
+ if (schema.impl->use_ondemand && !schema.impl->gen_plan.code.empty() &&
1602
+ json.size() >= OD_THRESHOLD) {
1603
+ auto od_result = tl_od_parser().iterate(psv);
1604
+ if (!od_result.error()) {
1605
+ simdjson::ondemand::value root_val;
1606
+ if (od_result.get_value().get(root_val) == SUCCESS) {
1607
+ if (od_exec(schema.impl->gen_plan, schema.impl->gen_plan.code, root_val)) {
1608
+ return {true, {}};
1609
+ }
1610
+ }
1611
+ }
1612
+ // Need fresh view for DOM parse (On Demand consumed it)
1613
+ psv = get_free_padded_view(json.data(), json.size(), fallback);
1614
+ }
1615
+
1616
+ auto& dom_p = tl_dom_parser();
1617
+ auto result = dom_p.parse(psv);
996
1618
  if (result.error()) {
997
1619
  return {false, {{error_code::invalid_json, "", "invalid JSON document"}}};
998
1620
  }
999
1621
 
1622
+ // Fast path: codegen bytecode execution (DOM)
1623
+ if (!schema.impl->use_ondemand && !schema.impl->gen_plan.code.empty()) {
1624
+ if (cg_exec(schema.impl->gen_plan, schema.impl->gen_plan.code,
1625
+ result.value())) {
1626
+ return {true, {}};
1627
+ }
1628
+ // Codegen said invalid OR hit COMPOSITION — fall through to tree walker
1629
+ }
1630
+
1631
+ // Slow path: re-parse + tree walker with error details
1632
+ auto result2 = dom_p.parse(psv);
1000
1633
  std::vector<validation_error> errors;
1001
- validate_node(schema.impl->root, result.value(), "", *schema.impl, errors,
1634
+ validate_node(schema.impl->root, result2.value(), "", *schema.impl, errors,
1002
1635
  opts.all_errors);
1003
1636
 
1004
1637
  return {errors.empty(), std::move(errors)};
@@ -1014,4 +1647,22 @@ validation_result validate(std::string_view schema_json,
1014
1647
  return validate(s, json, opts);
1015
1648
  }
1016
1649
 
1650
+
1651
+ bool is_valid_prepadded(const schema_ref& schema, const char* data, size_t length) {
1652
+ if (!schema.impl || !schema.impl->root) return false;
1653
+
1654
+ simdjson::padded_string fallback;
1655
+ auto psv = get_free_padded_view(data, length, fallback);
1656
+ auto result = tl_dom_parser().parse(psv);
1657
+ if (result.error()) return false;
1658
+
1659
+ if (!schema.impl->gen_plan.code.empty()) {
1660
+ return cg_exec(schema.impl->gen_plan, schema.impl->gen_plan.code, result.value());
1661
+ }
1662
+
1663
+ std::vector<validation_error> errors;
1664
+ validate_node(schema.impl->root, result.value(), "", *schema.impl, errors, false);
1665
+ return errors.empty();
1666
+ }
1667
+
1017
1668
  } // namespace ata