ata-validator 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/ata.cpp CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  #include <algorithm>
4
4
  #include <cmath>
5
- #include <regex>
5
+ #include <re2/re2.h>
6
6
  #include <set>
7
7
  #include <unordered_map>
8
8
 
@@ -133,6 +133,43 @@ namespace ata {
133
133
 
134
134
  using namespace simdjson;
135
135
 
136
+ // Canonical JSON: sort object keys for semantic equality comparison
137
+ static std::string canonical_json(dom::element el) {
138
+ switch (el.type()) {
139
+ case dom::element_type::OBJECT: {
140
+ dom::object obj; el.get(obj);
141
+ std::vector<std::pair<std::string_view, dom::element>> entries;
142
+ for (auto [k, v] : obj) entries.push_back({k, v});
143
+ std::sort(entries.begin(), entries.end(),
144
+ [](const auto& a, const auto& b) { return a.first < b.first; });
145
+ std::string r = "{";
146
+ for (size_t i = 0; i < entries.size(); ++i) {
147
+ if (i) r += ',';
148
+ r += '"';
149
+ r += entries[i].first;
150
+ r += "\":";
151
+ r += canonical_json(entries[i].second);
152
+ }
153
+ r += '}';
154
+ return r;
155
+ }
156
+ case dom::element_type::ARRAY: {
157
+ dom::array arr; el.get(arr);
158
+ std::string r = "[";
159
+ bool first = true;
160
+ for (auto v : arr) {
161
+ if (!first) r += ',';
162
+ first = false;
163
+ r += canonical_json(v);
164
+ }
165
+ r += ']';
166
+ return r;
167
+ }
168
+ default:
169
+ return std::string(minify(el));
170
+ }
171
+ }
172
+
136
173
  // Forward declarations
137
174
  struct schema_node;
138
175
  using schema_node_ptr = std::shared_ptr<schema_node>;
@@ -153,7 +190,7 @@ struct schema_node {
153
190
  std::optional<uint64_t> min_length;
154
191
  std::optional<uint64_t> max_length;
155
192
  std::optional<std::string> pattern;
156
- std::shared_ptr<std::regex> compiled_pattern; // cached compiled regex
193
+ std::shared_ptr<re2::RE2> compiled_pattern; // cached compiled regex (RE2)
157
194
 
158
195
  // array
159
196
  std::optional<uint64_t> min_items;
@@ -161,6 +198,9 @@ struct schema_node {
161
198
  bool unique_items = false;
162
199
  schema_node_ptr items_schema;
163
200
  std::vector<schema_node_ptr> prefix_items;
201
+ schema_node_ptr contains_schema;
202
+ std::optional<uint64_t> min_contains;
203
+ std::optional<uint64_t> max_contains;
164
204
 
165
205
  // object
166
206
  std::unordered_map<std::string, schema_node_ptr> properties;
@@ -169,9 +209,17 @@ struct schema_node {
169
209
  schema_node_ptr additional_properties_schema;
170
210
  std::optional<uint64_t> min_properties;
171
211
  std::optional<uint64_t> max_properties;
172
-
173
- // patternProperties
174
- std::vector<std::pair<std::string, schema_node_ptr>> pattern_properties;
212
+ schema_node_ptr property_names_schema;
213
+ std::unordered_map<std::string, std::vector<std::string>> dependent_required;
214
+ std::unordered_map<std::string, schema_node_ptr> dependent_schemas;
215
+
216
+ // patternProperties — each entry: (pattern_string, schema, compiled_regex)
217
+ struct pattern_prop {
218
+ std::string pattern;
219
+ schema_node_ptr schema;
220
+ std::shared_ptr<re2::RE2> compiled;
221
+ };
222
+ std::vector<pattern_prop> pattern_properties;
175
223
 
176
224
  // enum / const
177
225
  std::optional<std::string> enum_values_raw; // raw JSON array string
@@ -195,16 +243,47 @@ struct schema_node {
195
243
  // $ref
196
244
  std::string ref;
197
245
 
246
+ // $defs — stored on node for pointer navigation
247
+ std::unordered_map<std::string, schema_node_ptr> defs;
248
+
198
249
  // boolean schema
199
250
  std::optional<bool> boolean_schema;
200
251
  };
201
252
 
253
+ // --- Codegen: flat bytecode plan ---
254
+ namespace cg {
255
+ enum class op : uint8_t {
256
+ END=0, EXPECT_OBJECT, EXPECT_ARRAY, EXPECT_STRING, EXPECT_NUMBER,
257
+ EXPECT_INTEGER, EXPECT_BOOLEAN, EXPECT_NULL, EXPECT_TYPE_MULTI,
258
+ CHECK_MINIMUM, CHECK_MAXIMUM, CHECK_EX_MINIMUM, CHECK_EX_MAXIMUM,
259
+ CHECK_MULTIPLE_OF, CHECK_MIN_LENGTH, CHECK_MAX_LENGTH, CHECK_PATTERN,
260
+ CHECK_FORMAT, CHECK_MIN_ITEMS, CHECK_MAX_ITEMS, CHECK_UNIQUE_ITEMS,
261
+ ARRAY_ITEMS, CHECK_REQUIRED, CHECK_MIN_PROPS, CHECK_MAX_PROPS,
262
+ OBJ_PROPS_START, OBJ_PROP, OBJ_PROPS_END, CHECK_NO_ADDITIONAL,
263
+ CHECK_ENUM_STR, CHECK_ENUM, CHECK_CONST, COMPOSITION,
264
+ };
265
+ struct ins { op o; uint32_t a=0, b=0; };
266
+ struct plan {
267
+ std::vector<ins> code;
268
+ std::vector<double> doubles;
269
+ std::vector<std::string> strings;
270
+ std::vector<std::shared_ptr<re2::RE2>> regexes;
271
+ std::vector<std::vector<std::string>> enum_sets;
272
+ std::vector<std::vector<std::string>> type_sets;
273
+ std::vector<uint8_t> format_ids;
274
+ std::vector<std::vector<ins>> subs;
275
+ };
276
+ } // namespace cg
277
+
202
278
  struct compiled_schema {
203
279
  schema_node_ptr root;
204
280
  std::unordered_map<std::string, schema_node_ptr> defs;
205
281
  std::string raw_schema;
206
282
  dom::parser parser;
207
- dom::parser doc_parser; // reusable parser for document validation
283
+ dom::parser doc_parser;
284
+ simdjson::ondemand::parser od_parser; // On Demand parser for fast path
285
+ cg::plan gen_plan; // codegen validation plan
286
+ bool use_ondemand = false; // true if codegen plan supports On Demand
208
287
  };
209
288
 
210
289
  // --- Schema compilation ---
@@ -218,7 +297,9 @@ static schema_node_ptr compile_node(dom::element el,
218
297
 
219
298
  // Boolean schema
220
299
  if (el.is<bool>()) {
221
- node->boolean_schema = bool(el);
300
+ bool bval;
301
+ el.get(bval);
302
+ node->boolean_schema = bval;
222
303
  return node;
223
304
  }
224
305
 
@@ -226,7 +307,8 @@ static schema_node_ptr compile_node(dom::element el,
226
307
  return node;
227
308
  }
228
309
 
229
- auto obj = dom::object(el);
310
+ dom::object obj;
311
+ el.get(obj);
230
312
 
231
313
  // $ref
232
314
  dom::element ref_el;
@@ -245,7 +327,7 @@ static schema_node_ptr compile_node(dom::element el,
245
327
  type_el.get(sv);
246
328
  node->types.emplace_back(sv);
247
329
  } else if (type_el.is<dom::array>()) {
248
- for (auto t : dom::array(type_el)) {
330
+ dom::array type_arr; type_el.get(type_arr); for (auto t : type_arr) {
249
331
  std::string_view sv;
250
332
  if (t.get(sv) == SUCCESS) {
251
333
  node->types.emplace_back(sv);
@@ -291,11 +373,9 @@ static schema_node_ptr compile_node(dom::element el,
291
373
  std::string_view sv;
292
374
  if (str_el.get(sv) == SUCCESS) {
293
375
  node->pattern = std::string(sv);
294
- try {
295
- node->compiled_pattern =
296
- std::make_shared<std::regex>(node->pattern.value());
297
- } catch (...) {
298
- // Invalid regex — leave compiled_pattern null
376
+ auto re = std::make_shared<re2::RE2>(node->pattern.value());
377
+ if (re->ok()) {
378
+ node->compiled_pattern = std::move(re);
299
379
  }
300
380
  }
301
381
  }
@@ -317,7 +397,7 @@ static schema_node_ptr compile_node(dom::element el,
317
397
  // prefixItems (Draft 2020-12)
318
398
  dom::element pi_el;
319
399
  if (obj["prefixItems"].get(pi_el) == SUCCESS && pi_el.is<dom::array>()) {
320
- for (auto item : dom::array(pi_el)) {
400
+ dom::array pi_arr; pi_el.get(pi_arr); for (auto item : pi_arr) {
321
401
  node->prefix_items.push_back(compile_node(item, ctx));
322
402
  }
323
403
  }
@@ -327,17 +407,32 @@ static schema_node_ptr compile_node(dom::element el,
327
407
  node->items_schema = compile_node(items_el, ctx);
328
408
  }
329
409
 
410
+ // contains
411
+ dom::element contains_el;
412
+ if (obj["contains"].get(contains_el) == SUCCESS) {
413
+ node->contains_schema = compile_node(contains_el, ctx);
414
+ }
415
+ dom::element mc_el;
416
+ if (obj["minContains"].get(mc_el) == SUCCESS) {
417
+ uint64_t v;
418
+ if (mc_el.get(v) == SUCCESS) node->min_contains = v;
419
+ }
420
+ if (obj["maxContains"].get(mc_el) == SUCCESS) {
421
+ uint64_t v;
422
+ if (mc_el.get(v) == SUCCESS) node->max_contains = v;
423
+ }
424
+
330
425
  // object constraints
331
426
  dom::element props_el;
332
427
  if (obj["properties"].get(props_el) == SUCCESS && props_el.is<dom::object>()) {
333
- for (auto [key, val] : dom::object(props_el)) {
428
+ dom::object props_obj; props_el.get(props_obj); for (auto [key, val] : props_obj) {
334
429
  node->properties[std::string(key)] = compile_node(val, ctx);
335
430
  }
336
431
  }
337
432
 
338
433
  dom::element req_el;
339
434
  if (obj["required"].get(req_el) == SUCCESS && req_el.is<dom::array>()) {
340
- for (auto r : dom::array(req_el)) {
435
+ dom::array req_arr; req_el.get(req_arr); for (auto r : req_arr) {
341
436
  std::string_view sv;
342
437
  if (r.get(sv) == SUCCESS) {
343
438
  node->required.emplace_back(sv);
@@ -348,7 +443,7 @@ static schema_node_ptr compile_node(dom::element el,
348
443
  dom::element ap_el;
349
444
  if (obj["additionalProperties"].get(ap_el) == SUCCESS) {
350
445
  if (ap_el.is<bool>()) {
351
- node->additional_properties_bool = bool(ap_el);
446
+ bool ap_bool; ap_el.get(ap_bool); node->additional_properties_bool = ap_bool;
352
447
  } else {
353
448
  node->additional_properties_schema = compile_node(ap_el, ctx);
354
449
  }
@@ -363,13 +458,51 @@ static schema_node_ptr compile_node(dom::element el,
363
458
  if (str_el.get(v) == SUCCESS) node->max_properties = v;
364
459
  }
365
460
 
366
- // patternProperties
461
+ // propertyNames
462
+ dom::element pn_el;
463
+ if (obj["propertyNames"].get(pn_el) == SUCCESS) {
464
+ node->property_names_schema = compile_node(pn_el, ctx);
465
+ }
466
+
467
+ // dependentRequired
468
+ dom::element dr_el;
469
+ if (obj["dependentRequired"].get(dr_el) == SUCCESS &&
470
+ dr_el.is<dom::object>()) {
471
+ dom::object dr_obj; dr_el.get(dr_obj); for (auto [key, val] : dr_obj) {
472
+ std::vector<std::string> deps;
473
+ if (val.is<dom::array>()) {
474
+ dom::array val_arr; val.get(val_arr); for (auto d : val_arr) {
475
+ std::string_view sv;
476
+ if (d.get(sv) == SUCCESS) deps.emplace_back(sv);
477
+ }
478
+ }
479
+ node->dependent_required[std::string(key)] = std::move(deps);
480
+ }
481
+ }
482
+
483
+ // dependentSchemas
484
+ dom::element ds_el;
485
+ if (obj["dependentSchemas"].get(ds_el) == SUCCESS &&
486
+ ds_el.is<dom::object>()) {
487
+ dom::object ds_obj; ds_el.get(ds_obj); for (auto [key, val] : ds_obj) {
488
+ node->dependent_schemas[std::string(key)] = compile_node(val, ctx);
489
+ }
490
+ }
491
+
492
+ // patternProperties — compile regex at schema compile time
367
493
  dom::element pp_el;
368
494
  if (obj["patternProperties"].get(pp_el) == SUCCESS &&
369
495
  pp_el.is<dom::object>()) {
370
- for (auto [key, val] : dom::object(pp_el)) {
371
- node->pattern_properties.emplace_back(std::string(key),
372
- compile_node(val, ctx));
496
+ dom::object pp_obj; pp_el.get(pp_obj);
497
+ for (auto [key, val] : pp_obj) {
498
+ schema_node::pattern_prop pp;
499
+ pp.pattern = std::string(key);
500
+ pp.schema = compile_node(val, ctx);
501
+ auto re = std::make_shared<re2::RE2>(pp.pattern);
502
+ if (re->ok()) {
503
+ pp.compiled = std::move(re);
504
+ }
505
+ node->pattern_properties.push_back(std::move(pp));
373
506
  }
374
507
  }
375
508
 
@@ -392,10 +525,10 @@ static schema_node_ptr compile_node(dom::element el,
392
525
  // enum — pre-minify each value at compile time
393
526
  dom::element enum_el;
394
527
  if (obj["enum"].get(enum_el) == SUCCESS) {
395
- node->enum_values_raw = std::string(minify(enum_el));
528
+ node->enum_values_raw = canonical_json(enum_el);
396
529
  if (enum_el.is<dom::array>()) {
397
- for (auto e : dom::array(enum_el)) {
398
- node->enum_values_minified.push_back(std::string(minify(e)));
530
+ dom::array enum_arr; enum_el.get(enum_arr); for (auto e : enum_arr) {
531
+ node->enum_values_minified.push_back(canonical_json(e));
399
532
  }
400
533
  }
401
534
  }
@@ -403,23 +536,26 @@ static schema_node_ptr compile_node(dom::element el,
403
536
  // const
404
537
  dom::element const_el;
405
538
  if (obj["const"].get(const_el) == SUCCESS) {
406
- node->const_value_raw = std::string(minify(const_el));
539
+ node->const_value_raw = canonical_json(const_el);
407
540
  }
408
541
 
409
542
  // composition
410
543
  dom::element comp_el;
411
544
  if (obj["allOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
412
- for (auto s : dom::array(comp_el)) {
545
+ dom::array comp_arr; comp_el.get(comp_arr);
546
+ for (auto s : comp_arr) {
413
547
  node->all_of.push_back(compile_node(s, ctx));
414
548
  }
415
549
  }
416
550
  if (obj["anyOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
417
- for (auto s : dom::array(comp_el)) {
551
+ dom::array comp_arr2; comp_el.get(comp_arr2);
552
+ for (auto s : comp_arr2) {
418
553
  node->any_of.push_back(compile_node(s, ctx));
419
554
  }
420
555
  }
421
556
  if (obj["oneOf"].get(comp_el) == SUCCESS && comp_el.is<dom::array>()) {
422
- for (auto s : dom::array(comp_el)) {
557
+ dom::array comp_arr3; comp_el.get(comp_arr3);
558
+ for (auto s : comp_arr3) {
423
559
  node->one_of.push_back(compile_node(s, ctx));
424
560
  }
425
561
  }
@@ -445,16 +581,20 @@ static schema_node_ptr compile_node(dom::element el,
445
581
  // $defs / definitions
446
582
  dom::element defs_el;
447
583
  if (obj["$defs"].get(defs_el) == SUCCESS && defs_el.is<dom::object>()) {
448
- for (auto [key, val] : dom::object(defs_el)) {
584
+ dom::object defs_obj; defs_el.get(defs_obj); for (auto [key, val] : defs_obj) {
449
585
  std::string def_path = "#/$defs/" + std::string(key);
450
- ctx.defs[def_path] = compile_node(val, ctx);
586
+ auto compiled = compile_node(val, ctx);
587
+ ctx.defs[def_path] = compiled;
588
+ node->defs[std::string(key)] = compiled;
451
589
  }
452
590
  }
453
591
  if (obj["definitions"].get(defs_el) == SUCCESS &&
454
592
  defs_el.is<dom::object>()) {
455
- for (auto [key, val] : dom::object(defs_el)) {
593
+ dom::object defs_obj; defs_el.get(defs_obj); for (auto [key, val] : defs_obj) {
456
594
  std::string def_path = "#/definitions/" + std::string(key);
457
- ctx.defs[def_path] = compile_node(val, ctx);
595
+ auto compiled = compile_node(val, ctx);
596
+ ctx.defs[def_path] = compiled;
597
+ node->defs[std::string(key)] = compiled;
458
598
  }
459
599
  }
460
600
 
@@ -538,79 +678,106 @@ static void validate_node(const schema_node_ptr& node,
538
678
  return;
539
679
  }
540
680
 
541
- // $ref
681
+ // $ref — Draft 2020-12: $ref is not a short-circuit, sibling keywords still apply
682
+ bool ref_resolved = false;
542
683
  if (!node->ref.empty()) {
543
684
  // First check defs map
544
685
  auto it = ctx.defs.find(node->ref);
545
686
  if (it != ctx.defs.end()) {
546
687
  validate_node(it->second, value, path, ctx, errors, all_errors);
547
- return;
688
+ ref_resolved = true;
548
689
  }
549
690
  // Try JSON Pointer resolution from root (e.g., "#/properties/foo")
550
691
  if (node->ref.size() > 1 && node->ref[0] == '#' &&
551
692
  node->ref[1] == '/') {
552
- // Walk the schema tree following the pointer
553
- std::string pointer = node->ref.substr(2);
554
- schema_node_ptr current = ctx.root;
555
- bool resolved = true;
556
- size_t pos = 0;
557
- while (pos < pointer.size() && current) {
558
- size_t next = pointer.find('/', pos);
559
- std::string segment =
560
- pointer.substr(pos, next == std::string::npos ? next : next - pos);
561
- // Unescape JSON Pointer: ~1 -> /, ~0 -> ~
562
- std::string key;
563
- for (size_t i = 0; i < segment.size(); ++i) {
564
- if (segment[i] == '~' && i + 1 < segment.size()) {
565
- if (segment[i + 1] == '1') { key += '/'; ++i; }
566
- else if (segment[i + 1] == '0') { key += '~'; ++i; }
567
- else key += segment[i];
693
+ // Decode JSON Pointer segments
694
+ auto decode_pointer_segment = [](const std::string& seg) -> std::string {
695
+ // Percent-decode first
696
+ std::string pct;
697
+ for (size_t i = 0; i < seg.size(); ++i) {
698
+ if (seg[i] == '%' && i + 2 < seg.size()) {
699
+ char h = seg[i+1], l = seg[i+2];
700
+ auto hex = [](char c) -> int {
701
+ if (c >= '0' && c <= '9') return c - '0';
702
+ if (c >= 'a' && c <= 'f') return 10 + c - 'a';
703
+ if (c >= 'A' && c <= 'F') return 10 + c - 'A';
704
+ return -1;
705
+ };
706
+ int hv = hex(h), lv = hex(l);
707
+ if (hv >= 0 && lv >= 0) {
708
+ pct += static_cast<char>(hv * 16 + lv);
709
+ i += 2;
710
+ } else {
711
+ pct += seg[i];
712
+ }
568
713
  } else {
569
- key += segment[i];
714
+ pct += seg[i];
570
715
  }
571
716
  }
572
- // Navigate the compiled schema tree
573
- if (key == "properties" && !current->properties.empty()) {
574
- // Next segment is the property name
575
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
576
- next = pointer.find('/', pos);
577
- std::string prop_name = pointer.substr(
578
- pos, next == std::string::npos ? next : next - pos);
717
+ // Then JSON Pointer unescape: ~1 -> /, ~0 -> ~
718
+ std::string out;
719
+ for (size_t i = 0; i < pct.size(); ++i) {
720
+ if (pct[i] == '~' && i + 1 < pct.size()) {
721
+ if (pct[i + 1] == '1') { out += '/'; ++i; }
722
+ else if (pct[i + 1] == '0') { out += '~'; ++i; }
723
+ else out += pct[i];
724
+ } else {
725
+ out += pct[i];
726
+ }
727
+ }
728
+ return out;
729
+ };
730
+
731
+ // Split pointer into segments
732
+ std::string pointer = node->ref.substr(2);
733
+ std::vector<std::string> segments;
734
+ size_t spos = 0;
735
+ while (spos < pointer.size()) {
736
+ size_t snext = pointer.find('/', spos);
737
+ segments.push_back(decode_pointer_segment(
738
+ pointer.substr(spos, snext == std::string::npos ? snext : snext - spos)));
739
+ spos = (snext == std::string::npos) ? pointer.size() : snext + 1;
740
+ }
741
+
742
+ // Walk the schema tree
743
+ schema_node_ptr current = ctx.root;
744
+ bool resolved = true;
745
+ for (size_t si = 0; si < segments.size() && current; ++si) {
746
+ const auto& key = segments[si];
747
+
748
+ if (key == "properties" && si + 1 < segments.size()) {
749
+ const auto& prop_name = segments[++si];
579
750
  auto pit = current->properties.find(prop_name);
580
751
  if (pit != current->properties.end()) {
581
752
  current = pit->second;
582
- } else {
583
- resolved = false; break;
584
- }
753
+ } else { resolved = false; break; }
585
754
  } else if (key == "items" && current->items_schema) {
586
755
  current = current->items_schema;
587
756
  } else if (key == "$defs" || key == "definitions") {
588
- // Next segment is the def name — already in ctx.defs
589
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
590
- next = pointer.find('/', pos);
591
- std::string def_name = pointer.substr(
592
- pos, next == std::string::npos ? next : next - pos);
593
- std::string full_ref = "#/" + key + "/" + def_name;
594
- auto dit = ctx.defs.find(full_ref);
595
- if (dit != ctx.defs.end()) {
596
- current = dit->second;
597
- } else {
598
- resolved = false; break;
599
- }
757
+ if (si + 1 < segments.size()) {
758
+ const auto& def_name = segments[++si];
759
+ // Navigate into node's defs map
760
+ auto dit = current->defs.find(def_name);
761
+ if (dit != current->defs.end()) {
762
+ current = dit->second;
763
+ } else {
764
+ // Fallback: try ctx.defs with full path
765
+ std::string full_ref = "#/" + key + "/" + def_name;
766
+ auto cit = ctx.defs.find(full_ref);
767
+ if (cit != ctx.defs.end()) {
768
+ current = cit->second;
769
+ } else { resolved = false; break; }
770
+ }
771
+ } else { resolved = false; break; }
600
772
  } else if (key == "allOf" || key == "anyOf" || key == "oneOf") {
601
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
602
- next = pointer.find('/', pos);
603
- std::string idx_str = pointer.substr(
604
- pos, next == std::string::npos ? next : next - pos);
605
- size_t idx = std::stoul(idx_str);
606
- auto& vec = (key == "allOf") ? current->all_of
607
- : (key == "anyOf") ? current->any_of
608
- : current->one_of;
609
- if (idx < vec.size()) {
610
- current = vec[idx];
611
- } else {
612
- resolved = false; break;
613
- }
773
+ if (si + 1 < segments.size()) {
774
+ size_t idx = std::stoul(segments[++si]);
775
+ auto& vec = (key == "allOf") ? current->all_of
776
+ : (key == "anyOf") ? current->any_of
777
+ : current->one_of;
778
+ if (idx < vec.size()) { current = vec[idx]; }
779
+ else { resolved = false; break; }
780
+ } else { resolved = false; break; }
614
781
  } else if (key == "not" && current->not_schema) {
615
782
  current = current->not_schema;
616
783
  } else if (key == "if" && current->if_schema) {
@@ -623,34 +790,29 @@ static void validate_node(const schema_node_ptr& node,
623
790
  current->additional_properties_schema) {
624
791
  current = current->additional_properties_schema;
625
792
  } else if (key == "prefixItems") {
626
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
627
- next = pointer.find('/', pos);
628
- std::string idx_str = pointer.substr(
629
- pos, next == std::string::npos ? next : next - pos);
630
- size_t idx = std::stoul(idx_str);
631
- if (idx < current->prefix_items.size()) {
632
- current = current->prefix_items[idx];
633
- } else {
634
- resolved = false; break;
635
- }
793
+ if (si + 1 < segments.size()) {
794
+ size_t idx = std::stoul(segments[++si]);
795
+ if (idx < current->prefix_items.size()) { current = current->prefix_items[idx]; }
796
+ else { resolved = false; break; }
797
+ } else { resolved = false; break; }
636
798
  } else {
637
799
  resolved = false; break;
638
800
  }
639
- pos = (next == std::string::npos) ? pointer.size() : next + 1;
640
801
  }
641
802
  if (resolved && current) {
642
803
  validate_node(current, value, path, ctx, errors, all_errors);
643
- return;
804
+ ref_resolved = true;
644
805
  }
645
806
  }
646
807
  // Self-reference: "#"
647
- if (node->ref == "#" && ctx.root) {
808
+ if (!ref_resolved && node->ref == "#" && ctx.root) {
648
809
  validate_node(ctx.root, value, path, ctx, errors, all_errors);
649
- return;
810
+ ref_resolved = true;
811
+ }
812
+ if (!ref_resolved) {
813
+ errors.push_back({error_code::ref_not_found, path,
814
+ "cannot resolve $ref: " + node->ref});
650
815
  }
651
- errors.push_back({error_code::ref_not_found, path,
652
- "cannot resolve $ref: " + node->ref});
653
- return;
654
816
  }
655
817
 
656
818
  // type
@@ -676,7 +838,7 @@ static void validate_node(const schema_node_ptr& node,
676
838
 
677
839
  // enum — use pre-minified values (no re-parsing)
678
840
  if (!node->enum_values_minified.empty()) {
679
- std::string val_str = std::string(minify(value));
841
+ std::string val_str = canonical_json(value);
680
842
  bool found = false;
681
843
  for (const auto& ev : node->enum_values_minified) {
682
844
  if (ev == val_str) {
@@ -692,7 +854,7 @@ static void validate_node(const schema_node_ptr& node,
692
854
 
693
855
  // const
694
856
  if (node->const_value_raw.has_value()) {
695
- std::string val_str = std::string(minify(value));
857
+ std::string val_str = canonical_json(value);
696
858
  if (val_str != node->const_value_raw.value()) {
697
859
  errors.push_back({error_code::const_mismatch, path,
698
860
  "value does not match const"});
@@ -758,7 +920,7 @@ static void validate_node(const schema_node_ptr& node,
758
920
  std::to_string(node->max_length.value())});
759
921
  }
760
922
  if (node->compiled_pattern) {
761
- if (!std::regex_search(sv.begin(), sv.end(), *node->compiled_pattern)) {
923
+ if (!re2::RE2::PartialMatch(re2::StringPiece(sv.data(), sv.size()), *node->compiled_pattern)) {
762
924
  errors.push_back({error_code::pattern_mismatch, path,
763
925
  "string does not match pattern: " +
764
926
  node->pattern.value()});
@@ -776,7 +938,7 @@ static void validate_node(const schema_node_ptr& node,
776
938
 
777
939
  // Array validations
778
940
  if (actual_type == "array" && value.is<dom::array>()) {
779
- auto arr = dom::array(value);
941
+ dom::array arr; value.get(arr);
780
942
  uint64_t arr_size = 0;
781
943
  for ([[maybe_unused]] auto _ : arr) ++arr_size;
782
944
 
@@ -797,7 +959,7 @@ static void validate_node(const schema_node_ptr& node,
797
959
  std::set<std::string> seen;
798
960
  bool has_dup = false;
799
961
  for (auto item : arr) {
800
- auto s = std::string(minify(item));
962
+ auto s = canonical_json(item);
801
963
  if (!seen.insert(s).second) {
802
964
  has_dup = true;
803
965
  break;
@@ -815,19 +977,41 @@ static void validate_node(const schema_node_ptr& node,
815
977
  for (auto item : arr) {
816
978
  if (idx < node->prefix_items.size()) {
817
979
  validate_node(node->prefix_items[idx], item,
818
- path + "/" + std::to_string(idx), ctx, errors);
980
+ path + "/" + std::to_string(idx), ctx, errors, all_errors);
819
981
  } else if (node->items_schema) {
820
982
  validate_node(node->items_schema, item,
821
- path + "/" + std::to_string(idx), ctx, errors);
983
+ path + "/" + std::to_string(idx), ctx, errors, all_errors);
822
984
  }
823
985
  ++idx;
824
986
  }
825
987
  }
988
+
989
+ // contains / minContains / maxContains
990
+ if (node->contains_schema) {
991
+ uint64_t match_count = 0;
992
+ for (auto item : arr) {
993
+ std::vector<validation_error> tmp;
994
+ validate_node(node->contains_schema, item, path, ctx, tmp, false);
995
+ if (tmp.empty()) ++match_count;
996
+ }
997
+ uint64_t min_c = node->min_contains.value_or(1);
998
+ uint64_t max_c = node->max_contains.value_or(arr_size);
999
+ if (match_count < min_c) {
1000
+ errors.push_back({error_code::min_items_violation, path,
1001
+ "contains: " + std::to_string(match_count) +
1002
+ " matches, minimum " + std::to_string(min_c)});
1003
+ }
1004
+ if (match_count > max_c) {
1005
+ errors.push_back({error_code::max_items_violation, path,
1006
+ "contains: " + std::to_string(match_count) +
1007
+ " matches, maximum " + std::to_string(max_c)});
1008
+ }
1009
+ }
826
1010
  }
827
1011
 
828
1012
  // Object validations
829
1013
  if (actual_type == "object" && value.is<dom::object>()) {
830
- auto obj = dom::object(value);
1014
+ dom::object obj; value.get(obj);
831
1015
  uint64_t prop_count = 0;
832
1016
  for ([[maybe_unused]] auto _ : obj) ++prop_count;
833
1017
 
@@ -867,15 +1051,11 @@ static void validate_node(const schema_node_ptr& node,
867
1051
  matched = true;
868
1052
  }
869
1053
 
870
- // Check patternProperties
871
- for (const auto& [pat, pat_schema] : node->pattern_properties) {
872
- try {
873
- std::regex re(pat);
874
- if (std::regex_search(key_str, re)) {
875
- validate_node(pat_schema, val, path + "/" + key_str, ctx, errors, all_errors);
876
- matched = true;
877
- }
878
- } catch (...) {
1054
+ // Check patternProperties (use cached compiled regex)
1055
+ for (const auto& pp : node->pattern_properties) {
1056
+ if (pp.compiled && re2::RE2::PartialMatch(key_str, *pp.compiled)) {
1057
+ validate_node(pp.schema, val, path + "/" + key_str, ctx, errors, all_errors);
1058
+ matched = true;
879
1059
  }
880
1060
  }
881
1061
 
@@ -892,6 +1072,43 @@ static void validate_node(const schema_node_ptr& node,
892
1072
  }
893
1073
  }
894
1074
  }
1075
+
1076
+ // propertyNames
1077
+ if (node->property_names_schema) {
1078
+ for (auto [key, val] : obj) {
1079
+ // Create a string element to validate the key
1080
+ std::string key_json = "\"" + std::string(key) + "\"";
1081
+ dom::parser key_parser;
1082
+ auto key_result = key_parser.parse(key_json);
1083
+ if (!key_result.error()) {
1084
+ validate_node(node->property_names_schema, key_result.value(),
1085
+ path, ctx, errors, all_errors);
1086
+ }
1087
+ }
1088
+ }
1089
+
1090
+ // dependentRequired
1091
+ for (const auto& [prop, deps] : node->dependent_required) {
1092
+ dom::element dummy;
1093
+ if (obj[prop].get(dummy) == SUCCESS) {
1094
+ for (const auto& dep : deps) {
1095
+ dom::element dep_dummy;
1096
+ if (obj[dep].get(dep_dummy) != SUCCESS) {
1097
+ errors.push_back({error_code::required_property_missing, path,
1098
+ "property '" + prop + "' requires '" + dep +
1099
+ "' to be present"});
1100
+ }
1101
+ }
1102
+ }
1103
+ }
1104
+
1105
+ // dependentSchemas
1106
+ for (const auto& [prop, schema] : node->dependent_schemas) {
1107
+ dom::element dummy;
1108
+ if (obj[prop].get(dummy) == SUCCESS) {
1109
+ validate_node(schema, value, path, ctx, errors, all_errors);
1110
+ }
1111
+ }
895
1112
  }
896
1113
 
897
1114
  // allOf
@@ -967,6 +1184,339 @@ static void validate_node(const schema_node_ptr& node,
967
1184
  }
968
1185
  }
969
1186
 
1187
+ // --- Codegen compiler ---
1188
+ static void cg_compile(const schema_node* n, cg::plan& p,
1189
+ std::vector<cg::ins>& out) {
1190
+ if (!n) return;
1191
+ if (n->boolean_schema.has_value()) {
1192
+ if (!*n->boolean_schema) out.push_back({cg::op::EXPECT_NULL});
1193
+ return;
1194
+ }
1195
+ // Composition fallback
1196
+ if (!n->ref.empty() || !n->all_of.empty() || !n->any_of.empty() ||
1197
+ !n->one_of.empty() || n->not_schema || n->if_schema) {
1198
+ uintptr_t ptr = reinterpret_cast<uintptr_t>(n);
1199
+ out.push_back({cg::op::COMPOSITION, (uint32_t)(ptr & 0xFFFFFFFF),
1200
+ (uint32_t)((ptr >> 32) & 0xFFFFFFFF)});
1201
+ return;
1202
+ }
1203
+ // Type
1204
+ if (!n->types.empty()) {
1205
+ if (n->types.size() == 1) {
1206
+ auto& t = n->types[0];
1207
+ if (t=="object") out.push_back({cg::op::EXPECT_OBJECT});
1208
+ else if (t=="array") out.push_back({cg::op::EXPECT_ARRAY});
1209
+ else if (t=="string") out.push_back({cg::op::EXPECT_STRING});
1210
+ else if (t=="number") out.push_back({cg::op::EXPECT_NUMBER});
1211
+ else if (t=="integer") out.push_back({cg::op::EXPECT_INTEGER});
1212
+ else if (t=="boolean") out.push_back({cg::op::EXPECT_BOOLEAN});
1213
+ else if (t=="null") out.push_back({cg::op::EXPECT_NULL});
1214
+ } else {
1215
+ uint32_t i = (uint32_t)p.type_sets.size();
1216
+ p.type_sets.push_back(n->types);
1217
+ out.push_back({cg::op::EXPECT_TYPE_MULTI, i});
1218
+ }
1219
+ }
1220
+ // Enum
1221
+ if (!n->enum_values_minified.empty()) {
1222
+ bool all_str = true;
1223
+ for (auto& e : n->enum_values_minified)
1224
+ if (e.empty() || e[0]!='"') { all_str=false; break; }
1225
+ uint32_t i = (uint32_t)p.enum_sets.size();
1226
+ p.enum_sets.push_back(n->enum_values_minified);
1227
+ out.push_back({all_str ? cg::op::CHECK_ENUM_STR : cg::op::CHECK_ENUM, i});
1228
+ }
1229
+ if (n->const_value_raw.has_value()) {
1230
+ uint32_t i=(uint32_t)p.strings.size();
1231
+ p.strings.push_back(*n->const_value_raw);
1232
+ out.push_back({cg::op::CHECK_CONST, i});
1233
+ }
1234
+ // Numeric
1235
+ if (n->minimum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->minimum); out.push_back({cg::op::CHECK_MINIMUM,i}); }
1236
+ if (n->maximum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->maximum); out.push_back({cg::op::CHECK_MAXIMUM,i}); }
1237
+ if (n->exclusive_minimum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->exclusive_minimum); out.push_back({cg::op::CHECK_EX_MINIMUM,i}); }
1238
+ if (n->exclusive_maximum.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->exclusive_maximum); out.push_back({cg::op::CHECK_EX_MAXIMUM,i}); }
1239
+ if (n->multiple_of.has_value()) { uint32_t i=(uint32_t)p.doubles.size(); p.doubles.push_back(*n->multiple_of); out.push_back({cg::op::CHECK_MULTIPLE_OF,i}); }
1240
+ // String
1241
+ if (n->min_length.has_value()) out.push_back({cg::op::CHECK_MIN_LENGTH,(uint32_t)*n->min_length});
1242
+ if (n->max_length.has_value()) out.push_back({cg::op::CHECK_MAX_LENGTH,(uint32_t)*n->max_length});
1243
+ if (n->compiled_pattern) { uint32_t i=(uint32_t)p.regexes.size(); p.regexes.push_back(n->compiled_pattern); out.push_back({cg::op::CHECK_PATTERN,i}); }
1244
+ if (n->format.has_value()) {
1245
+ uint32_t i=(uint32_t)p.format_ids.size();
1246
+ uint8_t fid=255;
1247
+ auto& f=*n->format;
1248
+ if(f=="email")fid=0;else if(f=="date")fid=1;else if(f=="date-time")fid=2;
1249
+ else if(f=="time")fid=3;else if(f=="ipv4")fid=4;else if(f=="ipv6")fid=5;
1250
+ else if(f=="uri"||f=="uri-reference")fid=6;else if(f=="uuid")fid=7;
1251
+ else if(f=="hostname")fid=8;
1252
+ p.format_ids.push_back(fid);
1253
+ out.push_back({cg::op::CHECK_FORMAT,i});
1254
+ }
1255
+ // Array
1256
+ if (n->min_items.has_value()) out.push_back({cg::op::CHECK_MIN_ITEMS,(uint32_t)*n->min_items});
1257
+ if (n->max_items.has_value()) out.push_back({cg::op::CHECK_MAX_ITEMS,(uint32_t)*n->max_items});
1258
+ if (n->unique_items) out.push_back({cg::op::CHECK_UNIQUE_ITEMS});
1259
+ if (n->items_schema) {
1260
+ uint32_t si=(uint32_t)p.subs.size();
1261
+ p.subs.emplace_back();
1262
+ std::vector<cg::ins> sub_code;
1263
+ cg_compile(n->items_schema.get(), p, sub_code);
1264
+ sub_code.push_back({cg::op::END});
1265
+ p.subs[si] = std::move(sub_code);
1266
+ out.push_back({cg::op::ARRAY_ITEMS, si});
1267
+ }
1268
+ // Object
1269
+ for (auto& r : n->required) { uint32_t i=(uint32_t)p.strings.size(); p.strings.push_back(r); out.push_back({cg::op::CHECK_REQUIRED,i}); }
1270
+ if (n->min_properties.has_value()) out.push_back({cg::op::CHECK_MIN_PROPS,(uint32_t)*n->min_properties});
1271
+ if (n->max_properties.has_value()) out.push_back({cg::op::CHECK_MAX_PROPS,(uint32_t)*n->max_properties});
1272
+ // additional_properties_schema requires tree walker — bail out to COMPOSITION
1273
+ if (n->additional_properties_schema) {
1274
+ out.push_back({cg::op::COMPOSITION, 0, 0});
1275
+ return;
1276
+ }
1277
+ if (!n->properties.empty() || (n->additional_properties_bool.has_value() && !*n->additional_properties_bool)) {
1278
+ out.push_back({cg::op::OBJ_PROPS_START});
1279
+ if (n->additional_properties_bool.has_value() && !*n->additional_properties_bool)
1280
+ out.push_back({cg::op::CHECK_NO_ADDITIONAL});
1281
+ for (auto& [name, schema] : n->properties) {
1282
+ uint32_t ni=(uint32_t)p.strings.size(); p.strings.push_back(name);
1283
+ uint32_t si=(uint32_t)p.subs.size();
1284
+ p.subs.emplace_back();
1285
+ std::vector<cg::ins> sub_code;
1286
+ cg_compile(schema.get(), p, sub_code);
1287
+ sub_code.push_back({cg::op::END});
1288
+ p.subs[si] = std::move(sub_code);
1289
+ out.push_back({cg::op::OBJ_PROP, ni, si});
1290
+ }
1291
+ out.push_back({cg::op::OBJ_PROPS_END});
1292
+ }
1293
+ }
1294
+
1295
+ // --- Codegen executor ---
1296
+ static const char* fmt_names[]={"email","date","date-time","time","ipv4","ipv6","uri","uuid","hostname"};
1297
+
1298
+ static bool cg_exec(const cg::plan& p, const std::vector<cg::ins>& code,
1299
+ dom::element value) {
1300
+ auto t = type_of_sv(value);
1301
+ for (size_t i=0; i<code.size(); ++i) {
1302
+ auto& c = code[i];
1303
+ switch(c.o) {
1304
+ case cg::op::END: return true;
1305
+ case cg::op::EXPECT_OBJECT: if(t!="object") return false; break;
1306
+ case cg::op::EXPECT_ARRAY: if(t!="array") return false; break;
1307
+ case cg::op::EXPECT_STRING: if(t!="string") return false; break;
1308
+ case cg::op::EXPECT_NUMBER: if(t!="number"&&t!="integer") return false; break;
1309
+ case cg::op::EXPECT_INTEGER: if(t!="integer") return false; break;
1310
+ case cg::op::EXPECT_BOOLEAN: if(t!="boolean") return false; break;
1311
+ case cg::op::EXPECT_NULL: if(t!="null") return false; break;
1312
+ case cg::op::EXPECT_TYPE_MULTI: {
1313
+ auto& ts=p.type_sets[c.a]; bool m=false;
1314
+ for(auto& ty:ts){if(t==ty||(ty=="number"&&(t=="integer"||t=="number"))){m=true;break;}}
1315
+ if(!m) return false; break;
1316
+ }
1317
+ case cg::op::CHECK_MINIMUM: if(t=="integer"||t=="number"){if(to_double(value)<p.doubles[c.a])return false;} break;
1318
+ case cg::op::CHECK_MAXIMUM: if(t=="integer"||t=="number"){if(to_double(value)>p.doubles[c.a])return false;} break;
1319
+ case cg::op::CHECK_EX_MINIMUM: if(t=="integer"||t=="number"){if(to_double(value)<=p.doubles[c.a])return false;} break;
1320
+ case cg::op::CHECK_EX_MAXIMUM: if(t=="integer"||t=="number"){if(to_double(value)>=p.doubles[c.a])return false;} break;
1321
+ case cg::op::CHECK_MULTIPLE_OF: if(t=="integer"||t=="number"){double v=to_double(value),d=p.doubles[c.a],r=std::fmod(v,d);if(std::abs(r)>1e-8&&std::abs(r-d)>1e-8)return false;} break;
1322
+ case cg::op::CHECK_MIN_LENGTH: if(t=="string"){std::string_view sv;value.get(sv);if(utf8_length(sv)<c.a)return false;} break;
1323
+ case cg::op::CHECK_MAX_LENGTH: if(t=="string"){std::string_view sv;value.get(sv);if(utf8_length(sv)>c.a)return false;} break;
1324
+ case cg::op::CHECK_PATTERN: if(t=="string"){std::string_view sv;value.get(sv);if(!re2::RE2::PartialMatch(re2::StringPiece(sv.data(),sv.size()),*p.regexes[c.a]))return false;} break;
1325
+ case cg::op::CHECK_FORMAT: if(t=="string"){std::string_view sv;value.get(sv);uint8_t f=p.format_ids[c.a];if(f<9&&!check_format(sv,fmt_names[f]))return false;} break;
1326
+ case cg::op::CHECK_MIN_ITEMS: if(t=="array"){dom::array a;value.get(a);uint64_t s=0;for([[maybe_unused]]auto _:a)++s;if(s<c.a)return false;} break;
1327
+ case cg::op::CHECK_MAX_ITEMS: if(t=="array"){dom::array a;value.get(a);uint64_t s=0;for([[maybe_unused]]auto _:a)++s;if(s>c.a)return false;} break;
1328
+ case cg::op::CHECK_UNIQUE_ITEMS: if(t=="array"){dom::array a;value.get(a);std::set<std::string> seen;for(auto x:a)if(!seen.insert(canonical_json(x)).second)return false;} break;
1329
+ case cg::op::ARRAY_ITEMS: if(t=="array"){dom::array a;value.get(a);for(auto x:a)if(!cg_exec(p,p.subs[c.a],x))return false;} break;
1330
+ case cg::op::CHECK_REQUIRED: if(t=="object"){dom::object o;value.get(o);dom::element d;if(o[p.strings[c.a]].get(d)!=SUCCESS)return false;} break;
1331
+ case cg::op::CHECK_MIN_PROPS: if(t=="object"){dom::object o;value.get(o);uint64_t n=0;for([[maybe_unused]]auto _:o)++n;if(n<c.a)return false;} break;
1332
+ case cg::op::CHECK_MAX_PROPS: if(t=="object"){dom::object o;value.get(o);uint64_t n=0;for([[maybe_unused]]auto _:o)++n;if(n>c.a)return false;} break;
1333
+ case cg::op::OBJ_PROPS_START: if(t=="object"){
1334
+ dom::object o; value.get(o);
1335
+ // collect prop defs
1336
+ struct pd{std::string_view nm;uint32_t si;};
1337
+ std::vector<pd> props; bool no_add=false;
1338
+ size_t j=i+1;
1339
+ for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j){
1340
+ if(code[j].o==cg::op::OBJ_PROP) props.push_back({p.strings[code[j].a],code[j].b});
1341
+ else if(code[j].o==cg::op::CHECK_NO_ADDITIONAL) no_add=true;
1342
+ }
1343
+ for(auto [key,val]:o){
1344
+ bool matched=false;
1345
+ for(auto& pp:props){if(key==pp.nm){if(!cg_exec(p,p.subs[pp.si],val))return false;matched=true;break;}}
1346
+ if(!matched&&no_add)return false;
1347
+ }
1348
+ i=j; break;
1349
+ } else { /* skip to OBJ_PROPS_END */ size_t j=i+1; for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j); i=j; } break;
1350
+ case cg::op::OBJ_PROP: case cg::op::OBJ_PROPS_END: case cg::op::CHECK_NO_ADDITIONAL: break;
1351
+ case cg::op::CHECK_ENUM_STR: {
1352
+ auto& es=p.enum_sets[c.a]; bool f=false;
1353
+ if(t=="string"){std::string_view sv;value.get(sv);for(auto& e:es)if(e.size()==sv.size()+2&&e[0]=='"'&&e.back()=='"'&&e.compare(1,sv.size(),sv)==0){f=true;break;}}
1354
+ if(!f){std::string v=canonical_json(value);for(auto& e:es)if(e==v){f=true;break;}}
1355
+ if(!f)return false; break;
1356
+ }
1357
+ case cg::op::CHECK_ENUM: {
1358
+ auto& es=p.enum_sets[c.a]; bool f=false;
1359
+ if(t=="string"){std::string_view sv;value.get(sv);for(auto& e:es)if(e.size()==sv.size()+2&&e[0]=='"'&&e.back()=='"'&&e.compare(1,sv.size(),sv)==0){f=true;break;}}
1360
+ if(!f&&value.is<int64_t>()){int64_t v;value.get(v);auto s=std::to_string(v);for(auto& e:es)if(e==s){f=true;break;}}
1361
+ if(!f){std::string v=canonical_json(value);for(auto& e:es)if(e==v){f=true;break;}}
1362
+ if(!f)return false; break;
1363
+ }
1364
+ case cg::op::CHECK_CONST: if(canonical_json(value)!=p.strings[c.a])return false; break;
1365
+ case cg::op::COMPOSITION: return false; // fallback to tree walker
1366
+ }
1367
+ }
1368
+ return true;
1369
+ }
1370
+
1371
+ // --- On Demand fast path executor ---
1372
+ // Uses simdjson On Demand API to avoid materializing the full DOM tree.
1373
+ // Returns: true = valid, false = invalid OR unsupported (fallback to DOM).
1374
+
1375
+ static std::string_view od_type(simdjson::ondemand::value& v) {
1376
+ switch (v.type()) {
1377
+ case simdjson::ondemand::json_type::object: return "object";
1378
+ case simdjson::ondemand::json_type::array: return "array";
1379
+ case simdjson::ondemand::json_type::string: return "string";
1380
+ case simdjson::ondemand::json_type::boolean: return "boolean";
1381
+ case simdjson::ondemand::json_type::null: return "null";
1382
+ case simdjson::ondemand::json_type::number: {
1383
+ simdjson::ondemand::number_type nt;
1384
+ if (v.get_number_type().get(nt) == SUCCESS &&
1385
+ nt == simdjson::ondemand::number_type::floating_point_number)
1386
+ return "number";
1387
+ return "integer";
1388
+ }
1389
+ }
1390
+ return "unknown";
1391
+ }
1392
+
1393
+ static bool od_exec(const cg::plan& p, const std::vector<cg::ins>& code,
1394
+ simdjson::ondemand::value value) {
1395
+ auto t = od_type(value);
1396
+ for (size_t i = 0; i < code.size(); ++i) {
1397
+ auto& c = code[i];
1398
+ switch (c.o) {
1399
+ case cg::op::END: return true;
1400
+ case cg::op::EXPECT_OBJECT: if(t!="object") return false; break;
1401
+ case cg::op::EXPECT_ARRAY: if(t!="array") return false; break;
1402
+ case cg::op::EXPECT_STRING: if(t!="string") return false; break;
1403
+ case cg::op::EXPECT_NUMBER: if(t!="number"&&t!="integer") return false; break;
1404
+ case cg::op::EXPECT_INTEGER: if(t!="integer") return false; break;
1405
+ case cg::op::EXPECT_BOOLEAN: if(t!="boolean") return false; break;
1406
+ case cg::op::EXPECT_NULL: if(t!="null") return false; break;
1407
+ case cg::op::EXPECT_TYPE_MULTI: {
1408
+ auto& ts=p.type_sets[c.a]; bool m=false;
1409
+ for(auto& ty:ts){if(t==ty||(ty=="number"&&(t=="integer"||t=="number"))){m=true;break;}}
1410
+ if(!m) return false; break;
1411
+ }
1412
+ case cg::op::CHECK_MINIMUM:
1413
+ case cg::op::CHECK_MAXIMUM:
1414
+ case cg::op::CHECK_EX_MINIMUM:
1415
+ case cg::op::CHECK_EX_MAXIMUM:
1416
+ case cg::op::CHECK_MULTIPLE_OF: {
1417
+ if (t=="integer"||t=="number") {
1418
+ double v;
1419
+ if (t=="integer") { int64_t iv; if(value.get(iv)!=SUCCESS) return false; v=(double)iv; }
1420
+ else { if(value.get(v)!=SUCCESS) return false; }
1421
+ double d=p.doubles[c.a];
1422
+ if(c.o==cg::op::CHECK_MINIMUM && v<d) return false;
1423
+ if(c.o==cg::op::CHECK_MAXIMUM && v>d) return false;
1424
+ if(c.o==cg::op::CHECK_EX_MINIMUM && v<=d) return false;
1425
+ if(c.o==cg::op::CHECK_EX_MAXIMUM && v>=d) return false;
1426
+ if(c.o==cg::op::CHECK_MULTIPLE_OF){double r=std::fmod(v,d);if(std::abs(r)>1e-8&&std::abs(r-d)>1e-8)return false;}
1427
+ }
1428
+ break;
1429
+ }
1430
+ case cg::op::CHECK_MIN_LENGTH: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(utf8_length(sv)<c.a) return false;} break;
1431
+ case cg::op::CHECK_MAX_LENGTH: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(utf8_length(sv)>c.a) return false;} break;
1432
+ case cg::op::CHECK_PATTERN: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; if(!re2::RE2::PartialMatch(re2::StringPiece(sv.data(),sv.size()),*p.regexes[c.a]))return false;} break;
1433
+ case cg::op::CHECK_FORMAT: if(t=="string"){std::string_view sv; if(value.get(sv)!=SUCCESS) return false; uint8_t f=p.format_ids[c.a]; if(f<9&&!check_format(sv,fmt_names[f]))return false;} break;
1434
+ case cg::op::CHECK_MIN_ITEMS: if(t=="array"){
1435
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1436
+ uint64_t s=0; for(auto x:a){(void)x;++s;} if(s<c.a) return false;
1437
+ } break;
1438
+ case cg::op::CHECK_MAX_ITEMS: if(t=="array"){
1439
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1440
+ uint64_t s=0; for(auto x:a){(void)x;++s;} if(s>c.a) return false;
1441
+ } break;
1442
+ case cg::op::ARRAY_ITEMS: if(t=="array"){
1443
+ simdjson::ondemand::array a; if(value.get(a)!=SUCCESS) return false;
1444
+ for(auto elem:a){
1445
+ simdjson::ondemand::value v; if(elem.get(v)!=SUCCESS) return false;
1446
+ if(!od_exec(p,p.subs[c.a],v)) return false;
1447
+ }
1448
+ } break;
1449
+ case cg::op::CHECK_REQUIRED: if(t=="object"){
1450
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1451
+ auto f = o.find_field_unordered(p.strings[c.a]);
1452
+ if(f.error()) return false;
1453
+ } break;
1454
+ case cg::op::CHECK_MIN_PROPS: if(t=="object"){
1455
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1456
+ uint64_t n=0; for(auto f:o){(void)f;++n;} if(n<c.a) return false;
1457
+ } break;
1458
+ case cg::op::CHECK_MAX_PROPS: if(t=="object"){
1459
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1460
+ uint64_t n=0; for(auto f:o){(void)f;++n;} if(n>c.a) return false;
1461
+ } break;
1462
+ case cg::op::OBJ_PROPS_START: if(t=="object"){
1463
+ simdjson::ondemand::object o; if(value.get(o)!=SUCCESS) return false;
1464
+ struct pd{std::string_view nm;uint32_t si;};
1465
+ std::vector<pd> props; bool no_add=false;
1466
+ size_t j=i+1;
1467
+ for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j){
1468
+ if(code[j].o==cg::op::OBJ_PROP) props.push_back({p.strings[code[j].a],code[j].b});
1469
+ else if(code[j].o==cg::op::CHECK_NO_ADDITIONAL) no_add=true;
1470
+ }
1471
+ for(auto field:o){
1472
+ simdjson::ondemand::raw_json_string rk; if(field.key().get(rk)!=SUCCESS) return false;
1473
+ std::string_view key = field.unescaped_key();
1474
+ bool matched=false;
1475
+ for(auto& pp:props){
1476
+ if(key==pp.nm){
1477
+ simdjson::ondemand::value fv; if(field.value().get(fv)!=SUCCESS) return false;
1478
+ if(!od_exec(p,p.subs[pp.si],fv)) return false;
1479
+ matched=true; break;
1480
+ }
1481
+ }
1482
+ if(!matched&&no_add) return false;
1483
+ }
1484
+ i=j; break;
1485
+ } else { size_t j=i+1; for(;j<code.size()&&code[j].o!=cg::op::OBJ_PROPS_END;++j); i=j; } break;
1486
+ case cg::op::OBJ_PROP: case cg::op::OBJ_PROPS_END: case cg::op::CHECK_NO_ADDITIONAL: break;
1487
+
1488
+ // These require full materialization — bail to DOM path
1489
+ case cg::op::CHECK_UNIQUE_ITEMS:
1490
+ case cg::op::CHECK_ENUM_STR:
1491
+ case cg::op::CHECK_ENUM:
1492
+ case cg::op::CHECK_CONST:
1493
+ case cg::op::COMPOSITION:
1494
+ return false;
1495
+ }
1496
+ }
1497
+ return true;
1498
+ }
1499
+
1500
+ // Determine if a codegen plan can use On Demand (no enum/const/uniqueItems)
1501
+ static bool plan_supports_ondemand(const cg::plan& p) {
1502
+ for (auto& c : p.code) {
1503
+ if (c.o == cg::op::CHECK_UNIQUE_ITEMS || c.o == cg::op::CHECK_ENUM_STR ||
1504
+ c.o == cg::op::CHECK_ENUM || c.o == cg::op::CHECK_CONST ||
1505
+ c.o == cg::op::COMPOSITION)
1506
+ return false;
1507
+ }
1508
+ // Also check sub-plans
1509
+ for (auto& sub : p.subs) {
1510
+ for (auto& c : sub) {
1511
+ if (c.o == cg::op::CHECK_UNIQUE_ITEMS || c.o == cg::op::CHECK_ENUM_STR ||
1512
+ c.o == cg::op::CHECK_ENUM || c.o == cg::op::CHECK_CONST ||
1513
+ c.o == cg::op::COMPOSITION)
1514
+ return false;
1515
+ }
1516
+ }
1517
+ return true;
1518
+ }
1519
+
970
1520
  schema_ref compile(std::string_view schema_json) {
971
1521
  auto ctx = std::make_shared<compiled_schema>();
972
1522
  ctx->raw_schema = std::string(schema_json);
@@ -980,6 +1530,11 @@ schema_ref compile(std::string_view schema_json) {
980
1530
 
981
1531
  ctx->root = compile_node(doc, *ctx);
982
1532
 
1533
+ // Generate codegen plan
1534
+ cg_compile(ctx->root.get(), ctx->gen_plan, ctx->gen_plan.code);
1535
+ ctx->gen_plan.code.push_back({cg::op::END});
1536
+ ctx->use_ondemand = plan_supports_ondemand(ctx->gen_plan);
1537
+
983
1538
  schema_ref ref;
984
1539
  ref.impl = ctx;
985
1540
  return ref;
@@ -992,13 +1547,42 @@ validation_result validate(const schema_ref& schema, std::string_view json,
992
1547
  }
993
1548
 
994
1549
  auto padded = simdjson::padded_string(json);
1550
+
1551
+ // Ultra-fast path: On Demand (no DOM materialization)
1552
+ // Only beneficial for larger documents where DOM materialization cost dominates
1553
+ static constexpr size_t OD_THRESHOLD = 32;
1554
+ if (schema.impl->use_ondemand && !schema.impl->gen_plan.code.empty() &&
1555
+ json.size() >= OD_THRESHOLD) {
1556
+ auto od_result = schema.impl->od_parser.iterate(padded);
1557
+ if (!od_result.error()) {
1558
+ simdjson::ondemand::value root_val;
1559
+ if (od_result.get_value().get(root_val) == SUCCESS) {
1560
+ if (od_exec(schema.impl->gen_plan, schema.impl->gen_plan.code, root_val)) {
1561
+ return {true, {}};
1562
+ }
1563
+ }
1564
+ }
1565
+ // On Demand said invalid — fall through to DOM for error details
1566
+ }
1567
+
995
1568
  auto result = schema.impl->doc_parser.parse(padded);
996
1569
  if (result.error()) {
997
1570
  return {false, {{error_code::invalid_json, "", "invalid JSON document"}}};
998
1571
  }
999
1572
 
1573
+ // Fast path: codegen bytecode execution (DOM)
1574
+ if (!schema.impl->use_ondemand && !schema.impl->gen_plan.code.empty()) {
1575
+ if (cg_exec(schema.impl->gen_plan, schema.impl->gen_plan.code,
1576
+ result.value())) {
1577
+ return {true, {}};
1578
+ }
1579
+ // Codegen said invalid OR hit COMPOSITION — fall through to tree walker
1580
+ }
1581
+
1582
+ // Slow path: re-parse + tree walker with error details
1583
+ auto result2 = schema.impl->doc_parser.parse(padded);
1000
1584
  std::vector<validation_error> errors;
1001
- validate_node(schema.impl->root, result.value(), "", *schema.impl, errors,
1585
+ validate_node(schema.impl->root, result2.value(), "", *schema.impl, errors,
1002
1586
  opts.all_errors);
1003
1587
 
1004
1588
  return {errors.empty(), std::move(errors)};