ata-validator 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "ata-validator",
3
- "version": "0.5.1",
4
- "description": "Ultra-fast JSON Schema validator with full ajv feature parity. 5.9x faster validation on complex schemas, 2,184x faster compilation. Cross-schema $ref, Draft 7 support, patternProperties/dependentSchemas/propertyNames codegen. V8-optimized JS codegen, simdjson, RE2, multi-core. Standard Schema V1 compatible.",
3
+ "version": "0.6.0",
4
+ "description": "Ultra-fast JSON Schema validator. 4.7x faster validation, 1,800x faster compilation. Works without native addon. Cross-schema $ref, Draft 2020-12 + Draft 7, V8-optimized JS codegen, simdjson, RE2, multi-core. Standard Schema V1 compatible.",
5
5
  "main": "index.js",
6
6
  "types": "index.d.ts",
7
7
  "scripts": {
package/src/ata.cpp CHANGED
@@ -7,6 +7,7 @@
7
7
 
8
8
  #include <algorithm>
9
9
  #include <cmath>
10
+ #include <cstring>
10
11
  #include <re2/re2.h>
11
12
  #include <set>
12
13
  #include <unordered_map>
@@ -350,6 +351,51 @@ struct plan {
350
351
  };
351
352
  } // namespace cg
352
353
 
354
+ // --- On-Demand validation plan ---
355
+ // Grouped checks per value type. Each value consumed exactly once.
356
+ // Built from schema_node at compile time, used by od_exec_plan at runtime.
357
+ struct od_plan {
358
+ uint8_t type_mask = 0;
359
+
360
+ // Numeric — bitmask for which checks to run + flat array of bounds
361
+ enum num_flag : uint8_t {
362
+ HAS_MIN = 1, HAS_MAX = 2, HAS_EX_MIN = 4, HAS_EX_MAX = 8, HAS_MUL = 16
363
+ };
364
+ uint8_t num_flags = 0;
365
+ double num_min = 0, num_max = 0, num_ex_min = 0, num_ex_max = 0, num_mul = 0;
366
+
367
+ // String — single value.get(sv) then all checks
368
+ std::optional<uint64_t> min_length, max_length;
369
+ re2::RE2* pattern = nullptr; // borrowed pointer from schema_node
370
+ uint8_t format_id = 255; // 255 = no format check
371
+
372
+ // Object — single iterate with merged required+property lookup
373
+ struct prop_entry {
374
+ std::string key;
375
+ int required_idx = -1; // bit index for required tracking, or -1
376
+ std::shared_ptr<od_plan> sub; // property sub-plan, or nullptr
377
+ };
378
+ struct obj_plan {
379
+ std::vector<prop_entry> entries; // merged required + properties — single scan
380
+ size_t required_count = 0;
381
+ bool no_additional = false;
382
+ std::optional<uint64_t> min_props, max_props;
383
+ };
384
+ std::shared_ptr<obj_plan> object;
385
+
386
+ // Array — single iterate: items + count
387
+ struct arr_plan {
388
+ std::shared_ptr<od_plan> items;
389
+ std::optional<uint64_t> min_items, max_items;
390
+ };
391
+ std::shared_ptr<arr_plan> array;
392
+
393
+ // If false, schema uses unsupported features — must fall back to DOM path.
394
+ bool supported = true;
395
+ };
396
+
397
+ using od_plan_ptr = std::shared_ptr<od_plan>;
398
+
353
399
  struct compiled_schema {
354
400
  schema_node_ptr root;
355
401
  std::unordered_map<std::string, schema_node_ptr> defs;
@@ -357,6 +403,7 @@ struct compiled_schema {
357
403
  dom::parser parser; // used only at compile time
358
404
  cg::plan gen_plan; // codegen validation plan
359
405
  bool use_ondemand = false; // true if codegen plan supports On Demand
406
+ od_plan_ptr od; // On-Demand execution plan
360
407
  };
361
408
 
362
409
  // Thread-local persistent parsers — reused across all validate calls on the
@@ -1917,6 +1964,256 @@ static simdjson::padded_string_view get_free_padded_view(
1917
1964
  return simdjson::padded_string_view(data, length, length + REQUIRED_PADDING);
1918
1965
  }
1919
1966
 
1967
+ // Build an od_plan from a schema_node tree.
1968
+ static od_plan_ptr compile_od_plan(const schema_node_ptr& node) {
1969
+ if (!node) return nullptr;
1970
+
1971
+ auto plan = std::make_shared<od_plan>();
1972
+
1973
+ if (node->boolean_schema.has_value()) {
1974
+ if (!node->boolean_schema.value()) plan->supported = false;
1975
+ return plan;
1976
+ }
1977
+
1978
+ // Unsupported features → fall back to DOM
1979
+ if (!node->ref.empty() ||
1980
+ !node->enum_values_minified.empty() ||
1981
+ node->const_value_raw.has_value() ||
1982
+ node->unique_items ||
1983
+ !node->all_of.empty() ||
1984
+ !node->any_of.empty() ||
1985
+ !node->one_of.empty() ||
1986
+ node->not_schema ||
1987
+ node->if_schema ||
1988
+ node->contains_schema ||
1989
+ !node->prefix_items.empty() ||
1990
+ !node->pattern_properties.empty() ||
1991
+ !node->dependent_required.empty() ||
1992
+ !node->dependent_schemas.empty() ||
1993
+ node->property_names_schema ||
1994
+ node->additional_properties_schema) {
1995
+ plan->supported = false;
1996
+ return plan;
1997
+ }
1998
+
1999
+ plan->type_mask = node->type_mask;
2000
+ if (node->minimum) { plan->num_flags |= od_plan::HAS_MIN; plan->num_min = *node->minimum; }
2001
+ if (node->maximum) { plan->num_flags |= od_plan::HAS_MAX; plan->num_max = *node->maximum; }
2002
+ if (node->exclusive_minimum) { plan->num_flags |= od_plan::HAS_EX_MIN; plan->num_ex_min = *node->exclusive_minimum; }
2003
+ if (node->exclusive_maximum) { plan->num_flags |= od_plan::HAS_EX_MAX; plan->num_ex_max = *node->exclusive_maximum; }
2004
+ if (node->multiple_of) { plan->num_flags |= od_plan::HAS_MUL; plan->num_mul = *node->multiple_of; }
2005
+ plan->min_length = node->min_length;
2006
+ plan->max_length = node->max_length;
2007
+ plan->pattern = node->compiled_pattern.get();
2008
+ plan->format_id = node->format_id;
2009
+
2010
+ // Object plan — build hash lookup for O(1) per-field dispatch
2011
+ if (!node->properties.empty() || !node->required.empty() ||
2012
+ node->additional_properties_bool.has_value() ||
2013
+ node->min_properties.has_value() || node->max_properties.has_value()) {
2014
+ auto op = std::make_shared<od_plan::obj_plan>();
2015
+ op->required_count = node->required.size();
2016
+ op->min_props = node->min_properties;
2017
+ op->max_props = node->max_properties;
2018
+ if (node->additional_properties_bool.has_value() &&
2019
+ !node->additional_properties_bool.value()) {
2020
+ op->no_additional = true;
2021
+ }
2022
+ // Build merged entries: each key appears once with required_idx + sub_plan
2023
+ std::unordered_map<std::string, size_t> key_to_idx;
2024
+ // Register required keys
2025
+ for (size_t i = 0; i < node->required.size() && i < 64; i++) {
2026
+ auto& rk = node->required[i];
2027
+ if (key_to_idx.find(rk) == key_to_idx.end()) {
2028
+ key_to_idx[rk] = op->entries.size();
2029
+ op->entries.push_back({rk, static_cast<int>(i), nullptr});
2030
+ } else {
2031
+ op->entries[key_to_idx[rk]].required_idx = static_cast<int>(i);
2032
+ }
2033
+ }
2034
+ // Register properties + compile sub-plans
2035
+ for (auto& [key, sub_node] : node->properties) {
2036
+ auto sub = compile_od_plan(sub_node);
2037
+ if (!sub || !sub->supported) { plan->supported = false; return plan; }
2038
+ auto it = key_to_idx.find(key);
2039
+ if (it != key_to_idx.end()) {
2040
+ op->entries[it->second].sub = std::move(sub);
2041
+ } else {
2042
+ key_to_idx[key] = op->entries.size();
2043
+ op->entries.push_back({key, -1, std::move(sub)});
2044
+ }
2045
+ }
2046
+ plan->object = std::move(op);
2047
+ }
2048
+
2049
+ // Array plan
2050
+ if (node->items_schema || node->min_items.has_value() || node->max_items.has_value()) {
2051
+ auto ap = std::make_shared<od_plan::arr_plan>();
2052
+ ap->min_items = node->min_items;
2053
+ ap->max_items = node->max_items;
2054
+ if (node->items_schema) {
2055
+ ap->items = compile_od_plan(node->items_schema);
2056
+ if (!ap->items || !ap->items->supported) { plan->supported = false; return plan; }
2057
+ }
2058
+ plan->array = std::move(ap);
2059
+ }
2060
+
2061
+ return plan;
2062
+ }
2063
+
2064
+ // Fast ASCII check: if all bytes < 0x80, byte length == codepoint length
2065
+ static inline uint64_t utf8_length_fast(std::string_view s) {
2066
+ // Check 8 bytes at a time for non-ASCII
2067
+ const uint8_t* p = reinterpret_cast<const uint8_t*>(s.data());
2068
+ size_t n = s.size();
2069
+ size_t i = 0;
2070
+ uint64_t has_high = 0;
2071
+ for (; i + 8 <= n; i += 8) {
2072
+ uint64_t block;
2073
+ std::memcpy(&block, p + i, 8);
2074
+ has_high |= block & 0x8080808080808080ULL;
2075
+ }
2076
+ for (; i < n; i++) has_high |= p[i] & 0x80;
2077
+ if (has_high == 0) return n; // Pure ASCII — byte count == codepoint count
2078
+ return utf8_length(s); // Fallback to full counting
2079
+ }
2080
+
2081
+ // Execute an od_plan against a simdjson On-Demand value.
2082
+ // Each value consumed exactly once. Uses simdjson types directly — no od_type() overhead.
2083
+ static bool od_exec_plan(const od_plan& plan, simdjson::ondemand::value value) {
2084
+ // Use simdjson type directly — skip od_type() conversion + get_number_type()
2085
+ using sjt = simdjson::ondemand::json_type;
2086
+ sjt st;
2087
+ if (value.type().get(st) != SUCCESS) return false;
2088
+
2089
+ // Type check using simdjson type directly
2090
+ if (plan.type_mask) {
2091
+ uint8_t tbits;
2092
+ switch (st) {
2093
+ case sjt::string: tbits = json_type_bit(json_type::string); break;
2094
+ case sjt::boolean: tbits = json_type_bit(json_type::boolean); break;
2095
+ case sjt::null: tbits = json_type_bit(json_type::null_value); break;
2096
+ case sjt::object: tbits = json_type_bit(json_type::object); break;
2097
+ case sjt::array: tbits = json_type_bit(json_type::array); break;
2098
+ case sjt::number:
2099
+ // Only call get_number_type when schema has type constraint that distinguishes int/number
2100
+ tbits = json_type_bit(json_type::number) | json_type_bit(json_type::integer);
2101
+ if ((plan.type_mask & tbits) != tbits) {
2102
+ // Schema distinguishes — need to check actual number type
2103
+ simdjson::ondemand::number_type nt;
2104
+ if (value.get_number_type().get(nt) == SUCCESS &&
2105
+ nt != simdjson::ondemand::number_type::floating_point_number)
2106
+ tbits = json_type_bit(json_type::integer) | json_type_bit(json_type::number);
2107
+ else
2108
+ tbits = json_type_bit(json_type::number);
2109
+ }
2110
+ break;
2111
+ default: tbits = 0;
2112
+ }
2113
+ if (!(tbits & plan.type_mask)) return false;
2114
+ }
2115
+
2116
+ switch (st) {
2117
+ case sjt::number: {
2118
+ if (!plan.num_flags) break; // No numeric constraints
2119
+ double v;
2120
+ // Try integer first (more common), fall back to double
2121
+ int64_t iv;
2122
+ if (value.get(iv) == SUCCESS) {
2123
+ v = static_cast<double>(iv);
2124
+ } else if (value.get(v) != SUCCESS) {
2125
+ return false;
2126
+ }
2127
+ uint8_t f = plan.num_flags;
2128
+ if ((f & od_plan::HAS_MIN) && v < plan.num_min) return false;
2129
+ if ((f & od_plan::HAS_MAX) && v > plan.num_max) return false;
2130
+ if ((f & od_plan::HAS_EX_MIN) && v <= plan.num_ex_min) return false;
2131
+ if ((f & od_plan::HAS_EX_MAX) && v >= plan.num_ex_max) return false;
2132
+ if (f & od_plan::HAS_MUL) {
2133
+ double r = std::fmod(v, plan.num_mul);
2134
+ if (std::abs(r) > 1e-8 && std::abs(r - plan.num_mul) > 1e-8) return false;
2135
+ }
2136
+ break;
2137
+ }
2138
+ case sjt::string: {
2139
+ std::string_view sv;
2140
+ if (value.get(sv) != SUCCESS) return false;
2141
+ if (plan.min_length || plan.max_length) {
2142
+ uint64_t len = utf8_length_fast(sv);
2143
+ if (plan.min_length && len < *plan.min_length) return false;
2144
+ if (plan.max_length && len > *plan.max_length) return false;
2145
+ }
2146
+ if (plan.pattern) {
2147
+ if (!re2::RE2::PartialMatch(re2::StringPiece(sv.data(), sv.size()), *plan.pattern))
2148
+ return false;
2149
+ }
2150
+ if (plan.format_id != 255) {
2151
+ if (!check_format_by_id(sv, plan.format_id)) return false;
2152
+ }
2153
+ break;
2154
+ }
2155
+ case sjt::object: {
2156
+ if (!plan.object) break;
2157
+ auto& op = *plan.object;
2158
+ simdjson::ondemand::object obj;
2159
+ if (value.get(obj) != SUCCESS) return false;
2160
+
2161
+ uint64_t required_found = 0;
2162
+ uint64_t prop_count = 0;
2163
+
2164
+ for (auto field : obj) {
2165
+ std::string_view key = field.unescaped_key();
2166
+ prop_count++;
2167
+
2168
+ // Single merged scan: required + property in one pass
2169
+ bool matched = false;
2170
+ for (auto& e : op.entries) {
2171
+ if (key == e.key) {
2172
+ if (e.required_idx >= 0)
2173
+ required_found |= (1ULL << e.required_idx);
2174
+ if (e.sub) {
2175
+ simdjson::ondemand::value fv;
2176
+ if (field.value().get(fv) != SUCCESS) return false;
2177
+ if (!od_exec_plan(*e.sub, fv)) return false;
2178
+ }
2179
+ matched = true;
2180
+ break;
2181
+ }
2182
+ }
2183
+ if (!matched && op.no_additional) return false;
2184
+ }
2185
+
2186
+ uint64_t required_mask = (op.required_count >= 64)
2187
+ ? ~0ULL : ((1ULL << op.required_count) - 1);
2188
+ if ((required_found & required_mask) != required_mask) return false;
2189
+ if (op.min_props && prop_count < *op.min_props) return false;
2190
+ if (op.max_props && prop_count > *op.max_props) return false;
2191
+ break;
2192
+ }
2193
+ case sjt::array: {
2194
+ if (!plan.array) break;
2195
+ auto& ap = *plan.array;
2196
+ simdjson::ondemand::array arr;
2197
+ if (value.get(arr) != SUCCESS) return false;
2198
+
2199
+ uint64_t count = 0;
2200
+ for (auto elem : arr) {
2201
+ simdjson::ondemand::value v;
2202
+ if (elem.get(v) != SUCCESS) return false;
2203
+ if (ap.items && !od_exec_plan(*ap.items, v)) return false;
2204
+ count++;
2205
+ }
2206
+ if (ap.min_items && count < *ap.min_items) return false;
2207
+ if (ap.max_items && count > *ap.max_items) return false;
2208
+ break;
2209
+ }
2210
+ default:
2211
+ break;
2212
+ }
2213
+
2214
+ return true;
2215
+ }
2216
+
1920
2217
  schema_ref compile(std::string_view schema_json) {
1921
2218
  auto ctx = std::make_shared<compiled_schema>();
1922
2219
  ctx->raw_schema = std::string(schema_json);
@@ -1934,6 +2231,7 @@ schema_ref compile(std::string_view schema_json) {
1934
2231
  cg_compile(ctx->root.get(), ctx->gen_plan, ctx->gen_plan.code);
1935
2232
  ctx->gen_plan.code.push_back({cg::op::END});
1936
2233
  ctx->use_ondemand = plan_supports_ondemand(ctx->gen_plan);
2234
+ ctx->od = compile_od_plan(ctx->root);
1937
2235
 
1938
2236
  schema_ref ref;
1939
2237
  ref.impl = ctx;
@@ -2006,6 +2304,22 @@ bool is_valid_prepadded(const schema_ref& schema, const char* data, size_t lengt
2006
2304
 
2007
2305
  simdjson::padded_string fallback;
2008
2306
  auto psv = get_free_padded_view(data, length, fallback);
2307
+
2308
+ // On-Demand fast path: skip DOM parse entirely
2309
+ // Minimum 32 bytes — On-Demand doesn't fully validate small malformed docs
2310
+ if (schema.impl->od && schema.impl->od->supported && length >= 32) {
2311
+ auto od_result = tl_od_parser().iterate(psv);
2312
+ if (!od_result.error()) {
2313
+ simdjson::ondemand::value root_val;
2314
+ if (od_result.get_value().get(root_val) == SUCCESS) {
2315
+ if (od_exec_plan(*schema.impl->od, root_val)) {
2316
+ return true;
2317
+ }
2318
+ }
2319
+ }
2320
+ psv = get_free_padded_view(data, length, fallback);
2321
+ }
2322
+
2009
2323
  auto result = tl_dom_parser().parse(psv);
2010
2324
  if (result.error()) return false;
2011
2325
 
@@ -2013,8 +2327,20 @@ bool is_valid_prepadded(const schema_ref& schema, const char* data, size_t lengt
2013
2327
  return cg_exec(schema.impl->gen_plan, schema.impl->gen_plan.code, result.value());
2014
2328
  }
2015
2329
 
2016
- // Use fast boolean-only tree walker — no error collection overhead
2017
2330
  return validate_fast(schema.impl->root, result.value(), *schema.impl);
2018
2331
  }
2019
2332
 
2333
+ bool is_valid_buf(const schema_ref& schema, const uint8_t* data, size_t length) {
2334
+ if (!schema.impl || !schema.impl->root || !data || length == 0) return false;
2335
+
2336
+ // Thread-local buffer with simdjson padding — reused across calls
2337
+ thread_local std::string tl_buf;
2338
+ const size_t needed = length + REQUIRED_PADDING;
2339
+ if (tl_buf.size() < needed) tl_buf.resize(needed);
2340
+ std::memcpy(tl_buf.data(), data, length);
2341
+ std::memset(tl_buf.data() + length, 0, REQUIRED_PADDING);
2342
+
2343
+ return is_valid_prepadded(schema, tl_buf.data(), length);
2344
+ }
2345
+
2020
2346
  } // namespace ata