json-schema-utils 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jsutils/stats.py ADDED
@@ -0,0 +1,1310 @@
1
+ from typing import Any
2
+ import re
3
+ from urllib.parse import unquote as url_unquote
4
+
5
+ from .utils import JsonSchema, log
6
+
7
+ #
8
+ # FROM JSON MODE
9
+ #
10
+
11
+
12
def is_regex(s: str) -> bool:
    """Tell whether *s* is acceptable as a regular expression.

    A string is accepted when Python's ``re`` module compiles it, or when
    compilation fails but the pattern contains an ECMAScript-only escape
    (``\\p{...}``, ``\\P{...}`` or ``\\c<letter>``).  Any non-string value is
    rejected.  A rejected string is reported through ``log.warning``.
    """
    if not isinstance(s, str):
        return False

    try:
        re.compile(s)
    except Exception as err:
        # Constructs Python does not know about but which are valid in ECMA 262:
        # \c ControlLetter (ECMA 262 v13 - 22.2.1 p. 552)
        # \p{UnicodePropertyValueExpression} (same p. 553)
        ecma_only = re.search(r"\\[Pp]\{", s) or re.search(r"\\c[a-zA-Z]", s)
        if ecma_only is None:
            log.warning(f"invalid /{s}/: {err}")
            return False
        return True

    return True
26
+
27
+
28
def distinct_values(val):
    """Tell whether all items of a list, tuple or string are distinct.

    Anything which is not a list, tuple or string yields ``False``.  When the
    items are not hashable, a warning is logged and a slower comparison based
    on ``==`` is used instead.
    """
    if not isinstance(val, (list, tuple, str)):
        return False

    try:
        return len(set(val)) == len(val)
    except TypeError as err:
        # log.warning(f"ignoring error: {err}", exc_info=True)
        log.warning(f"ignoring error: {err}")

    # slow iterative version which only relies on ==
    visited = []
    for item in val:
        if item in visited:
            return False
        visited.append(item)
    return True
45
+
46
+
47
+ def _json_metrics_rec(j, counts: dict[str, int]):
48
+
49
+ if j is None:
50
+ counts["null"] += 1
51
+ elif isinstance(j, bool):
52
+ counts["bool"] += 1
53
+ elif isinstance(j, int):
54
+ counts["int"] += 1
55
+ elif isinstance(j, float):
56
+ counts["float"] += 1
57
+ elif isinstance(j, str):
58
+ counts["string"] += 1
59
+ elif isinstance(j, (list, tuple)):
60
+ counts["array"] += 1
61
+ counts["items"] += len(j)
62
+ for i in j:
63
+ _json_metrics_rec(i, counts)
64
+ elif isinstance(j, dict):
65
+ counts["object"] += 1
66
+ counts["props"] += len(j)
67
+ for p, v in j.items():
68
+ assert isinstance(p, str), "json property must be a string"
69
+ counts["string"] += 1
70
+ _json_metrics_rec(v, counts)
71
+ else:
72
+ raise Exception(f"unexpected type: {type(j)}")
73
+
74
+
75
def json_metrics(j) -> dict[str, int]:
    """Return per-kind counters of the values found in JSON data *j*."""
    kinds = ("null", "bool", "int", "float", "string", "array", "object", "props", "items")
    counts = dict.fromkeys(kinds, 0)
    _json_metrics_rec(j, counts)
    return counts
92
+
93
+
94
# Property names which suggest a JSON schema.
# Kept as a set, so membership tests are cheap and order is irrelevant.
DOUBTFUL_PROPERTY_NAMES = {
    "$vocabulary", "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
    "prefixItems", "additionalItems",
    "minContains", "maxContains", "unevaluatedItems",
    "properties", "minProperties", "maxProperties", "patternProperties", "additionalProperties",
    "unevaluatedProperties", "dependentRequired", "propertyNames",
    "allOf", "anyOf", "oneOf"
}
103
+
104
# PER TYPE PROPERTIES
# Maps a category name to the keywords which belong to it.  Some categories
# are JSON types ("number", "string", "array", "object"), the others
# ("hyper", "meta", "alone", "combi") are groupings of keywords.
PER_TYPE = {
    # out: $ref and $dynamicRef type
    "hyper": [ "base", "links", "href", "rel" ],  # Draft 3 Section 6 Hyper Schema (partial)
    "meta": [ "$schema", "$vocabulary", "$id", "$anchor", "$dynamicAnchor", "$comment",
              "title", "description", "default", "examples", "deprecated", "readOnly",
              "writeOnly", "id", "context", "notes" ],
    "alone": [ "enum", "const" ],
    # number also stands for integer
    "number": [ "minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
                "divisibleBy" ],
    "string": [ "minLength", "maxLength", "pattern", "contentMediaType", "contentEncoding",
                "contentSchema" ],
    "array": [ "items", "prefixItems", "additionalItems", "minItems", "maxItems",
               "uniqueItems", "contains", "minContains", "maxContains", "unevaluatedItems" ],
    "object": [ "properties", "minProperties", "maxProperties", "patternProperties",
                "additionalProperties", "unevaluatedProperties", "required-list",
                "dependentRequired", "propertyNames" ],
    "combi": [ "allOf", "anyOf", "oneOf", "if", "then", "else", "not" ],
}

# Reverse mapping of PER_TYPE: keyword -> category name.
# Each keyword above appears in a single category, so no entry is overwritten.
PROP_TO_TYPE: dict[str, str] = {}
for t, props in PER_TYPE.items():
    for prop in props:
        PROP_TO_TYPE[prop] = t
129
+
130
# Keywords whose value is not a schema
SCHEMA_KEYS_SIMPLE = [
    # core
    "$schema", "$vocabulary", "$id", "$anchor", "$dynamicAnchor", "$ref", "$dynamicRef",
    "$comment",
    # metadata
    "title", "description", "default", "examples", "deprecated", "readOnly", "writeOnly",
    # types
    "type", "enum", "const", "format",
    # validation
    "minimum", "maximum", "multipleOf", "exclusiveMaximum", "exclusiveMinimum",
    "minLength", "maxLength", "minItems", "maxItems", "minProperties", "maxProperties",
    "pattern", "minContains", "maxContains", "uniqueItems",
    "contentMediaType", "contentEncoding", "contentSchema",
    "required", "dependentRequired",
    # UNSURE, OLD?
    "id", "context", "notes", "optional", "base", "links", "rel", "href", "requires",
]

# Keywords whose value is a single schema
SCHEMA_KEYS_VALUE_SCHEMA = [
    "not", "if", "then", "else", "items", "contains", "additionalProperties",
    "propertyNames", "unevaluatedItems", "unevaluatedProperties",
    # OLD?
    # beware of dependencies which is both Schema or {"": [""]}
    "additionalItems", "dependencies", "extends",
]

# Keywords whose value is an array of schemas
# NOTE "items" and "extends" are also listed in SCHEMA_KEYS_VALUE_SCHEMA
SCHEMA_KEYS_ARRAY_OF_SCHEMAS = [
    "allOf", "anyOf", "oneOf", "prefixItems",
    # OLD
    "items", "extends",
]

# Keywords whose value is an object, the values of which are schemas
SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS = [
    "$defs", "definitions",  # old version
    "dependentSchemas", "properties", "patternProperties",
]

# typical typos…
# Property names reported as typos rather than as unknown keywords.
SCHEMA_KEYS_TYPOS = [
    "typeof", "min", "max", "comment", "_comment", "comments", "minSize", "maxSize", "example",
    "readonly", "writeonly",
    "desription", "despcription", "descritpion", "descrition", "decription", "descrption",
    "descripiton", "descripition", "decsription", "descripion", "Description", "unique", "@type",
    "defaults", "$default", "ContentType",
    "schema", "schemas", "link", "constant", "required:", "minimum:", "maximum:", "Schema",
    "Default", "Type", "$type", "ref", "@id", "_id", "refs", "__ref", "#ref", "type:",
    "$allOf", "$anyOf", "$oneOf", "anyof", "allof", "oneof",
    "AllOf", "OneOf", "AnyOf", "$types", "#/anyOf", "#/allOf", "#/oneOf", "$extend", "$extends",
    "$rel",
    "read-only", "write-only", "minitems", "maxitems", "maxLen", "minLen", "maxValue", "minValue",
    "max_length", "min_length",
    "maxlength", "minlength", "minLenght", "maxLenght", "regex", "allOf:indexes: 1",
    "allOf:indexes: 0", "$version", "Ref",
    "numItems", "require", "patterns", "properites", "$deprecated", "deprecation",
    "requiredProperties", "property", "Id",
    "minimal", "maximal", "inclusiveMinimum", "inclusiveMaximum", "Comment", "$refs", "enums",
    "Minimum", "Maximum", "totalItems", "additional_properties", "prefix",
]
189
+
190
# "type" is managed manually
SPECIAL_VALUES = [ "$schema" ]

# Additional counter names, beyond plain keywords
SPECIALS = [
    # boolean JSON schema, empty schema
    "true", "false", "{}",
    # also some values or constructs
    "type-list", "type-list-one", "type-list-empty",
    "type=null", "type=boolean", "type=integer", "type=number", "type=string", "type=array",
    "type=object",
    "items-list",
    "additionalProperties=true", "additionalItems=true",
    "additionalProperties=false", "additionalItems=false",
    "exclusiveMinimum=true", "exclusiveMinimum=false",
    "exclusiveMaximum=true", "exclusiveMaximum=false",
    "required-count", "required=true", "required=false", "required-empty", "required-list",
    "required-bool",
    "allOf-count", "anyOf-count", "oneOf-count", "prefixItems-count", "items-count",
    "allOf-one", "anyOf-one", "oneOf-one", "prefixItems-one", "items-one", "enum-one",
    "allOf-empty", "anyOf-empty", "oneOf-empty", "prefixItems-empty", "items-empty", "enum-empty",
    # schemas
    "properties-count", "patternProperties-count", "dependentSchemas-count",
    "$defs-count", "definitions-count", "extends-count", "extends-one",
    # missing?
    "<unknown>", "<typos>", "<version>",
]

# Keywords listed as integer-valued
INTEGER_KEYWORDS = [
    "minItems", "maxItems",
    "minProperties", "maxProperties",
    "minLength", "maxLength",
    "minContains", "maxContains",
]

# Keywords listed as number-valued
NUMBER_KEYWORDS = [
    "minimum", "maximum"
]

# Expected values of the "format" keyword
FORMATS = [
    "date", "date-time", "time", "duration",
    "email", "idn-email",
    "hostname", "idn-hostname", "ipv4", "ipv6",
    "uri", "uri-reference", "uri-template",
    # what is an iri is beyond comprehension, and has been removed
    "iri", "iri-reference",
    "uuid",
    "json-pointer", "relative-json-pointer",
    "regex",
    # OLD
    "color", "phone",
]

# NOTE there are other formats in other OpenAPI versions?
OPENAPI_310_FORMATS = [
    "int32", "int64", "float", "double", "password"
]

# Keywords attributed to OpenAPI 3.1.0 rather than to JSON Schema
OPENAPI_310_KEYWORDS = [
    "discriminator", "xml", "externalDocs", "example"
]

# collected sets need to be changed to lists for json serialization
SETS = [
    "<typos-keywords>", "<typos-keywords-where>", "<unknown-keywords>",
    "<errors>", "<bad-properties-nesting-where>", "<openapi>", "<extensions>",
]
256
+
257
+
258
# all known schema keywords, whatever the kind of value they expect
SCHEMA_KEYS: set[str] = {
    *SCHEMA_KEYS_SIMPLE,
    *SCHEMA_KEYS_VALUE_SCHEMA,
    *SCHEMA_KEYS_ARRAY_OF_SCHEMAS,
    *SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS,
}

# all expected counter names to initialize: keywords first, then specials
# (keywords listed in several categories appear several times)
SCHEMA_KEYS_INIT = [
    *SCHEMA_KEYS_SIMPLE,
    *SCHEMA_KEYS_VALUE_SCHEMA,
    *SCHEMA_KEYS_ARRAY_OF_SCHEMAS,
    *SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS,
    *SPECIALS,
]
274
+
275
+ #
276
+ # VERSION GUESSING
277
+ #
278
+
279
# Internal version numbers: drafts 01 to 07 map to 1..7, 2019-09 is 8 and
# 2020-12 is 9 (see SCHEMA_VERSIONS below).
CURRENT_VERSION = 9
NEXT_VERSION = CURRENT_VERSION + 1
LATEST_VERSION = NEXT_VERSION + 1

# explicit version identification in $schema
# 0 for not set, -1 if multiply set; error?
# Keys are substrings looked for in the "$schema" URL.
SCHEMA_VERSIONS = {
    "/draft-01/": 1,
    "/draft-02/": 2,
    "/draft-03/": 3,
    "/draft-04/": 4,
    "/draft-05/": 5,  # probably not used anywhere?
    "/draft-06/": 6,
    "/draft-07/": 7,
    "/draft-08/": 8,
    "/draft-2019-09/": 8,
    "/draft/2019-09/": 8,
    "/draft-2020-12/": 9,
    "/draft/2020-12/": 9,
    "/draft-next/": NEXT_VERSION,
    "/draft/next/": NEXT_VERSION,
    "json-schema.org/schema": LATEST_VERSION,
}
302
+
303
# version specific keywords which help guessing the schema
# note: some keywords type can also help guessing…
#   items list vs simple schema
#   formats…
# Maps a counter name (keyword or special) to the list of internal version
# numbers in which it is expected.
SCHEMA_VERSION_GUESS = {
    # TODO boolean schemas, *BUT* problems with "additional{Items,Properties}"…
    "type=any": [1, 2, 3],
    "requires": [1, 2],
    "required-bool": [3],
    "required-list": [4, 5, 6, 7, 8, 9],
    "exclusiveMinimum=true": [3, 4],
    "exclusiveMinimum=false": [3, 4],
    "exclusiveMaximum=true": [3, 4],
    "exclusiveMaximum=false": [3, 4],
    "items-list": [1, 2, 3, 4, 5, 6, 7, 8],  # 8: deprecated?
    "maxDecimal": [1],
    "optional": [1, 2],
    "additionalItems": [3, 4, 5, 6, 7, 8],  # 8: deprecated?
    "prefixItems": [9],
    "minimumCanEqual": [1, 2],
    "maximumCanEqual": [1, 2],
    "contentEncoding": [1, 2, 7, 8, 9],  # disappear then reappears!
    "exclusiveMinimum": [3, 4, 5, 6, 7, 8, 9],
    "exclusiveMaximum": [3, 4, 5, 6, 7, 8, 9],
    "patternProperties": [3, 4, 5, 6, 7, 8, 9],
    "divisibleBy": [2, 3],
    "disallow": [1, 2, 3],
    "extends": [1, 2, 3],
    "uniqueItems": [2, 3, 4, 5, 6, 7, 8, 9],
    "multipleOf": [4, 5, 6, 7, 8, 9],
    "minProperties": [4, 5, 6, 7, 8, 9],
    "maxProperties": [4, 5, 6, 7, 8, 9],
    "allOf": [4, 5, 6, 7, 8, 9],
    "anyOf": [4, 5, 6, 7, 8, 9],
    "oneOf": [4, 5, 6, 7, 8, 9],
    "not": [4, 5, 6, 7, 8, 9],
    "const": [6, 7, 8, 9],
    "propertyNames": [6, 7, 8, 9],
    "id": [1, 2, 3, 4, 5],
    "$id": [6, 7, 8, 9],
    "if": [7, 8, 9],
    "then": [7, 8, 9],
    "else": [7, 8, 9],
    "contentMediaType": [7, 8, 9],
    "$comment": [7, 8, 9],
    "readOnly": [7, 8, 9],
    "writeOnly": [7, 8, 9],
    "definitions": [4, 5, 6, 7, 8, 9],  # deprecated 8- (official 9)
    "dependencies": [3, 4, 5, 6, 7, 8, 9],  # deprecated 8- (official 9)
    # the keyword is "$defs" (plural), as in SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS
    "$defs": [8, 9],
    "deprecated": [8, 9],
    "dependentSchemas": [8, 9],
    "dependentRequired": [8, 9],
    "unevaluatedItems": [8, 9],
    "unevaluatedProperties": [8, 9],
    "$recursiveRef": [8],
    "$recursiveAnchor": [8],
    "$dynamicRef": [9],
    "$dynamicAnchor": [9],
    "propertyDependencies": [10],  # new online draft
    # OpenAPI 3.1.0 extension
    "discriminator": [100],
    # "propertyName": [100],
    # "mapping": [100],
    "externalDocs": [100],
    "xml": [100],
}
370
+
371
# Formats listed below with every version from 1 to 9: they give no clue
# about the version.
FORMAT_ALL_VERSIONS = [ "date-time", "uri", "email", "ipv6" ]

# which formats are allowed at each versions
FORMAT_VERSIONS = {
    "date": [1, 2, 3, 7, 8, 9],
    "date-time": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "time": [1, 2, 3, 7, 8, 9],
    "duration": [8, 9],
    "utc-millisec": [3],
    "regex": [1, 2, 3, 7, 8, 9],
    "color": [1, 2, 3],
    "style": [1, 2, 3],
    "phone": [1, 2, 3],
    "uri": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "iri": [7, 8, 9],  # RFC 3987
    "uri-ref": [5],
    "uri-reference": [6, 7, 8, 9],
    "iri-reference": [7, 8, 9],
    "uuid": [8, 9],  # 9? was it really in 2020-12?
    "uri-template": [6, 7, 8, 9],
    "json-pointer": [6, 7, 8, 9],
    "relative-json-pointer": [7, 8, 9],
    "email": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "idn-email": [7, 8, 9],
    "ip-address": [1, 2, 3],
    "ipv4": [4, 5, 6, 7, 8, 9],
    "ipv6": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "host-name": [3],
    "hostname": [4, 5, 6, 7, 8, 9],
    "idn-hostname": [7, 8, 9],
    "street-address": [1, 2],
    "locality": [1, 2],
    "region": [1, 2],
    "country": [1, 2],
    # additional custom formats may be defined with a URL to a definition of the format
    # OpenAPI 3.1:
    # - integer: int32, int64
    # - number: float, double
    # - string: password
}
411
+
412
+
413
# add special version numbers: whatever is listed for the current version
# is also listed for the next and latest ones
for versions in SCHEMA_VERSION_GUESS.values():
    if CURRENT_VERSION in versions:
        versions.extend((NEXT_VERSION, LATEST_VERSION))

# formats available everywhere are dropped from the per-version table
for f in FORMAT_ALL_VERSIONS:
    del FORMAT_VERSIONS[f]

# same version extension for the remaining formats
for versions in FORMAT_VERSIONS.values():
    if CURRENT_VERSION in versions:
        versions.extend((NEXT_VERSION, LATEST_VERSION))
426
+
427
+
428
def guess_version(col: dict):
    """Guess the schema versions compatible with the collected counters.

    Every entry of ``SCHEMA_VERSION_GUESS`` with a positive count in *col*
    allows its listed versions and excludes all the others.  The sorted list
    of versions which are allowed and not excluded is stored in
    ``col["<versions>"]``; without any clue, all versions are kept.  An error
    is collected when no version remains.
    """
    every = set(range(1, LATEST_VERSION + 1))
    allowed, excluded = set(), set()
    clues = []

    for keyword, supported in SCHEMA_VERSION_GUESS.items():
        if keyword not in col or not (col[keyword] > 0):
            continue
        unsupported = every.difference(supported)
        # only remember keywords which bring new information
        if set(supported) - allowed or unsupported - excluded:
            clues.append(keyword)
        allowed.update(supported)
        excluded.update(unsupported)

    # possible versions: everything when there are no clues at all
    candidates = (allowed - excluded) if (allowed or excluded) else every

    col["<versions>"] = sorted(candidates)

    if not candidates:
        collectErr(col, "incompatible version guesses", f"{clues}", "$")
452
+
453
+
454
# all predefined JSON Schema types for Draft 4 and later
JSON_SCHEMA_TYPES = ["null", "boolean", "integer", "number", "string", "array", "object"]


def typeof(v: Any) -> str:
    """Return the JSON Schema type name of a Python value.

    Lists and tuples are both arrays; anything which is not a JSON value
    yields ``"<unknown>"``.
    """
    if v is None:
        return "null"
    # bool must be tested before int, as bool is a subclass of int
    if isinstance(v, bool):
        return "boolean"
    if isinstance(v, int):
        return "integer"
    if isinstance(v, float):
        return "number"
    if isinstance(v, str):
        return "string"
    if isinstance(v, (list, tuple)):
        return "array"
    if isinstance(v, dict):
        return "object"
    return "<unknown>"
467
+
468
+
469
def collectAdd(collection, key, n):
    """Add *n* to the *key* entry of *collection*, creating it when missing."""
    if key not in collection:
        collection[key] = n
        return
    collection[key] += n
474
+
475
+
476
def collectCnt(collection, key):
    """Count one more occurrence of *key* in *collection*."""
    one = 1
    collectAdd(collection, key, one)
478
+
479
+
480
def collectSet(collection, key, val):
    """Add *val* to the set stored under *key*, creating the set when missing."""
    try:
        bucket = collection[key]
    except KeyError:
        bucket = collection[key] = set()
    bucket.add(val)
484
+
485
+
486
def collectErr(collection, cat, what, path):
    """Record a (category, what, path) error in the "<errors>" set."""
    entry = (cat, what, path)
    collectSet(collection, "<errors>", entry)
488
+
489
+
490
def collectTypo(collection, what, path):
    """Record a typo keyword, both by itself and with the path where it was seen."""
    for key, entry in (("<typos-keywords>", what),
                       ("<typos-keywords-where>", (what, path))):
        collectSet(collection, key, entry)
493
+
494
+
495
def ap(path: str, key: str):
    """Append property *key* to the dotted *path*.

    A key made only of ASCII letters, digits and underscores is appended
    as-is, any other key (including the empty string) is put between double
    quotes.  The whole key must match: ``re.fullmatch`` is used because ``$``
    in ``re.search`` also matches before a trailing newline, which would let
    a key such as ``"name\\n"`` through unquoted.
    """
    if re.fullmatch(r"[a-zA-Z0-9_]+", key):
        return f"{path}.{key}"
    return f'{path}."{key}"'
500
+
501
+ #
502
+ # TYPE RESOLUTION
503
+ #
504
+
505
+
506
# the seven JSON Schema type names, and the empty set of types
ALL_TYPES = {"null", "boolean", "integer", "number", "string", "array", "object"}
NO_TYPE = set()


def fixIntNum(types: set[str]):
    """Ensure that integer/number are accepted one for the other.

    *types* is updated in place: when it holds either name, it ends up
    holding both.
    """
    numeric = ("integer", "number")
    if any(name in types for name in numeric):
        types.update(numeric)
516
+
517
+
518
# getTypes cache: "<path>:<sorted context>" to the set of types there
GET_TYPES_CACHE: dict[str, set[str]] = {}


def getTypes(
        jdata: JsonSchema,      # JSON Schema
        defs: dict[str, Any],   # current definitions
        recs: list[str],        # paths to detect recursion
        path: str,              # current path
        context: set[str]       # external context for adjacent keywords
    ) -> set[str]:
    """Return the possible types for the current schema.

    The result starts from the explicit ``type`` (all types when absent), is
    restricted by *context*, then by ``const``, ``enum``, ``$ref``, ``allOf``,
    ``anyOf`` and ``oneOf``.  ``integer`` and ``number`` are always accepted
    one for the other.  Results are memoized in ``GET_TYPES_CACHE`` per path
    and context.

    The returned set is always a fresh object which the caller may modify:
    cache entries are copied on the way in and on the way out, otherwise the
    in-place updates below would corrupt the entry of another path.
    """

    # log.warning(f"types on {path} <- {context}")

    if path and path.endswith(".propertyNames"):
        # we know that we are checking a string
        return { "string" }

    if isinstance(jdata, bool):
        return set(ALL_TYPES if jdata else NO_TYPE)

    if not isinstance(jdata, dict):
        # FIXME should not be possible!
        return { "BAD" }

    # cache shortcut, with a copy so that the entry cannot be altered
    path_ctx = path + ":" + str(sorted(context))
    if path_ctx in GET_TYPES_CACHE:
        return set(GET_TYPES_CACHE[path_ctx])

    # set initial possible types
    if "type" in jdata:
        # if there is an explicit type, it constrains the result
        types = jdata["type"]
        if isinstance(types, str):
            if types == "any":  # early versions…
                possible_types = set(ALL_TYPES)
            elif types in ALL_TYPES:
                possible_types = { types }
            else:
                log.warning(f"unexpected string type: {types}")
                # FIXME NO_TYPE?
                possible_types = set(ALL_TYPES)
        elif isinstance(types, (tuple, list)):
            ltypes = set()
            for i, t in enumerate(types):
                if isinstance(t, str):
                    if t == "any":
                        ltypes.update(ALL_TYPES)
                    elif t in ALL_TYPES:
                        ltypes.add(t)
                    else:
                        log.warning(f"coldly ignoring unexpected type: {t}")
                elif isinstance(t, dict):  # early versions
                    ltypes.update(getTypes(t, defs, recs, f"{path}.type[{i}]", ALL_TYPES))
                else:
                    log.warning(f"unexpected type item type: {typeof(t)}")
            possible_types = ltypes
        elif isinstance(types, dict):  # early versions
            # copy: this set is updated in place below
            possible_types = set(getTypes(types, defs, recs, path + ".type", ALL_TYPES))
        else:
            log.warning(f"unexpected value for type: {typeof(types)}")
            possible_types = { "BOF" }
    else:
        possible_types = set(ALL_TYPES)
    fixIntNum(possible_types)

    # make current explicit types consistent with context
    possible_types.intersection_update(context)
    fixIntNum(possible_types)

    # then reduce with other type informations
    if "const" in jdata:
        possible_types.intersection_update({ typeof(jdata["const"]) })
        fixIntNum(possible_types)

    if "enum" in jdata and isinstance(jdata["enum"], (tuple, list)):
        possible_types.intersection_update(typeof(i) for i in jdata["enum"])
        fixIntNum(possible_types)

    if "$ref" in jdata:
        rpath = jdata["$ref"]
        if isinstance(rpath, str):
            rpathu = url_unquote(rpath)
            if rpathu in recs:
                log.warning(f"preventing recursion on {rpath}")
                # possible_types is left "as-is"?
            elif rpathu in defs:
                possible_types.intersection_update(getTypes(defs[rpathu], defs, recs + [ rpathu ],
                                                            rpathu, possible_types))
                fixIntNum(possible_types)
            else:
                log.warning(f"definition not available: {rpath}")
        else:
            log.warning(f"unexpected $ref value type: {typeof(rpath)}")

    if "allOf" in jdata:
        # all sub-schemas apply: intersect their types
        alls = jdata["allOf"]
        atypes = set(ALL_TYPES)
        if isinstance(alls, (tuple, list)):
            for i, a in enumerate(alls):
                atypes.intersection_update(getTypes(a, defs, recs,  # pyright: ignore
                                                    f"{path}.allOf[{i}]", possible_types))
        else:
            log.warning(f"unexpected allOf type: {typeof(alls)}")
        possible_types.intersection_update(atypes)
        fixIntNum(possible_types)

    for prop in ("anyOf", "oneOf"):
        if prop in jdata:
            # some sub-schema applies: union of their types
            anys = jdata[prop]
            atypes = set()
            if isinstance(anys, (tuple, list)):
                for i, a in enumerate(anys):
                    atypes.update(getTypes(a, defs, recs,  # pyright: ignore
                                           f"{path}.{prop}[{i}]", possible_types))
            else:
                log.warning(f"unexpected {prop} type: {typeof(anys)}")
            possible_types.intersection_update(atypes)
            fixIntNum(possible_types)

    # FIXME if/then/else/not *could* maybe constraint some types as well

    # memoize a copy, the caller owns the returned set
    if path_ctx not in GET_TYPES_CACHE:
        GET_TYPES_CACHE[path_ctx] = set(possible_types)

    # log.warning(f"types on {path} -> {possible_types}")

    return possible_types
648
+
649
+ #
650
+ # COLLECT TYPE HINTS
651
+ #
652
+
653
+
654
# getHints cache: path to the set of hints collected there
GET_HINTS_CACHE: dict[str, set[str]] = {}


def getHints(
        jdata: JsonSchema,      # JSON data
        defs: dict[str, Any],   # current definitions
        recs: list[str],        # paths to detect recursion
        path: str               # current path
    ) -> set[str]:
    """Gather hints about types.

    Hints are the ``PROP_TO_TYPE`` categories of the keywords present in the
    schema, plus ``string`` or ``number`` depending on ``format``, plus the
    hints of a ``$ref`` target and of ``allOf`` sub-schemas (union), plus the
    hints shared by all ``anyOf``/``oneOf`` sub-schemas (intersection,
    restricted to ``ALL_TYPES``).  Results are memoized per path in
    ``GET_HINTS_CACHE``.

    The returned set is always a fresh object: neither the shared ``NO_TYPE``
    constant nor a cache entry is handed out, so that a caller updating the
    result cannot corrupt module state.
    """

    if isinstance(jdata, bool):
        return set(NO_TYPE)

    if not isinstance(jdata, dict):
        log.warning(f"bad schema type at {path}: {typeof(jdata)}")
        return set(NO_TYPE)

    if path in GET_HINTS_CACHE:
        return set(GET_HINTS_CACHE[path])

    hints = set()

    # handle direct hints
    for prop in jdata.keys():
        if prop in PROP_TO_TYPE:
            hints.add(PROP_TO_TYPE[prop])

    # format hint depends on the value
    if "format" in jdata:
        fmt = jdata["format"]
        if isinstance(fmt, str):
            if fmt in FORMAT_ALL_VERSIONS or fmt in FORMAT_VERSIONS or fmt == "password":
                hints.add("string")
            elif fmt in ("integer", "int32", "int64", "float", "double", "uint",
                         "uint32", "uint64"):
                hints.add("number")
            else:
                # unknown value, not hint…
                pass

    # update with indirect hints
    if "$ref" in jdata:
        ref = jdata["$ref"]

        if isinstance(ref, str):
            refu = url_unquote(ref)
            if refu in recs:
                log.warning(f"preventing recursion for hints on {ref}")
            elif refu in defs:
                hints.update(getHints(defs[refu], defs, recs + [refu], refu))
            else:
                log.warning(f"ignoring $ref hints: {ref}")
        else:
            log.warning(f"ignoring bad $ref value type: {typeof(ref)}")

    # combinators
    if "allOf" in jdata:
        schemas = jdata["allOf"]
        if isinstance(schemas, (tuple, list)):
            # union of the hints of all sub-schemas
            shints = set()
            for i, s in enumerate(schemas):
                shints.update(getHints(s, defs, recs, f"{path}.allOf[{i}]"))  # pyright: ignore
            hints.update(shints)
        else:
            log.warning(f"ignoring bad allOf value type: {typeof(schemas)}")

    for prop in ("anyOf", "oneOf"):
        if prop in jdata:
            schemas = jdata[prop]
            if isinstance(schemas, (tuple, list)):
                # only keep hints shared by all alternatives
                shints = set(ALL_TYPES)
                for i, s in enumerate(schemas):
                    shints.intersection_update(
                        getHints(s, defs, recs, f"{path}.{prop}[{i}]"))  # pyright: ignore
                hints.update(shints)
            else:
                log.warning(f"ignoring bad {prop} value type: {typeof(schemas)}")

    # FIXME should it do something with not/if/then/else?
    # FIXME format?

    # memoize a copy, the caller owns the returned set
    if path not in GET_HINTS_CACHE:
        GET_HINTS_CACHE[path] = set(hints)
    # else cannot happen? or check consistency?

    return hints
741
+
742
+
743
def looks_like_simple_dependencies(data) -> bool:
    """Check whether it is a simple { "": [""] }.

    That is: an object whose keys are strings and whose values are lists (or
    tuples) of strings.  The empty object qualifies.
    """
    if not isinstance(data, dict):
        return False
    return all(
        isinstance(name, str)
        and isinstance(deps, (tuple, list))
        and all(isinstance(dep, str) for dep in deps)
        for name, deps in data.items()
    )
754
+
755
+
756
class Defs:
    """Keep track of definitions and uses.

    Behaves like a mapping from a path to a definition.  Reads of paths which
    end with ``/$defs/<name>`` or ``/definitions/<name>`` are counted, so that
    definitions which were never read can be listed.
    """

    def __init__(self):
        # is it an official definition?
        official = re.compile(r"/(\$defs|definitions)/\w+$")
        self._isdef = official.search
        # path -> definition
        self._defs: dict[str, Any] = {}
        # official definition path -> number of reads
        self._uses: dict[str, int] = {}

    def __setitem__(self, p: str, v):
        """Store a definition; the use counter of an official one is (re)set to 0."""
        self._defs[p] = v
        if self._isdef(p):
            self._uses[p] = 0

    def __contains__(self, p: str):
        """Tell whether a definition is stored under this path."""
        return p in self._defs

    def __getitem__(self, p: str):
        """Return a definition, counting the use when it is an official one."""
        if self._isdef(p):
            self._uses[p] += 1
        return self._defs[p]

    def __delitem__(self, p: str):
        """Remove a definition; its use counter, if any, is kept."""
        del self._defs[p]

    # vs unreachable?
    def unusedDefs(self):
        """Return the set of official definition paths which were never read."""
        return {p for p, count in self._uses.items() if count == 0}
784
+
785
+
786
+ # maybe too much, it could collect the root and use the path when needed.
787
+ def _collect_all_defs_rec(data, defs, path: str = "#"):
788
+ """Collect all possible local definitions just in case…"""
789
+ if isinstance(data, (bool, dict, list, tuple)):
790
+ defs[path] = data
791
+ if data is None or isinstance(data, (bool, int, float, str)):
792
+ pass
793
+ elif isinstance(data, (list, tuple)):
794
+ for i, item in enumerate(data):
795
+ # TODO check JSON Schema url path stuff
796
+ _collect_all_defs_rec(item, defs, f"{path}/{i}")
797
+ elif isinstance(data, dict):
798
+ for k, v in data.items():
799
+ _collect_all_defs_rec(v, defs, f"{path}/{k}")
800
+
801
+
802
+ def _json_schema_stats_rec(
803
+ jdata: JsonSchema, # schema
804
+ path: str, # path to ~
805
+ collection: dict[str, Any], # collected data
806
+ defs: dict[str, Any] = {}, # definitions
807
+ type_context: set[str] = ALL_TYPES, # type restrictions at this point
808
+ is_defs: bool = False, # is this just a definition
809
+ is_logic: bool = False # are we inside a if/then/else/not?
810
+ ) -> None:
811
+ """Recursive usage stats collection about JSON Schema features."""
812
+
813
+ if isinstance(jdata, bool):
814
+ if jdata:
815
+ collection["true"] += 1
816
+ else:
817
+ collection["false"] += 1
818
+ return
819
+
820
+ if not isinstance(jdata, dict):
821
+ collectErr(collection, "invalid root schema type", typeof(jdata), path)
822
+ log.warning(f"skipping: [{path}] {str(jdata)[:64]}")
823
+ return
824
+
825
+ if len(jdata) == 0:
826
+ collection["{}"] += 1
827
+ return
828
+
829
+ # current schema guessing
830
+ if "$schema" in jdata:
831
+ version = jdata["$schema"]
832
+ if isinstance(version, str):
833
+ if "/json-schema.org/" in version:
834
+ for pat, vers in SCHEMA_VERSIONS.items():
835
+ if pat in version:
836
+ cvers = collection["<version>"]
837
+ if cvers == 0:
838
+ collection["<version>"] = vers
839
+ collection["<version-path>"] = path
840
+ elif cvers > 0 and cvers != vers:
841
+ collection["<version>"] = -1
842
+ collectErr(collection, "multiple schema versions",
843
+ f"{cvers} {vers}", path)
844
+ break
845
+ if collection["<version>"] == 0: # not assigned
846
+ collectErr(collection, "unexpected $schema", version, path)
847
+ else:
848
+ collectErr(collection, "unexpected $schema version", version, path)
849
+ else:
850
+ collectErr(collection, "unexpected $schema value type", typeof(version), path)
851
+
852
+ # memoize defs for later
853
+ if "$defs" in jdata and "definitions" in jdata:
854
+ collectErr(collection, "definition issue", "$defs/definitions mix", path)
855
+
856
+ kdefs = "$defs" if "$defs" in jdata else "definitions"
857
+ ldefs = jdata[kdefs] if kdefs in jdata else None
858
+
859
+ if kdefs in jdata and not isinstance(jdata[kdefs], dict):
860
+ collectErr(collection, "definition issue", f"unexpected type {typeof(ldefs)}", path)
861
+
862
+ # get actually possible types
863
+ types = getTypes(jdata, defs, [], path, type_context)
864
+
865
+ if not types:
866
+ collectErr(collection, "type error", "no possible type", path)
867
+
868
+ # condition analysis
869
+ # not?
870
+ if "if" in jdata:
871
+ lpath = path + ".if"
872
+ if "then" not in jdata and "else" not in jdata:
873
+ collectErr(collection, "cond error", "if no then/else", lpath)
874
+ ifs = jdata["if"]
875
+ if isinstance(ifs, dict):
876
+ ifprops, reqprops = set(), set()
877
+ if "properties" in ifs and isinstance(ifs["properties"], dict):
878
+ ifprops.update(ifs["properties"].keys())
879
+ if "required" in ifs and isinstance(ifs["required"], list):
880
+ reqprops.update(ifs["required"])
881
+ if "required" in jdata and isinstance(jdata["required"], list):
882
+ reqprops.update(jdata["required"])
883
+ unreqprops = ifprops - reqprops
884
+ if unreqprops:
885
+ collectErr(collection, "cond error",
886
+ f"ignored un-required if props: {" ".join(sorted(unreqprops))}", lpath)
887
+ elif "then" in jdata or "else" in jdata:
888
+ collectErr(collection, "cond error", "then/else no if", path)
889
+
890
+ # scan all properties
891
+ for prop, val in jdata.items():
892
+
893
+ lpath = ap(path, prop)
894
+
895
+ # TODO be less aggressive when recurring in if/then/else, depends on the outside object
896
+ # TODO check not/if/then/else keywords consistency with outside type
897
+
898
+ # count (expected) prop occurences
899
+ if prop in collection:
900
+ collection[prop] += 1
901
+ else:
902
+ # count typos and unknown and keep track of openapi
903
+ if prop.startswith("x-"):
904
+ collectSet(collection, "<extensions>", prop)
905
+ elif prop in OPENAPI_310_KEYWORDS:
906
+ collectSet(collection, "<openapi>", prop)
907
+ elif prop in SCHEMA_KEYS_TYPOS:
908
+ collection["<typos>"] += 1
909
+ collectTypo(collection, prop, lpath)
910
+ else:
911
+ collection["<unknown>"] += 1
912
+ collectSet(collection, "<unknown-keywords>", prop)
913
+ # this may really
914
+ log.warning(f"unexpected: {prop}")
915
+
916
+ # unknow keyword, try subschemas for draft 01-04
917
+ # this may result in false positive, eg for OpenAPI "example"
918
+ if (isinstance(val, dict) and prop not in ("example", "discriminator") and
919
+ not prop.startswith("x-")):
920
+ _json_schema_stats_rec(val, lpath, collection, defs, is_logic=is_logic)
921
+
922
+ # FIXME because of extensions any keyword should be ignored,
923
+ # probaby hidding away any typo…
924
+
925
+ # recurse in some cases, or specials
926
+ if prop == "type":
927
+
928
+ if isinstance(val, (list, tuple)):
929
+ collection["type-list"] += 1
930
+ if len(val) == 0:
931
+ collection["type-list-empty"] += 1
932
+ elif len(val) == 1:
933
+ collection["type-list-one"] += 1
934
+ vals = val
935
+ elif isinstance(val, str):
936
+ vals = [val]
937
+ else:
938
+ collectErr(collection, "invalid type value", typeof(val), lpath)
939
+ continue
940
+
941
+ nonstr = list(filter(lambda i: not isinstance(i, str), vals))
942
+ if nonstr:
943
+ collectErr(collection, "invalid type value in list", str(nonstr), lpath)
944
+ continue
945
+
946
+ for v in vals:
947
+ if isinstance(v, str):
948
+ pval = f"{prop}={v}"
949
+ if pval in collection:
950
+ collection[pval] += 1
951
+ else:
952
+ collectErr(collection, "unexpected type data", v, path)
953
+ elif isinstance(v, dict):
954
+ collectErr(collection, "maybe unexpected type type", str(v), lpath)
955
+ # try a sub-type object
956
+ _json_schema_stats_rec(v, lpath, collection, defs, is_logic=is_logic)
957
+ else:
958
+ collectErr(collection, "unexpected type type", str(v), lpath)
959
+
960
+ elif prop == "format":
961
+
962
+ # TODO improve format analysis! count all occurrences!?
963
+ if isinstance(val, str):
964
+ if val in OPENAPI_310_FORMATS:
965
+ collectSet(collection, "<openapi>", val)
966
+ continue
967
+ elif val not in FORMATS:
968
+ collectErr(collection, "unexpected format", val, lpath)
969
+ continue
970
+ # count expected format values
971
+ collectCnt(collection, f"format={val}")
972
+ else:
973
+ collectErr(collection, "invalid format type", typeof(val), lpath)
974
+ log.warning(f"ignoring {prop} value")
975
+ continue
976
+
977
+ elif prop == "pattern":
978
+
979
+ if not isinstance(val, str):
980
+ collectErr(collection, "invalid pattern type", typeof(val), lpath)
981
+ elif not is_regex(val):
982
+ collectErr(collection, "invalid regex", val, lpath)
983
+
984
+ elif prop == "patternProperties":
985
+
986
+ if isinstance(val, dict):
987
+ for k, v in val.items():
988
+ # assert isinstance(k, str)
989
+ if not is_regex(k):
990
+ collectErr(collection, "invalid regex", k, lpath)
991
+ else:
992
+ collectErr(collection, "invalid patternProperties type", typeof(val), lpath)
993
+
994
+ elif prop in ("dependencies", "properties"):
995
+
996
+ if not isinstance(val, dict):
997
+ collectErr(collection, f"non object {prop}", typeof(val), lpath)
998
+
999
+ elif prop == "items":
1000
+
1001
+ if isinstance(val, (list, tuple)):
1002
+ collection["items-list"] += 1
1003
+ if collection["<version>"] >= 9 and isinstance(val, (list, tuple)):
1004
+ collectErr(collection, "draft incompatibility",
1005
+ "invalid array value for items after Draft 9", lpath)
1006
+
1007
+ elif (prop == "id" and collection["<version>"] <= 5 and collection["<version>"] != 0 or
1008
+ prop == "$id" and collection["<version>"] >= 6):
1009
+
1010
+ if not isinstance(val, str):
1011
+ collectErr(collection, f"invalid {prop} value type", typeof(val), lpath)
1012
+
1013
+ elif (prop == "id" and collection["<version>"] >= 6 or
1014
+ prop == "$id" and collection["<version>"] <= 5 and collection["<version>"] != 0):
1015
+
1016
+ collectErr(collection, "draft incompatibility", "id/$id draft confusion", lpath)
1017
+
1018
+ elif prop == "$ref":
1019
+ # NOTE "#" is used for recursion at the root
1020
+ if not isinstance(val, str):
1021
+ collectErr(collection, "invalid $ref type", typeof(val), lpath)
1022
+ elif val.startswith("#/"):
1023
+ valu = url_unquote(val)
1024
+ if valu not in defs:
1025
+ collectErr(collection, "dangling $ref value", val, lpath)
1026
+ else:
1027
+ log.warning(f"ignoring $ref value: {val}")
1028
+
1029
+ elif prop == "default":
1030
+ if typeof(val) not in types:
1031
+ collectErr(collection, "type inconsistency",
1032
+ f"default {typeof(val)} / {types}", lpath)
1033
+
1034
+ elif prop == "examples":
1035
+ if isinstance(val, (list, tuple)):
1036
+ for v in val:
1037
+ if typeof(v) not in types:
1038
+ collectErr(collection, "type inconsistency",
1039
+ f"examples {typeof(v)} / {types}", lpath)
1040
+ else:
1041
+ collectErr(collection, f"invalid {prop} value type", typeof(val), lpath)
1042
+
1043
+ # possible recursions
1044
+ if prop in SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS:
1045
+ # log.info(f"object {prop}: {val}")
1046
+ is_a_defs = prop in ("$defs", "definitions")
1047
+ if isinstance(val, dict):
1048
+ for k, v in val.items():
1049
+ _json_schema_stats_rec(v, ap(lpath, k), collection, # pyright: ignore
1050
+ defs, is_defs=is_a_defs, is_logic=is_logic)
1051
+ collection[f"{prop}-count"] += len(val)
1052
+ else:
1053
+ log.warning(f"ignoring {prop} non-object value")
1054
+
1055
+ if prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS: # combinators and others
1056
+ if isinstance(val, (list, tuple)):
1057
+ if len(val) == 0:
1058
+ collectErr(collection, "empty schema array", prop, lpath)
1059
+ context = types if prop in ("allOf", "anyOf", "oneOf") else ALL_TYPES
1060
+ for i, v in enumerate(val):
1061
+ _json_schema_stats_rec(v, f"{lpath}[{i}]", collection, # pyright: ignore
1062
+ defs, context, is_logic=is_logic)
1063
+ else:
1064
+ log.warning(f"ignoring {prop} non-array value")
1065
+
1066
+ if prop in SCHEMA_KEYS_VALUE_SCHEMA:
1067
+ if prop == "dependencies" and looks_like_simple_dependencies(val):
1068
+ # TODO we should check that properties exists!
1069
+ pass
1070
+ elif isinstance(val, (bool, dict)):
1071
+ _json_schema_stats_rec(val, lpath, collection, defs,
1072
+ is_logic=is_logic or prop in ("if", "then", "else", "not"))
1073
+ else:
1074
+ log.warning(f"ignoring {prop} non-schema value")
1075
+
1076
+ if prop in SCHEMA_KEYS_VALUE_SCHEMA and prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS:
1077
+ if not isinstance(val, (bool, dict, list, tuple)):
1078
+ collectErr(collection, "unexpected type", f"{prop} / {typeof(val)}", lpath)
1079
+
1080
+ # special case for which we keep a (truncated) value
1081
+ if prop in SPECIAL_VALUES:
1082
+ key = f"{prop}={str(val)[:64]}"
1083
+ if key not in collection:
1084
+ collection[key] = 0
1085
+ collection[key] += 1
1086
+
1087
+ if prop in INTEGER_KEYWORDS and not isinstance(val, int):
1088
+ collectErr(collection, "non integer value", f"{prop} / {typeof(val)}", lpath)
1089
+
1090
+ elif prop in NUMBER_KEYWORDS and not isinstance(val, (int, float)):
1091
+ collectErr(collection, "non number value", f"{prop} / {typeof(val)}", lpath)
1092
+
1093
+ # Case additionalProperties in Properties
1094
+ if prop == "properties" and isinstance(val, dict):
1095
+ # dans val il y a des propriétés PER_TYPE["object"] mais pas la propriété "properties"
1096
+ doubts = list(filter(lambda k: k in DOUBTFUL_PROPERTY_NAMES, val.keys()))
1097
+ nb_js_prop = len(doubts)
1098
+ # nb_prop = len(val)
1099
+ if nb_js_prop >= 1 and "properties" not in val:
1100
+ collectCnt(collection, "<bad-properties-nesting>")
1101
+ collectSet(collection, "<bad-properties-nesting-where>", f"{path}: {doubts}")
1102
+
1103
+ elif prop == "additionalProperties" and isinstance(val, bool):
1104
+ if val:
1105
+ collection["additionalProperties=true"] += 1
1106
+ else:
1107
+ collection["additionalProperties=false"] += 1
1108
+
1109
+ elif prop == "additionalItems" and isinstance(val, bool):
1110
+ if val:
1111
+ collection["additionalItems=true"] += 1
1112
+ else:
1113
+ collection["additionalItems=false"] += 1
1114
+
1115
+ elif prop == "required":
1116
+ if collection["<version>"] >= 4 and isinstance(val, bool):
1117
+ collectErr(collection, "draft incompatibility",
1118
+ "invalid bool required for Draft 4 and later", lpath)
1119
+ elif collection["<version>"] == 3 and isinstance(val, (list, tuple)):
1120
+ collectErr(collection, "draft incompatibility",
1121
+ "invalid array required for Draft 3", lpath)
1122
+ elif not isinstance(val, (bool, list, tuple)):
1123
+ collectErr(collection, "invalid required type", typeof(val), lpath)
1124
+
1125
+ if isinstance(val, (list, tuple)):
1126
+ collection["required-list"] += 1
1127
+ collection["required-count"] += len(val)
1128
+ if len(val) == 0:
1129
+ collection["required-empty"] += 1
1130
+ if collection["<version>"] == 4:
1131
+ # but is is okay for 6 and later:-/
1132
+ collectErr(collection, "draft incompatibility",
1133
+ "invalid empty required for Draft 4", lpath)
1134
+ elif isinstance(val, bool): # OLD
1135
+ collection["required-bool"] += 1
1136
+ collection[f"required={str(val).lower()}"] += 1
1137
+ else:
1138
+ log.warning(f"unexpected required: {val}")
1139
+ continue
1140
+
1141
+ elif prop == "enum":
1142
+ if isinstance(val, (list, tuple)):
1143
+ if len(val) == 0:
1144
+ collectErr(collection, "empty array enum", "", lpath)
1145
+ collection["enum-empty"] += 1
1146
+ elif len(val) == 1:
1147
+ collection["enum-one"] += 1
1148
+ else:
1149
+ collectErr(collection, "non array enum", typeof(val), lpath)
1150
+
1151
+ elif prop in ("exclusiveMinimum", "exclusiveMaximum"):
1152
+ if isinstance(val, bool):
1153
+ collection[f"{prop}={str(val).lower()}"] += 1
1154
+
1155
+ if prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS:
1156
+ # this silently ignores non lists
1157
+ if isinstance(val, (list, tuple)):
1158
+ collection[f"{prop}-count"] += len(val)
1159
+ if len(val) == 0:
1160
+ collection[f"{prop}-empty"] += 1
1161
+ elif len(val) == 1:
1162
+ collection[f"{prop}-one"] += 1
1163
+
1164
+ if prop in ("type", "required", "enum") and isinstance(val, (list, tuple)):
1165
+ if not distinct_values(val):
1166
+ collectErr(collection, "non unique array", f"{prop} {len(val)}", lpath)
1167
+
1168
+ # FIXME should follow references as well!
1169
+ # FIXME should take care of adjacent keywords in the resolution!
1170
+ # build type hints based on keywords
1171
+ hints = getHints(jdata, defs, [], path)
1172
+
1173
+ # special case for required
1174
+ required_list = "required" in jdata and isinstance(jdata["required"], (list, tuple))
1175
+ if required_list:
1176
+ hints.add(PROP_TO_TYPE["required-list"])
1177
+ # TODO resolve references? recurse??
1178
+ # NOTE filter out constructs which may bring hidden properties
1179
+ if "properties" in jdata and not set(jdata.keys()).intersection({"oneOf", "anyOf", "$ref"}):
1180
+ required = jdata["required"]
1181
+ assert isinstance(required, list) # pyright hint
1182
+ properties = jdata["properties"]
1183
+ assert isinstance(properties, dict) # pyright hint
1184
+ for p in required:
1185
+ assert isinstance(p, str) # pyright hint
1186
+ if p not in properties:
1187
+ collectErr(collection, "unknown required property", p, path)
1188
+ # else: maybe properties are in a reference…
1189
+
1190
+ # check whether found types are compatible with declared types
1191
+ for m in hints:
1192
+ # a type hint is not compatible with possible types
1193
+ if m in JSON_SCHEMA_TYPES and m not in types:
1194
+ if m == "number" and "integer" in types:
1195
+ # ok, integer is a kind of number
1196
+ pass
1197
+ else:
1198
+ # extract direct keywords which hinted to type "m"
1199
+ keywords = set(filter(lambda p: PROP_TO_TYPE.get(p, "") == m, jdata.keys()))
1200
+ if required_list:
1201
+ keywords.add('required')
1202
+ # actual types found
1203
+ # FIXME probably useless
1204
+ foundtypes = set(filter(lambda t: t in JSON_SCHEMA_TYPES, types))
1205
+ if not foundtypes:
1206
+ foundtypes = set(types)
1207
+ collectErr(collection, "bad mix",
1208
+ f"{m}: {sorted(foundtypes)} {sorted(keywords)}", path)
1209
+
1210
+ # no type declarations *BUT* some type hints
1211
+ # NOTE *direct* definitions are skipped, should be triggered when/if used
1212
+ # TODO <= 2? other?
1213
+ if len(types) == 7:
1214
+ type_hints = hints.difference({"meta", "combi", "hyper", "alone"})
1215
+ if not is_defs and not is_logic and len(type_hints) == 1:
1216
+ collectErr(collection, "missing type declaration", f"{type_hints}", path)
1217
+ elif len(type_hints) == 0 and len(path) > 1:
1218
+ collectErr(collection, "suspicious empty type", "*", path)
1219
+
1220
+ # log.debug(f"mixins: {mixins}")
1221
+ mix = "*-" + "-".join(sorted(types)) + "/" + "-".join(sorted(hints))
1222
+
1223
+ if mix != "*-/":
1224
+ if mix in collection:
1225
+ collection[mix] += 1
1226
+ else:
1227
+ collection[mix] = 1
1228
+
1229
+
1230
def json_schema_stats(jdata):
    """Return stats about a JSON data structure.

    The per-run type and hint caches are cleared, local definitions are
    collected, then the schema is analyzed recursively from the root path
    ``"$"``.  On top of the counters filled by the recursion, this function
    sets the following entries:

    - ``<unused-defs>``: sorted list of the unused definitions.
    - ``<explicit-schema>``: whether the root ``$schema`` is a string which
      contains ``/json-schema.org/``.
    - ``<$schema>``: the root ``$schema`` value as found, or the
      ``<unknown_explicit_schema>`` placeholder when there is none.
    - ``<bad-root>``: whether the root is neither a boolean nor an object.
    - ``<schema-prop>``: whether a root property name hints at a schema.

    :param jdata: parsed JSON value to analyze, expected to be a schema.
    :return: the collection dictionary of statistics.
    """

    # caches are global: results from a previous schema must not leak in
    GET_TYPES_CACHE.clear()
    GET_HINTS_CACHE.clear()

    # we first collect all possible local definitions, just in case
    defs = Defs()
    _collect_all_defs_rec(jdata, defs)

    # then proceed to analyze the schema
    collection: dict[str, Any] = {k: 0 for k in SCHEMA_KEYS_INIT}
    _json_schema_stats_rec(jdata, "$", collection, defs)  # type: ignore

    # unused definitions
    collection["<unused-defs>"] = sorted(defs.unusedDefs())

    # do version guessing on the result
    guess_version(collection)

    # official $schema at root?
    if isinstance(jdata, dict) and "$schema" in jdata:
        schema_uri = jdata["$schema"]
        # NOTE a broken schema may hold a non-string $schema: testing the type
        # first avoids a TypeError on scalars and an accidental element
        # membership test on arrays or objects.
        collection["<explicit-schema>"] = (
            isinstance(schema_uri, str) and "/json-schema.org/" in schema_uri)
        # the value is kept as found, whatever its type
        collection["<$schema>"] = schema_uri
    else:
        collection["<explicit-schema>"] = False
        collection["<$schema>"] = "<unknown_explicit_schema>"

    # is the root type compatible with a JSON schema?
    collection["<bad-root>"] = not isinstance(jdata, (bool, dict))

    # look for schema property name hints
    hint_types = ("alone", "number", "string", "array", "object", "combi")
    collection["<schema-prop>"] = isinstance(jdata, dict) and any(
        prop in PROP_TO_TYPE and PROP_TO_TYPE[prop] in hint_types
        for prop in jdata
    )

    # cleanup sets: turn them into sorted lists
    for key in SETS:
        if key in collection:
            collection[key] = sorted(collection[key])

    return collection
1280
+
1281
+
1282
def normalize_ods(fn, schema):
    """Anonymize in place a schema which matches one specific layout.

    The schema is modified only if all these conditions hold:

    - it is an object with exactly the four keys ``title``, ``definitions``,
      ``oneOf`` and ``type``, the latter being ``"object"``;
    - ``oneOf`` holds one entry which is ``{"$ref": "#/definitions/<title>"}``;
    - ``definitions`` holds exactly ``<title>`` and ``<title>_records``.

    When so, the title and both definitions are renamed to ``ANONYM`` and
    ``ANONYM_records`` and the two references are updated accordingly.
    Any other schema is silently left untouched.  Calling the function again
    on an already anonymized schema is harmless.

    :param fn: name of the schema source, only used in the log message.
    :param schema: parsed JSON schema, updated in place when it matches.
    :return: ``None`` in all cases.
    :raises KeyError: if the ``<title>`` definition lacks the
        ``properties`` / ``records`` / ``items`` path; this is raised before
        anything is modified.
    """
    if not isinstance(schema, dict) or len(schema) != 4:
        return
    if any(key not in schema for key in ("title", "definitions", "oneOf", "type")):
        return
    if schema["type"] != "object":
        return

    title = schema["title"]
    oneof = schema["oneOf"]
    defs = schema["definitions"]

    # guard against malformed values so that they are skipped, not crashed on
    if not isinstance(title, str) or not isinstance(defs, dict):
        return
    if not isinstance(oneof, (list, tuple)) or len(oneof) != 1:
        return

    # the single alternative must be a bare reference to the title definition
    # NOTE a missing "$ref" yields None, which never equals the expected string
    ref = oneof[0]
    if (not isinstance(ref, dict) or len(ref) != 1 or
            ref.get("$ref") != f"#/definitions/{title}"):
        return

    rec = f"{title}_records"
    if title not in defs or rec not in defs or len(defs) != 2:
        return

    # resolve the nested reference holder *before* any change, so that an
    # unexpected structure does not leave a half-anonymized schema behind
    items = defs[title]["properties"]["records"]["items"]

    log.warning(f"Anonymizing {fn}")
    schema["title"] = "ANONYM"
    ref["$ref"] = "#/definitions/ANONYM"
    # pop-then-assign is safe even if the title is already "ANONYM"
    defs["ANONYM"] = defs.pop(title)
    defs["ANONYM_records"] = defs.pop(rec)
    items["$ref"] = "#/definitions/ANONYM_records"