json-schema-utils 0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_schema_utils-0.8.dist-info/METADATA +119 -0
- json_schema_utils-0.8.dist-info/RECORD +15 -0
- json_schema_utils-0.8.dist-info/WHEEL +5 -0
- json_schema_utils-0.8.dist-info/entry_points.txt +7 -0
- json_schema_utils-0.8.dist-info/licenses/LICENSE +1 -0
- json_schema_utils-0.8.dist-info/top_level.txt +1 -0
- jsutils/__init__.py +5 -0
- jsutils/convert.py +934 -0
- jsutils/inline.py +206 -0
- jsutils/recurse.py +90 -0
- jsutils/schemas.py +151 -0
- jsutils/scripts.py +396 -0
- jsutils/simplify.py +580 -0
- jsutils/stats.py +1310 -0
- jsutils/utils.py +44 -0
jsutils/stats.py
ADDED
|
@@ -0,0 +1,1310 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
import re
|
|
3
|
+
from urllib.parse import unquote as url_unquote
|
|
4
|
+
|
|
5
|
+
from .utils import JsonSchema, log
|
|
6
|
+
|
|
7
|
+
#
|
|
8
|
+
# FROM JSON MODE
|
|
9
|
+
#
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_regex(s: str) -> bool:
    """Tell whether *s* is a string that can be accepted as a regular expression.

    A non-string is never a regex.  A string is accepted when Python's ``re``
    compiles it, or when compilation fails but the text contains one of the
    two constructs that the notes below attribute to ECMA 262:

    - ``\\c`` ControlLetter (ECMA 262 v13 - 22.2.1 p. 552)
    - ``\\p{UnicodePropertyValueExpression}`` (same p. 553)

    A warning is logged for strings rejected on both counts.
    """
    if not isinstance(s, str):
        return False

    try:
        re.compile(s)
    except Exception as err:
        # compilation failed: look for the constructs listed in the docstring
        ecma_only = re.search(r"\\[Pp]\{", s) or re.search(r"\\c[a-zA-Z]", s)
        if ecma_only is None:
            log.warning(f"invalid /{s}/: {err}")
            return False
        return True

    return True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def distinct_values(val):
    """Tell whether *val* is a list, tuple or string without repeated elements.

    Any other kind of value yields ``False``.  The fast path compares the
    length of the sequence with the length of its ``set``; when some element
    is unhashable (``TypeError``), a warning is logged and a slower
    equality-based scan is used instead.
    """
    if not isinstance(val, (list, tuple, str)):
        return False

    try:
        return len(set(val)) == len(val)
    except TypeError as err:
        # log.warning(f"ignoring error: {err}", exc_info=True)
        log.warning(f"ignoring error: {err}")

    # slow iterative version which only relies on ==
    visited = []
    for item in val:
        if item in visited:
            return False
        visited.append(item)
    return True
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _json_metrics_rec(j, counts: dict[str, int]):
|
|
48
|
+
|
|
49
|
+
if j is None:
|
|
50
|
+
counts["null"] += 1
|
|
51
|
+
elif isinstance(j, bool):
|
|
52
|
+
counts["bool"] += 1
|
|
53
|
+
elif isinstance(j, int):
|
|
54
|
+
counts["int"] += 1
|
|
55
|
+
elif isinstance(j, float):
|
|
56
|
+
counts["float"] += 1
|
|
57
|
+
elif isinstance(j, str):
|
|
58
|
+
counts["string"] += 1
|
|
59
|
+
elif isinstance(j, (list, tuple)):
|
|
60
|
+
counts["array"] += 1
|
|
61
|
+
counts["items"] += len(j)
|
|
62
|
+
for i in j:
|
|
63
|
+
_json_metrics_rec(i, counts)
|
|
64
|
+
elif isinstance(j, dict):
|
|
65
|
+
counts["object"] += 1
|
|
66
|
+
counts["props"] += len(j)
|
|
67
|
+
for p, v in j.items():
|
|
68
|
+
assert isinstance(p, str), "json property must be a string"
|
|
69
|
+
counts["string"] += 1
|
|
70
|
+
_json_metrics_rec(v, counts)
|
|
71
|
+
else:
|
|
72
|
+
raise Exception(f"unexpected type: {type(j)}")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def json_metrics(j) -> dict[str, int]:
    """Return per-kind counters for the JSON value *j*.

    The result always holds the nine counters "null", "bool", "int", "float",
    "string", "array", "object", "props" and "items", starting from zero and
    filled by ``_json_metrics_rec``.
    """
    counters = ("null", "bool", "int", "float", "string", "array", "object", "props", "items")
    counts = dict.fromkeys(counters, 0)
    _json_metrics_rec(j, counts)
    return counts
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Property names which suggest a JSON schema
DOUBTFUL_PROPERTY_NAMES = {
    # core and numeric keywords
    "$vocabulary",
    "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
    # array keywords
    "prefixItems", "additionalItems",
    "minContains", "maxContains", "unevaluatedItems",
    # object keywords
    "properties", "minProperties", "maxProperties",
    "patternProperties", "additionalProperties",
    "unevaluatedProperties", "dependentRequired", "propertyNames",
    # combinators
    "allOf", "anyOf", "oneOf",
}
|
|
103
|
+
|
|
104
|
+
# PER TYPE PROPERTIES
# category name -> keywords which belong to that category
PER_TYPE = {
    # out: $ref and $dynamicRef type
    # Draft 3 Section 6 Hyper Schema (partial)
    "hyper": ["base", "links", "href", "rel"],
    "meta": [
        "$schema", "$vocabulary", "$id", "$anchor", "$dynamicAnchor", "$comment",
        "title", "description", "default", "examples", "deprecated", "readOnly",
        "writeOnly", "id", "context", "notes",
    ],
    "alone": ["enum", "const"],
    # number also stands for integer
    "number": [
        "minimum", "maximum", "exclusiveMinimum", "exclusiveMaximum", "multipleOf",
        "divisibleBy",
    ],
    "string": [
        "minLength", "maxLength", "pattern", "contentMediaType", "contentEncoding",
        "contentSchema",
    ],
    "array": [
        "items", "prefixItems", "additionalItems", "minItems", "maxItems",
        "uniqueItems", "contains", "minContains", "maxContains", "unevaluatedItems",
    ],
    "object": [
        "properties", "minProperties", "maxProperties", "patternProperties",
        "additionalProperties", "unevaluatedProperties", "required-list",
        "dependentRequired", "propertyNames",
    ],
    "combi": ["allOf", "anyOf", "oneOf", "if", "then", "else", "not"],
}

# reverse mapping: keyword -> category name
PROP_TO_TYPE: dict[str, str] = {}
for t, props in PER_TYPE.items():
    for prop in props:
        PROP_TO_TYPE[prop] = t
|
|
129
|
+
|
|
130
|
+
# keywords whose value is not a schema
SCHEMA_KEYS_SIMPLE = [
    # core
    "$schema", "$vocabulary", "$id", "$anchor", "$dynamicAnchor", "$ref", "$dynamicRef",
    "$comment",
    # metadata
    "title", "description", "default", "examples", "deprecated", "readOnly", "writeOnly",
    # types
    "type", "enum", "const", "format",
    # validation
    "minimum", "maximum", "multipleOf", "exclusiveMaximum", "exclusiveMinimum",
    "minLength", "maxLength", "minItems", "maxItems", "minProperties", "maxProperties",
    "pattern", "minContains", "maxContains", "uniqueItems",
    "contentMediaType", "contentEncoding", "contentSchema",
    "required", "dependentRequired",
    # UNSURE, OLD?
    "id", "context", "notes", "optional", "base", "links", "rel", "href", "requires",
]

# keywords whose value is a schema
SCHEMA_KEYS_VALUE_SCHEMA = [
    "not", "if", "then", "else", "items", "contains", "additionalProperties",
    "propertyNames", "unevaluatedItems", "unevaluatedProperties",
    # OLD?
    # beware of dependencies which is both Schema or {"": [""]}
    "additionalItems", "dependencies", "extends",
]

# keywords whose value is an array of schemas
SCHEMA_KEYS_ARRAY_OF_SCHEMAS = [
    "allOf", "anyOf", "oneOf", "prefixItems",
    # OLD
    "items", "extends",
]

# keywords whose value is an object, the values of which are schemas
SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS = [
    "$defs",
    "definitions",  # old version
    "dependentSchemas", "properties", "patternProperties",
]
|
|
167
|
+
|
|
168
|
+
# typical typos and look-alikes of schema keywords…
SCHEMA_KEYS_TYPOS = [
    "typeof", "min", "max", "comment", "_comment", "comments", "minSize", "maxSize", "example",
    "readonly", "writeonly",
    # "description" variants
    "desription", "despcription", "descritpion", "descrition", "decription", "descrption",
    "descripiton", "descripition", "decsription", "descripion", "Description",
    "unique", "@type",
    "defaults", "$default", "ContentType",
    "schema", "schemas", "link", "constant", "required:", "minimum:", "maximum:", "Schema",
    "Default", "Type", "$type", "ref", "@id", "_id", "refs", "__ref", "#ref", "type:",
    # combinator variants
    "$allOf", "$anyOf", "$oneOf", "anyof", "allof", "oneof",
    "AllOf", "OneOf", "AnyOf",
    "$types", "#/anyOf", "#/allOf", "#/oneOf", "$extend", "$extends",
    "$rel",
    "read-only", "write-only", "minitems", "maxitems", "maxLen", "minLen", "maxValue", "minValue",
    "max_length", "min_length",
    "maxlength", "minlength", "minLenght", "maxLenght", "regex",
    "allOf:indexes: 1", "allOf:indexes: 0",
    "$version", "Ref",
    "numItems", "require", "patterns", "properites", "$deprecated", "deprecation",
    "requiredProperties", "property", "Id",
    "minimal", "maximal", "inclusiveMinimum", "inclusiveMaximum", "Comment", "$refs", "enums",
    "Minimum", "Maximum", "totalItems", "additional_properties", "prefix",
]
|
|
189
|
+
|
|
190
|
+
# "type" is managed manually
|
|
191
|
+
SPECIAL_VALUES = ["$schema"]

# names of the extra counters, besides plain keywords
SPECIALS = [
    # boolean JSON schema, empty schema
    "true", "false", "{}",
    # also some values or constructs
    "type-list", "type-list-one", "type-list-empty",
    "type=null", "type=boolean", "type=integer", "type=number", "type=string", "type=array",
    "type=object",
    "items-list",
    "additionalProperties=true", "additionalItems=true",
    "additionalProperties=false", "additionalItems=false",
    "exclusiveMinimum=true", "exclusiveMinimum=false",
    "exclusiveMaximum=true", "exclusiveMaximum=false",
    "required-count", "required=true", "required=false", "required-empty", "required-list",
    "required-bool",
    # "-count", "-one" and "-empty" variants
    "allOf-count", "anyOf-count", "oneOf-count", "prefixItems-count", "items-count",
    "allOf-one", "anyOf-one", "oneOf-one", "prefixItems-one", "items-one", "enum-one",
    "allOf-empty", "anyOf-empty", "oneOf-empty", "prefixItems-empty", "items-empty", "enum-empty",
    # schemas
    "properties-count", "patternProperties-count", "dependentSchemas-count",
    "$defs-count", "definitions-count", "extends-count", "extends-one",
    # missing?
    "<unknown>", "<typos>", "<version>",
]
|
|
216
|
+
|
|
217
|
+
# min/max keyword pairs listed as integer keywords
INTEGER_KEYWORDS = [
    "minItems", "maxItems",
    "minProperties", "maxProperties",
    "minLength", "maxLength",
    "minContains", "maxContains",
]

NUMBER_KEYWORDS = ["minimum", "maximum"]

FORMATS = [
    # dates and times
    "date", "date-time", "time", "duration",
    # mail
    "email", "idn-email",
    # hosts
    "hostname", "idn-hostname", "ipv4", "ipv6",
    # resource identifiers
    "uri", "uri-reference", "uri-template",
    # what is an iri is beyond comprehension, and has been removed
    "iri", "iri-reference",
    "uuid",
    "json-pointer", "relative-json-pointer",
    "regex",
    # OLD
    "color", "phone",
]

# NOTE there are other formats in other OpenAPI versions?
OPENAPI_310_FORMATS = ["int32", "int64", "float", "double", "password"]

OPENAPI_310_KEYWORDS = ["discriminator", "xml", "externalDocs", "example"]

# collected sets need to be changed to lists for json serialization
SETS = [
    "<typos-keywords>", "<typos-keywords-where>", "<unknown-keywords>",
    "<errors>", "<bad-properties-nesting-where>", "<openapi>", "<extensions>",
]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# schema keywords: union of the four keyword lists
SCHEMA_KEYS: set[str] = set().union(
    SCHEMA_KEYS_SIMPLE,
    SCHEMA_KEYS_VALUE_SCHEMA,
    SCHEMA_KEYS_ARRAY_OF_SCHEMAS,
    SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS,
)

# all expected schema keys to initialize: keywords (in list order, duplicates kept)
# followed by the special counter names
SCHEMA_KEYS_INIT = [
    *SCHEMA_KEYS_SIMPLE,
    *SCHEMA_KEYS_VALUE_SCHEMA,
    *SCHEMA_KEYS_ARRAY_OF_SCHEMAS,
    *SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS,
    *SPECIALS,
]
|
|
274
|
+
|
|
275
|
+
#
|
|
276
|
+
# VERSION GUESSING
|
|
277
|
+
#
|
|
278
|
+
|
|
279
|
+
CURRENT_VERSION = 9
NEXT_VERSION = CURRENT_VERSION + 1
LATEST_VERSION = NEXT_VERSION + 1

# explicit version identification in $schema: substring -> internal version number
# 0 for not set, -1 if multiply set; error?
SCHEMA_VERSIONS = {
    # "/draft-01/" … "/draft-08/" (draft-05 is probably not used anywhere?)
    **{f"/draft-{num:02d}/": num for num in range(1, 9)},
    "/draft-2019-09/": 8,
    "/draft/2019-09/": 8,
    "/draft-2020-12/": 9,
    "/draft/2020-12/": 9,
    "/draft-next/": NEXT_VERSION,
    "/draft/next/": NEXT_VERSION,
    "json-schema.org/schema": LATEST_VERSION,
}
|
|
302
|
+
|
|
303
|
+
# Version-specific keywords (or special counter names) which help guessing the
# draft of a schema: each entry maps a name to the internal version numbers in
# which it is expected.  Versions 1 to 9 are the numbers used in SCHEMA_VERSIONS,
# 10 stands for the next draft and 100 tags OpenAPI 3.1.0 extensions.
# note: the type of some keyword values can also help guessing…
#   items list vs simple schema
#   formats…
SCHEMA_VERSION_GUESS = {
    # TODO boolean schemas, *BUT* problems with "additional{Items,Properties}"…
    "type=any": [1, 2, 3],
    "requires": [1, 2],
    "required-bool": [3],
    "required-list": [4, 5, 6, 7, 8, 9],
    "exclusiveMinimum=true": [3, 4],
    "exclusiveMinimum=false": [3, 4],
    "exclusiveMaximum=true": [3, 4],
    "exclusiveMaximum=false": [3, 4],
    "items-list": [1, 2, 3, 4, 5, 6, 7, 8],  # 8: deprecated?
    "maxDecimal": [1],
    "optional": [1, 2],
    "additionalItems": [3, 4, 5, 6, 7, 8],  # 8: deprecated?
    "prefixItems": [9],
    "minimumCanEqual": [1, 2],
    "maximumCanEqual": [1, 2],
    "contentEncoding": [1, 2, 7, 8, 9],  # disappear then reappears!
    "exclusiveMinimum": [3, 4, 5, 6, 7, 8, 9],
    "exclusiveMaximum": [3, 4, 5, 6, 7, 8, 9],
    "patternProperties": [3, 4, 5, 6, 7, 8, 9],
    "divisibleBy": [2, 3],
    "disallow": [1, 2, 3],
    "extends": [1, 2, 3],
    "uniqueItems": [2, 3, 4, 5, 6, 7, 8, 9],
    "multipleOf": [4, 5, 6, 7, 8, 9],
    "minProperties": [4, 5, 6, 7, 8, 9],
    "maxProperties": [4, 5, 6, 7, 8, 9],
    "allOf": [4, 5, 6, 7, 8, 9],
    "anyOf": [4, 5, 6, 7, 8, 9],
    "oneOf": [4, 5, 6, 7, 8, 9],
    "not": [4, 5, 6, 7, 8, 9],
    "const": [6, 7, 8, 9],
    "propertyNames": [6, 7, 8, 9],
    "id": [1, 2, 3, 4, 5],
    "$id": [6, 7, 8, 9],
    "if": [7, 8, 9],
    "then": [7, 8, 9],
    "else": [7, 8, 9],
    "contentMediaType": [7, 8, 9],
    "$comment": [7, 8, 9],
    "readOnly": [7, 8, 9],
    "writeOnly": [7, 8, 9],
    "definitions": [4, 5, 6, 7, 8, 9],  # deprecated 8- (official 9)
    "dependencies": [3, 4, 5, 6, 7, 8, 9],  # deprecated 8- (official 9)
    # the keyword is "$defs" (as in SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS), not "$def"
    "$defs": [8, 9],
    "deprecated": [8, 9],
    "dependentSchemas": [8, 9],
    "dependentRequired": [8, 9],
    "unevaluatedItems": [8, 9],
    "unevaluatedProperties": [8, 9],
    "$recursiveRef": [8],
    "$recursiveAnchor": [8],
    "$dynamicRef": [9],
    "$dynamicAnchor": [9],
    "propertyDependencies": [10],  # new online draft
    # OpenAPI 3.1.0 extension
    "discriminator": [100],
    # "propertyName": [100],
    # "mapping": [100],
    "externalDocs": [100],
    "xml": [100],
}
|
|
370
|
+
|
|
371
|
+
# formats listed for every version from 1 to 9 in the table below
FORMAT_ALL_VERSIONS = ["date-time", "uri", "email", "ipv6"]

# which formats are allowed at each versions: format name -> version numbers
FORMAT_VERSIONS = {
    # dates and times
    "date": [1, 2, 3, 7, 8, 9],
    "date-time": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "time": [1, 2, 3, 7, 8, 9],
    "duration": [8, 9],
    "utc-millisec": [3],
    # misc
    "regex": [1, 2, 3, 7, 8, 9],
    "color": [1, 2, 3],
    "style": [1, 2, 3],
    "phone": [1, 2, 3],
    # identifiers and pointers
    "uri": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "iri": [7, 8, 9],  # RFC 3987
    "uri-ref": [5],
    "uri-reference": [6, 7, 8, 9],
    "iri-reference": [7, 8, 9],
    "uuid": [8, 9],  # 9? was it really in 2020-12?
    "uri-template": [6, 7, 8, 9],
    "json-pointer": [6, 7, 8, 9],
    "relative-json-pointer": [7, 8, 9],
    # mail
    "email": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "idn-email": [7, 8, 9],
    # network
    "ip-address": [1, 2, 3],
    "ipv4": [4, 5, 6, 7, 8, 9],
    "ipv6": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "host-name": [3],
    "hostname": [4, 5, 6, 7, 8, 9],
    "idn-hostname": [7, 8, 9],
    # postal
    "street-address": [1, 2],
    "locality": [1, 2],
    "region": [1, 2],
    "country": [1, 2],
    # additional custom formats may be defined with a URL to a definition of the format
    # OpenAPI 3.1:
    # - integer: int32, int64
    # - number: float, double
    # - string: password
}
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# add special version numbers: whatever is valid for the current version is
# also declared valid for the next and latest ones
for versions in SCHEMA_VERSION_GUESS.values():
    if CURRENT_VERSION in versions:
        versions.extend((NEXT_VERSION, LATEST_VERSION))

# formats listed in FORMAT_ALL_VERSIONS are dropped from the per-version table
for f in FORMAT_ALL_VERSIONS:
    del FORMAT_VERSIONS[f]

for versions in FORMAT_VERSIONS.values():
    if CURRENT_VERSION in versions:
        versions.extend((NEXT_VERSION, LATEST_VERSION))
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def guess_version(col: dict):
    """Guess the schema versions compatible with the collected counters.

    Every entry of ``SCHEMA_VERSION_GUESS`` with a positive counter in *col*
    marks its versions as accepted and all the other versions as rejected.
    The sorted list of versions which are accepted and never rejected (or all
    versions when there is no clue) is stored under ``col["<versions>"]``.
    When nothing remains, an error naming the keywords which brought new
    information is recorded with ``collectErr``.
    """
    every = set(range(1, LATEST_VERSION + 1))
    accepted, rejected = set(), set()
    clues = []

    for keyword, allowed in SCHEMA_VERSION_GUESS.items():
        if keyword not in col or not col[keyword] > 0:
            continue
        excluded = every.difference(allowed)
        # only remember keywords which add something new
        if set(allowed) - accepted or excluded - rejected:
            clues.append(keyword)
        accepted.update(allowed)
        rejected.update(excluded)

    # possible versions
    if accepted or rejected:
        candidates = accepted - rejected
    else:
        # no clues
        candidates = every

    col["<versions>"] = sorted(candidates)

    if not candidates:
        collectErr(col, "incompatible version guesses", f"{clues}", "$")
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
# all predefined JSON Schema types for Draft 4 and later
# (an ordered list; the same seven names are also available as the ALL_TYPES set)
JSON_SCHEMA_TYPES = [ "null", "boolean", "integer", "number", "string", "array", "object" ]
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def typeof(v: Any) -> str:
    """Return the JSON type name of the Python value *v*.

    The result is one of "null", "boolean", "integer", "number", "string",
    "array" (list or tuple) or "object" (dict); anything else yields
    "<unknown>".
    """
    if v is None:
        return "null"

    # checked in order: bool must come before int
    python_to_json = (
        (bool, "boolean"),
        (int, "integer"),
        (float, "number"),
        (str, "string"),
        ((list, tuple), "array"),
        (dict, "object"),
    )
    for kinds, name in python_to_json:
        if isinstance(v, kinds):
            return name

    return "<unknown>"
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def collectAdd(collection, key, n):
    """Add *n* to ``collection[key]``, creating the entry if it is missing."""
    if key not in collection:
        collection[key] = n
        return
    collection[key] += n
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def collectCnt(collection, key):
    """Count one more occurrence of *key* in *collection*."""
    collectAdd(collection, key, n=1)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def collectSet(collection, key, val):
    """Add *val* to the set stored under ``collection[key]``, creating it if needed."""
    collection.setdefault(key, set()).add(val)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def collectErr(collection, cat, what, path):
    """Record the error ``(cat, what, path)`` in the "<errors>" set of *collection*."""
    error = (cat, what, path)
    collectSet(collection, "<errors>", error)
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def collectTypo(collection, what, path):
    """Record the typo *what*, both alone and together with the *path* where it was seen."""
    for key, entry in (
        ("<typos-keywords>", what),
        ("<typos-keywords-where>", (what, path)),
    ):
        collectSet(collection, key, entry)
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def ap(path: str, key: str) -> str:
    """Append the property name *key* to the dotted *path*.

    A key made only of ASCII letters, digits and underscores is appended as
    is (``path.key``); any other key, including the empty one, is enclosed in
    double quotes (``path."key"``).

    The whole key is checked with ``fullmatch``: with ``^…$`` a key ending
    with a newline would wrongly be appended unquoted, because ``$`` also
    matches just before a trailing newline.
    """
    if re.fullmatch(r"[a-zA-Z0-9_]+", key):
        return f"{path}.{key}"
    return f'{path}."{key}"'
|
|
500
|
+
|
|
501
|
+
#
|
|
502
|
+
# TYPE RESOLUTION
|
|
503
|
+
#
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
# the seven type names handled by type resolution
ALL_TYPES = { "null", "boolean", "integer", "number", "string", "array", "object" }
# the empty set of types, e.g. what getTypes yields for the `false` schema
NO_TYPE: set[str] = set()
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def fixIntNum(types: set[str]):
    """Ensure that integer/number are accepted one for the other.

    The set is updated in place: as soon as one of "integer" or "number" is
    present, both are.
    """
    if "integer" in types or "number" in types:
        types.update(("integer", "number"))
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
# getTypes cache: maps a path and context to the set of types there
# (the key is the path, a colon, then the sorted context as a string)
GET_TYPES_CACHE: dict[str, set[str]] = {}
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def getTypes(
    jdata: JsonSchema,     # JSON Schema
    defs: dict[str, Any],  # current definitions
    recs: list[str],       # paths to detect recursion
    path: str,             # current path
    context: set[str]      # external context for adjacent keywords
) -> set[str]:
    """Return the possible types for the current schema.

    The result starts from the explicit ``type`` (or all types), is restricted
    to *context*, then narrowed by ``const``, ``enum``, ``$ref`` (looked up in
    *defs*, with *recs* used to stop on recursion), ``allOf`` (intersection of
    the members) and ``anyOf``/``oneOf`` (union of the members).  "integer"
    and "number" are always kept together (see ``fixIntNum``).

    Special cases: a path ending with ``.propertyNames`` yields ``{"string"}``,
    a boolean schema yields all types or none, and a schema which is neither a
    boolean nor an object yields ``{"BAD"}``.

    Results are memoized in ``GET_TYPES_CACHE`` per path and context.  The
    cache only ever holds and hands out copies, so neither this function nor
    its callers can alter a cached entry through the returned set.
    """

    # log.warning(f"types on {path} <- {context}")

    if path and path.endswith(".propertyNames"):
        # we know that we are checking a string
        return {"string"}

    if isinstance(jdata, bool):
        return set(ALL_TYPES if jdata else NO_TYPE)

    if not isinstance(jdata, dict):
        # FIXME should not be possible!
        return {"BAD"}

    # cache shortcut
    path_ctx = path + ":" + str(sorted(context))
    if path_ctx in GET_TYPES_CACHE:
        # copy, so that the caller cannot corrupt the cached entry
        return set(GET_TYPES_CACHE[path_ctx])

    # set initial possible types
    if "type" in jdata:
        # if there is an explicit type, it constrains the result
        types = jdata["type"]
        if isinstance(types, str):
            if types == "any":  # early versions…
                possible_types = set(ALL_TYPES)
            elif types in ALL_TYPES:
                possible_types = {types}
            else:
                log.warning(f"unexpected string type: {types}")
                # FIXME NO_TYPE?
                possible_types = set(ALL_TYPES)
        elif isinstance(types, (tuple, list)):
            ltypes = set()
            for i, t in enumerate(types):
                if isinstance(t, str):
                    if t == "any":
                        ltypes.update(ALL_TYPES)
                    elif t in ALL_TYPES:
                        ltypes.add(t)
                    else:
                        log.warning(f"coldly ignoring unexpected type: {t}")
                elif isinstance(t, dict):  # early versions
                    ltypes.update(getTypes(t, defs, recs, f"{path}.type[{i}]", ALL_TYPES))
                else:
                    log.warning(f"unexpected type item type: {typeof(t)}")
            possible_types = ltypes
        elif isinstance(types, dict):  # early versions
            # copy: the set is updated in place below and must not be shared
            # with whatever the recursive call returned
            possible_types = set(getTypes(types, defs, recs, path + ".type", ALL_TYPES))
        else:
            log.warning(f"unexpected value for type: {typeof(types)}")
            possible_types = {"BOF"}
    else:
        possible_types = set(ALL_TYPES)
    fixIntNum(possible_types)

    # make current explicit types consistent with context
    possible_types.intersection_update(context)
    fixIntNum(possible_types)

    # then reduce with other type informations
    if "const" in jdata:
        possible_types.intersection_update({typeof(jdata["const"])})
        fixIntNum(possible_types)

    if "enum" in jdata and isinstance(jdata["enum"], (tuple, list)):
        possible_types.intersection_update(typeof(i) for i in jdata["enum"])
        fixIntNum(possible_types)

    if "$ref" in jdata:
        rpath = jdata["$ref"]
        if isinstance(rpath, str):
            rpathu = url_unquote(rpath)
            if rpathu in recs:
                log.warning(f"preventing recursion on {rpath}")
                # possible_types is left "as-is"?
            elif rpathu in defs:
                possible_types.intersection_update(getTypes(defs[rpathu], defs, recs + [rpathu],
                                                            rpathu, possible_types))
                fixIntNum(possible_types)
            else:
                log.warning(f"definition not available: {rpath}")
        else:
            log.warning(f"unexpected $ref value type: {typeof(rpath)}")

    if "allOf" in jdata:
        alls = jdata["allOf"]
        atypes = set(ALL_TYPES)
        if isinstance(alls, (tuple, list)):
            for i, a in enumerate(alls):
                atypes.intersection_update(getTypes(a, defs, recs,  # pyright: ignore
                                                    f"{path}.allOf[{i}]", possible_types))
        else:
            log.warning(f"unexpected allOf type: {typeof(alls)}")
        possible_types.intersection_update(atypes)
        fixIntNum(possible_types)

    for prop in ("anyOf", "oneOf"):
        if prop in jdata:
            anys = jdata[prop]
            atypes = set()
            if isinstance(anys, (tuple, list)):
                for i, a in enumerate(anys):
                    atypes.update(getTypes(a, defs, recs,  # pyright: ignore
                                           f"{path}.{prop}[{i}]", possible_types))
            else:
                # NOTE(review): atypes stays empty here, hence no type is left,
                # whereas a bad allOf value is ignored; confirm this is intended
                log.warning(f"unexpected {prop} type: {typeof(anys)}")
            possible_types.intersection_update(atypes)
            fixIntNum(possible_types)

    # FIXME if/then/else/not *could* maybe constraint some types as well

    if path_ctx not in GET_TYPES_CACHE:
        # store a private copy: the returned set belongs to the caller
        GET_TYPES_CACHE[path_ctx] = set(possible_types)

    # log.warning(f"types on {path} -> {possible_types}")

    return possible_types
|
|
648
|
+
|
|
649
|
+
#
|
|
650
|
+
# COLLECT TYPE HINTS
|
|
651
|
+
#
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
# getHints cache: maps a path to the set of hints collected there
GET_HINTS_CACHE: dict[str, set[str]] = {}
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
def getHints(
    jdata: JsonSchema,     # JSON data
    defs: dict[str, Any],  # current definitions
    recs: list[str],       # paths to detect recursion
    path: str              # current path
) -> set[str]:
    """Gather hints about types.

    Hints are the ``PROP_TO_TYPE`` categories of the keywords found in the
    schema, plus "string" or "number" for recognized ``format`` values, plus
    the hints of a ``$ref`` target found in *defs* (unless it is already in
    *recs*), plus all the hints of ``allOf`` members, plus the type names
    hinted by every member of an ``anyOf``/``oneOf``.

    Boolean and non-object schemas give ``NO_TYPE``.  Results are memoized per
    path in ``GET_HINTS_CACHE``.
    """

    if isinstance(jdata, bool):
        return NO_TYPE

    if not isinstance(jdata, dict):
        log.warning(f"bad schema type at {path}: {typeof(jdata)}")
        return NO_TYPE

    if path in GET_HINTS_CACHE:
        return GET_HINTS_CACHE[path]

    # handle direct hints
    hints = {PROP_TO_TYPE[keyword] for keyword in jdata if keyword in PROP_TO_TYPE}

    # format hint depends on the value
    fmt = jdata.get("format")
    if isinstance(fmt, str):
        numeric_formats = ("integer", "int32", "int64", "float", "double", "uint",
                           "uint32", "uint64")
        if fmt in FORMAT_ALL_VERSIONS or fmt in FORMAT_VERSIONS or fmt == "password":
            hints.add("string")
        elif fmt in numeric_formats:
            hints.add("number")
        # else unknown value, not hint…

    # update with indirect hints
    if "$ref" in jdata:
        ref = jdata["$ref"]
        if not isinstance(ref, str):
            log.warning(f"ignoring bad $ref value type: {typeof(ref)}")
        else:
            refu = url_unquote(ref)
            if refu in recs:
                log.warning(f"preventing recursion for hints on {ref}")
            elif refu in defs:
                hints |= getHints(defs[refu], defs, recs + [refu], refu)
            else:
                log.warning(f"ignoring $ref hints: {ref}")

    # combinators
    if "allOf" in jdata:
        members = jdata["allOf"]
        if isinstance(members, (tuple, list)):
            # every member contributes all its hints
            for idx, member in enumerate(members):
                hints |= getHints(member, defs, recs, f"{path}.allOf[{idx}]")  # pyright: ignore
        else:
            log.warning(f"ignoring bad allOf value type: {typeof(members)}")

    for combinator in ("anyOf", "oneOf"):
        if combinator not in jdata:
            continue
        members = jdata[combinator]
        if isinstance(members, (tuple, list)):
            # only keep the type names which are hinted by all members
            shared = set(ALL_TYPES)
            for idx, member in enumerate(members):
                shared &= getHints(
                    member, defs, recs, f"{path}.{combinator}[{idx}]")  # pyright: ignore
            hints |= shared
        else:
            log.warning(f"ignoring bad {combinator} value type: {typeof(members)}")

    # FIXME should it do something with not/if/then/else?
    # FIXME format?

    # an existing entry is kept as is (cannot happen? or check consistency?)
    GET_HINTS_CACHE.setdefault(path, hints)

    return hints
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def looks_like_simple_dependencies(data) -> bool:
    """Check whether it is a simple { "": [""] }.

    That is, an object (possibly empty) which maps string names to lists or
    tuples holding only strings.
    """
    if not isinstance(data, dict):
        return False
    return all(
        isinstance(name, str)
        and isinstance(needed, (tuple, list))
        and all(isinstance(entry, str) for entry in needed)
        for name, needed in data.items()
    )
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
class Defs:
    """Keep track of definitions and uses.

    Acts as a mapping from a path to the value stored there.  Paths which end
    with a direct child of ``$defs`` or ``definitions`` additionally get a use
    counter: it is reset when the path is set and incremented each time the
    path is read with ``defs[path]``.
    """

    def __init__(self):
        # is it an official definition?
        # The name is anything up to the end without a "/", so that names such
        # as "my-def" or "a.b" are tracked as well, while deeper paths below a
        # definition are not.
        self._isdef = re.compile(r"/(\$defs|definitions)/[^/]+\Z").search
        self._defs: dict[str, Any] = {}  # path -> stored value
        self._uses: dict[str, int] = {}  # official definition path -> number of reads

    def __setitem__(self, p: str, v):
        """Store *v* at path *p*, resetting its use counter if it is a definition."""
        if self._isdef(p):
            self._uses[p] = 0
        self._defs[p] = v

    def __contains__(self, p: str):
        """Tell whether something is stored at path *p* (does not count as a use)."""
        return p in self._defs

    def __getitem__(self, p: str):
        """Return the value stored at path *p*, counting one use for a definition.

        Raises ``KeyError`` if nothing is stored at *p*.
        """
        if self._isdef(p):
            self._uses[p] += 1
        return self._defs[p]

    def __delitem__(self, p: str):
        """Forget the value stored at path *p*; its use counter, if any, is kept."""
        del self._defs[p]

    # vs unreachable?
    def unusedDefs(self):
        """Return the set of definition paths which have never been read."""
        return {p for p, count in self._uses.items() if count == 0}
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
# maybe too much, it could collect the root and use the path when needed.
|
|
787
|
+
def _collect_all_defs_rec(data, defs, path: str = "#"):
|
|
788
|
+
"""Collect all possible local definitions just in case…"""
|
|
789
|
+
if isinstance(data, (bool, dict, list, tuple)):
|
|
790
|
+
defs[path] = data
|
|
791
|
+
if data is None or isinstance(data, (bool, int, float, str)):
|
|
792
|
+
pass
|
|
793
|
+
elif isinstance(data, (list, tuple)):
|
|
794
|
+
for i, item in enumerate(data):
|
|
795
|
+
# TODO check JSON Schema url path stuff
|
|
796
|
+
_collect_all_defs_rec(item, defs, f"{path}/{i}")
|
|
797
|
+
elif isinstance(data, dict):
|
|
798
|
+
for k, v in data.items():
|
|
799
|
+
_collect_all_defs_rec(v, defs, f"{path}/{k}")
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def _json_schema_stats_rec(
    jdata: JsonSchema,  # schema
    path: str,  # path to ~
    collection: dict[str, Any],  # collected data
    defs: dict[str, Any] | None = None,  # definitions
    type_context: set[str] = ALL_TYPES,  # type restrictions at this point
    is_defs: bool = False,  # is this just a definition
    is_logic: bool = False  # are we inside a if/then/else/not?
) -> None:
    """Recursive usage stats collection about JSON Schema features.

    Counters and error records are accumulated in place into collection;
    nothing is returned.

    Args:
        jdata: the (sub)schema to analyze, expected to be a bool or a dict.
        path: location of jdata, used in reported errors.
        collection: counters and collected sets, updated in place.
        defs: path-indexed definitions, consulted to detect dangling local
            ``$ref`` values and handed to getTypes/getHints.
        type_context: types allowed by the enclosing schema at this point.
        is_defs: whether jdata is a direct entry of ``$defs``/``definitions``.
        is_logic: whether jdata is inside an if/then/else/not construct.
    """
    # a fresh mapping per call instead of a shared mutable default
    if defs is None:
        defs = {}

    if isinstance(jdata, bool):
        if jdata:
            collection["true"] += 1
        else:
            collection["false"] += 1
        return

    if not isinstance(jdata, dict):
        collectErr(collection, "invalid root schema type", typeof(jdata), path)
        log.warning(f"skipping: [{path}] {str(jdata)[:64]}")
        return

    if len(jdata) == 0:
        collection["{}"] += 1
        return

    # current schema guessing
    if "$schema" in jdata:
        version = jdata["$schema"]
        if isinstance(version, str):
            if "/json-schema.org/" in version:
                for pat, vers in SCHEMA_VERSIONS.items():
                    if pat in version:
                        cvers = collection["<version>"]
                        if cvers == 0:
                            collection["<version>"] = vers
                            collection["<version-path>"] = path
                        elif cvers > 0 and cvers != vers:
                            # -1 marks conflicting declarations
                            collection["<version>"] = -1
                            collectErr(collection, "multiple schema versions",
                                       f"{cvers} {vers}", path)
                        break
                if collection["<version>"] == 0:  # not assigned
                    collectErr(collection, "unexpected $schema", version, path)
            else:
                collectErr(collection, "unexpected $schema version", version, path)
        else:
            collectErr(collection, "unexpected $schema value type", typeof(version), path)

    # memoize defs for later
    if "$defs" in jdata and "definitions" in jdata:
        collectErr(collection, "definition issue", "$defs/definitions mix", path)

    kdefs = "$defs" if "$defs" in jdata else "definitions"
    ldefs = jdata[kdefs] if kdefs in jdata else None

    if kdefs in jdata and not isinstance(jdata[kdefs], dict):
        collectErr(collection, "definition issue", f"unexpected type {typeof(ldefs)}", path)

    # get actually possible types
    types = getTypes(jdata, defs, [], path, type_context)

    if not types:
        collectErr(collection, "type error", "no possible type", path)

    # condition analysis
    # not?
    if "if" in jdata:
        lpath = path + ".if"
        if "then" not in jdata and "else" not in jdata:
            collectErr(collection, "cond error", "if no then/else", lpath)
        ifs = jdata["if"]
        if isinstance(ifs, dict):
            ifprops, reqprops = set(), set()
            if "properties" in ifs and isinstance(ifs["properties"], dict):
                ifprops.update(ifs["properties"].keys())
            # only string entries can name a property; skipping the others also
            # avoids a TypeError on unhashable items of a malformed "required"
            for req in (ifs.get("required"), jdata.get("required")):
                if isinstance(req, list):
                    reqprops.update(p for p in req if isinstance(p, str))
            unreqprops = ifprops - reqprops
            if unreqprops:
                missing = " ".join(sorted(unreqprops))
                collectErr(collection, "cond error",
                           f"ignored un-required if props: {missing}", lpath)
    elif "then" in jdata or "else" in jdata:
        collectErr(collection, "cond error", "then/else no if", path)

    # scan all properties
    for prop, val in jdata.items():

        lpath = ap(path, prop)

        # TODO be less aggressive when recurring in if/then/else, depends on the outside object
        # TODO check not/if/then/else keywords consistency with outside type

        # count (expected) prop occurrences
        if prop in collection:
            collection[prop] += 1
        else:
            # count typos and unknown and keep track of openapi
            if prop.startswith("x-"):
                collectSet(collection, "<extensions>", prop)
            elif prop in OPENAPI_310_KEYWORDS:
                collectSet(collection, "<openapi>", prop)
            elif prop in SCHEMA_KEYS_TYPOS:
                collection["<typos>"] += 1
                collectTypo(collection, prop, lpath)
            else:
                collection["<unknown>"] += 1
                collectSet(collection, "<unknown-keywords>", prop)
                # also log the unexpected keyword
                log.warning(f"unexpected: {prop}")

            # unknown keyword, try subschemas for draft 01-04
            # this may result in false positive, eg for OpenAPI "example"
            if (isinstance(val, dict) and prop not in ("example", "discriminator") and
                    not prop.startswith("x-")):
                _json_schema_stats_rec(val, lpath, collection, defs, is_logic=is_logic)

            # FIXME because of extensions any keyword should be ignored,
            # probably hiding away any typo…

        # recurse in some cases, or specials
        if prop == "type":

            if isinstance(val, (list, tuple)):
                collection["type-list"] += 1
                if len(val) == 0:
                    collection["type-list-empty"] += 1
                elif len(val) == 1:
                    collection["type-list-one"] += 1
                vals = val
            elif isinstance(val, str):
                vals = [val]
            else:
                collectErr(collection, "invalid type value", typeof(val), lpath)
                continue

            nonstr = list(filter(lambda i: not isinstance(i, str), vals))
            if nonstr:
                collectErr(collection, "invalid type value in list", str(nonstr), lpath)
                continue

            for v in vals:
                if isinstance(v, str):
                    pval = f"{prop}={v}"
                    if pval in collection:
                        collection[pval] += 1
                    else:
                        collectErr(collection, "unexpected type data", v, path)
                # NOTE the two branches below cannot be reached while the
                # non-string filter above skips the keyword with "continue"
                elif isinstance(v, dict):
                    collectErr(collection, "maybe unexpected type type", str(v), lpath)
                    # try a sub-type object
                    _json_schema_stats_rec(v, lpath, collection, defs, is_logic=is_logic)
                else:
                    collectErr(collection, "unexpected type type", str(v), lpath)

        elif prop == "format":

            # TODO improve format analysis! count all occurrences!?
            if isinstance(val, str):
                if val in OPENAPI_310_FORMATS:
                    collectSet(collection, "<openapi>", val)
                    continue
                elif val not in FORMATS:
                    collectErr(collection, "unexpected format", val, lpath)
                    continue
                # count expected format values
                collectCnt(collection, f"format={val}")
            else:
                collectErr(collection, "invalid format type", typeof(val), lpath)
                log.warning(f"ignoring {prop} value")
                continue

        elif prop == "pattern":

            if not isinstance(val, str):
                collectErr(collection, "invalid pattern type", typeof(val), lpath)
            elif not is_regex(val):
                collectErr(collection, "invalid regex", val, lpath)

        elif prop == "patternProperties":

            if isinstance(val, dict):
                for k, v in val.items():
                    # assert isinstance(k, str)
                    if not is_regex(k):
                        collectErr(collection, "invalid regex", k, lpath)
            else:
                collectErr(collection, "invalid patternProperties type", typeof(val), lpath)

        elif prop in ("dependencies", "properties"):

            if not isinstance(val, dict):
                collectErr(collection, f"non object {prop}", typeof(val), lpath)

        elif prop == "items":

            if isinstance(val, (list, tuple)):
                collection["items-list"] += 1
            if collection["<version>"] >= 9 and isinstance(val, (list, tuple)):
                collectErr(collection, "draft incompatibility",
                           "invalid array value for items after Draft 9", lpath)

        elif (prop == "id" and collection["<version>"] <= 5 and collection["<version>"] != 0 or
              prop == "$id" and collection["<version>"] >= 6):

            if not isinstance(val, str):
                collectErr(collection, f"invalid {prop} value type", typeof(val), lpath)

        elif (prop == "id" and collection["<version>"] >= 6 or
              prop == "$id" and collection["<version>"] <= 5 and collection["<version>"] != 0):

            collectErr(collection, "draft incompatibility", "id/$id draft confusion", lpath)

        elif prop == "$ref":
            # NOTE "#" is used for recursion at the root
            if not isinstance(val, str):
                collectErr(collection, "invalid $ref type", typeof(val), lpath)
            elif val.startswith("#/"):
                # local reference: must designate a collected definition
                valu = url_unquote(val)
                if valu not in defs:
                    collectErr(collection, "dangling $ref value", val, lpath)
            else:
                log.warning(f"ignoring $ref value: {val}")

        elif prop == "default":
            if typeof(val) not in types:
                collectErr(collection, "type inconsistency",
                           f"default {typeof(val)} / {types}", lpath)

        elif prop == "examples":
            if isinstance(val, (list, tuple)):
                for v in val:
                    if typeof(v) not in types:
                        collectErr(collection, "type inconsistency",
                                   f"examples {typeof(v)} / {types}", lpath)
            else:
                collectErr(collection, f"invalid {prop} value type", typeof(val), lpath)

        # possible recursions
        if prop in SCHEMA_KEYS_OBJECT_VALUES_SCHEMAS:
            # log.info(f"object {prop}: {val}")
            is_a_defs = prop in ("$defs", "definitions")
            if isinstance(val, dict):
                for k, v in val.items():
                    _json_schema_stats_rec(v, ap(lpath, k), collection,  # pyright: ignore
                                           defs, is_defs=is_a_defs, is_logic=is_logic)
                collection[f"{prop}-count"] += len(val)
            else:
                log.warning(f"ignoring {prop} non-object value")

        if prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS:  # combinators and others
            if isinstance(val, (list, tuple)):
                if len(val) == 0:
                    collectErr(collection, "empty schema array", prop, lpath)
                # combinators inherit the current type restrictions
                context = types if prop in ("allOf", "anyOf", "oneOf") else ALL_TYPES
                for i, v in enumerate(val):
                    _json_schema_stats_rec(v, f"{lpath}[{i}]", collection,  # pyright: ignore
                                           defs, context, is_logic=is_logic)
            else:
                log.warning(f"ignoring {prop} non-array value")

        if prop in SCHEMA_KEYS_VALUE_SCHEMA:
            if prop == "dependencies" and looks_like_simple_dependencies(val):
                # TODO we should check that properties exists!
                pass
            elif isinstance(val, (bool, dict)):
                _json_schema_stats_rec(val, lpath, collection, defs,
                                       is_logic=is_logic or prop in ("if", "then", "else", "not"))
            else:
                log.warning(f"ignoring {prop} non-schema value")

        if prop in SCHEMA_KEYS_VALUE_SCHEMA and prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS:
            if not isinstance(val, (bool, dict, list, tuple)):
                collectErr(collection, "unexpected type", f"{prop} / {typeof(val)}", lpath)

        # special case for which we keep a (truncated) value
        if prop in SPECIAL_VALUES:
            key = f"{prop}={str(val)[:64]}"
            if key not in collection:
                collection[key] = 0
            collection[key] += 1

        if prop in INTEGER_KEYWORDS and not isinstance(val, int):
            collectErr(collection, "non integer value", f"{prop} / {typeof(val)}", lpath)

        elif prop in NUMBER_KEYWORDS and not isinstance(val, (int, float)):
            collectErr(collection, "non number value", f"{prop} / {typeof(val)}", lpath)

        # Case additionalProperties in Properties
        if prop == "properties" and isinstance(val, dict):
            # val contains PER_TYPE["object"] properties but not the "properties" property
            doubts = list(filter(lambda k: k in DOUBTFUL_PROPERTY_NAMES, val.keys()))
            nb_js_prop = len(doubts)
            # nb_prop = len(val)
            if nb_js_prop >= 1 and "properties" not in val:
                collectCnt(collection, "<bad-properties-nesting>")
                collectSet(collection, "<bad-properties-nesting-where>", f"{path}: {doubts}")

        elif prop == "additionalProperties" and isinstance(val, bool):
            if val:
                collection["additionalProperties=true"] += 1
            else:
                collection["additionalProperties=false"] += 1

        elif prop == "additionalItems" and isinstance(val, bool):
            if val:
                collection["additionalItems=true"] += 1
            else:
                collection["additionalItems=false"] += 1

        elif prop == "required":
            if collection["<version>"] >= 4 and isinstance(val, bool):
                collectErr(collection, "draft incompatibility",
                           "invalid bool required for Draft 4 and later", lpath)
            elif collection["<version>"] == 3 and isinstance(val, (list, tuple)):
                collectErr(collection, "draft incompatibility",
                           "invalid array required for Draft 3", lpath)
            elif not isinstance(val, (bool, list, tuple)):
                collectErr(collection, "invalid required type", typeof(val), lpath)

            if isinstance(val, (list, tuple)):
                collection["required-list"] += 1
                collection["required-count"] += len(val)
                if len(val) == 0:
                    collection["required-empty"] += 1
                    if collection["<version>"] == 4:
                        # but it is okay for 6 and later:-/
                        collectErr(collection, "draft incompatibility",
                                   "invalid empty required for Draft 4", lpath)
            elif isinstance(val, bool):  # OLD
                collection["required-bool"] += 1
                collection[f"required={str(val).lower()}"] += 1
            else:
                log.warning(f"unexpected required: {val}")
                continue

        elif prop == "enum":
            if isinstance(val, (list, tuple)):
                if len(val) == 0:
                    collectErr(collection, "empty array enum", "", lpath)
                    collection["enum-empty"] += 1
                elif len(val) == 1:
                    collection["enum-one"] += 1
            else:
                collectErr(collection, "non array enum", typeof(val), lpath)

        elif prop in ("exclusiveMinimum", "exclusiveMaximum"):
            if isinstance(val, bool):
                collection[f"{prop}={str(val).lower()}"] += 1

        if prop in SCHEMA_KEYS_ARRAY_OF_SCHEMAS:
            # this silently ignores non lists
            if isinstance(val, (list, tuple)):
                collection[f"{prop}-count"] += len(val)
                if len(val) == 0:
                    collection[f"{prop}-empty"] += 1
                elif len(val) == 1:
                    collection[f"{prop}-one"] += 1

        if prop in ("type", "required", "enum") and isinstance(val, (list, tuple)):
            if not distinct_values(val):
                collectErr(collection, "non unique array", f"{prop} {len(val)}", lpath)

    # FIXME should follow references as well!
    # FIXME should take care of adjacent keywords in the resolution!
    # build type hints based on keywords
    hints = getHints(jdata, defs, [], path)

    # special case for required
    required_list = "required" in jdata and isinstance(jdata["required"], (list, tuple))
    if required_list:
        hints.add(PROP_TO_TYPE["required-list"])
        # TODO resolve references? recurse??
        # NOTE filter out constructs which may bring hidden properties
        properties = jdata.get("properties")
        # a non-object "properties" is skipped here instead of asserting:
        # the schema is user input and must not crash the analysis
        if (isinstance(properties, dict) and
                not set(jdata.keys()).intersection({"oneOf", "anyOf", "$ref"})):
            for p in jdata["required"]:
                if not isinstance(p, str):
                    log.warning(f"ignoring non-string required item: {p}")
                elif p not in properties:
                    collectErr(collection, "unknown required property", p, path)
        # else: maybe properties are in a reference…

    # check whether found types are compatible with declared types
    for m in hints:
        # a type hint is not compatible with possible types
        if m in JSON_SCHEMA_TYPES and m not in types:
            if m == "number" and "integer" in types:
                # ok, integer is a kind of number
                pass
            else:
                # extract direct keywords which hinted to type "m"
                keywords = set(filter(lambda p: PROP_TO_TYPE.get(p, "") == m, jdata.keys()))
                if required_list:
                    keywords.add('required')
                # actual types found
                # FIXME probably useless
                foundtypes = set(filter(lambda t: t in JSON_SCHEMA_TYPES, types))
                if not foundtypes:
                    foundtypes = set(types)
                collectErr(collection, "bad mix",
                           f"{m}: {sorted(foundtypes)} {sorted(keywords)}", path)

    # no type declarations *BUT* some type hints
    # NOTE *direct* definitions are skipped, should be triggered when/if used
    # TODO <= 2? other?
    # NOTE(review): 7 is presumably the size of ALL_TYPES, i.e. no restriction — confirm
    if len(types) == 7:
        type_hints = hints.difference({"meta", "combi", "hyper", "alone"})
        if not is_defs and not is_logic and len(type_hints) == 1:
            collectErr(collection, "missing type declaration", f"{type_hints}", path)
        elif len(type_hints) == 0 and len(path) > 1:
            collectErr(collection, "suspicious empty type", "*", path)

    # log.debug(f"mixins: {mixins}")
    # count each distinct combination of possible types and keyword hints
    mix = "*-" + "-".join(sorted(types)) + "/" + "-".join(sorted(hints))

    if mix != "*-/":
        if mix in collection:
            collection[mix] += 1
        else:
            collection[mix] = 1
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def json_schema_stats(jdata):
    """Return stats about a JSON data structure.

    Args:
        jdata: the parsed JSON value to analyze as a JSON Schema.

    Returns:
        The collection dict filled by the recursive analysis, completed with
        the root-level entries ``<unused-defs>``, ``<explicit-schema>``,
        ``<$schema>``, ``<bad-root>`` and ``<schema-prop>``; collected sets
        are turned into sorted lists.
    """

    # global GET_TYPES_CACHE, GET_HINTS_CACHE
    # caches are per-document: start from a clean state
    GET_TYPES_CACHE.clear()
    GET_HINTS_CACHE.clear()

    # we first collect all possible local definitions, just in case
    defs = Defs()
    _collect_all_defs_rec(jdata, defs)

    # then proceed to analyze the schema
    collection: dict[str, Any] = {k: 0 for k in SCHEMA_KEYS_INIT}
    _json_schema_stats_rec(jdata, "$", collection, defs)  # type: ignore

    # unused definitions
    collection["<unused-defs>"] = sorted(defs.unusedDefs())

    # do version guessing on the result
    guess_version(collection)

    # official $schema at root?
    if isinstance(jdata, dict) and "$schema" in jdata:
        declared = jdata["$schema"]
        # a non-string $schema (number, null, …) cannot be an official URL;
        # testing the type first avoids a TypeError on the substring check
        collection["<explicit-schema>"] = (
            isinstance(declared, str) and "/json-schema.org/" in declared)
        collection["<$schema>"] = declared
    else:
        collection["<explicit-schema>"] = False
        collection["<$schema>"] = "<unknown_explicit_schema>"

    # is the root type compatible with a JSON schema?
    collection["<bad-root>"] = not isinstance(jdata, (bool, dict))

    # look for schema property name hints
    hinting = ("alone", "number", "string", "array", "object", "combi")
    collection["<schema-prop>"] = isinstance(jdata, dict) and any(
        prop in PROP_TO_TYPE and PROP_TO_TYPE[prop] in hinting for prop in jdata)

    # cleanup sets
    for key in SETS:
        if key in collection:
            collection[key] = sorted(collection[key])

    return collection
|
|
1280
|
+
|
|
1281
|
+
|
|
1282
|
+
def normalize_ods(fn, schema):
    """Anonymize in place a schema following one specific four-key layout.

    The schema is only touched when it is a dict with exactly the keys
    ``title``, ``definitions``, ``oneOf`` and ``type``, where ``type`` is
    "object", ``oneOf`` holds the single item ``{"$ref": "#/definitions/<title>"}``
    and ``definitions`` holds exactly ``<title>`` and ``<title>_records``.
    In that case the title is replaced by "ANONYM" in the title, in the
    definition names and in the references.  Any other input is left as is.
    NOTE(review): "ods" presumably names the tool producing this layout — confirm.

    Args:
        fn: name of the schema source, only used in the log message.
        schema: parsed schema, modified in place when it matches.

    Returns:
        None.
    """
    if not isinstance(schema, dict) or len(schema) != 4:
        return
    if not all(key in schema for key in ("title", "definitions", "oneOf", "type")):
        return
    if schema["type"] != "object":
        return

    title = schema["title"]
    oneof = schema["oneOf"]
    defs = schema["definitions"]

    if not isinstance(title, str) or not isinstance(defs, dict):
        return
    if not isinstance(oneof, (list, tuple)) or len(oneof) != 1:
        return

    # the only alternative must be exactly a reference to the titled definition;
    # a missing "$ref" or a reference to something else both disqualify
    ref = oneof[0]
    if (not isinstance(ref, dict) or len(ref) != 1 or
            ref.get("$ref") != f"#/definitions/{title}"):
        return

    rec = f"{title}_records"
    if title not in defs or rec not in defs or len(defs) != 2:
        return

    log.warning(f"Anonymizing {fn}")
    schema["title"] = "ANONYM"
    ref["$ref"] = "#/definitions/ANONYM"
    # pop before re-inserting so that a title already named ANONYM survives
    defs["ANONYM"] = defs.pop(title)
    defs["ANONYM_records"] = defs.pop(rec)
    defs["ANONYM"]["properties"]["records"]["items"]["$ref"] = "#/definitions/ANONYM_records"
|