json-schema-utils 0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jsutils/simplify.py ADDED
@@ -0,0 +1,580 @@
1
+ # TODO
2
+ # oneOf [ { "enum": [] }, { "const": } ]
3
+ # import urllib
4
+ from typing import Any
5
+ import copy
6
+ from .utils import JsonSchema, log, JSUError, only
7
+ from .recurse import recurseSchema
8
+ from .inline import mergeProperty
9
+
10
+ # type-specific properties
11
+ # TODO complete
12
TYPED_PROPS: dict[str, set[str]] = {
    # format: not in theory, quite often in practice
    "string": {"minLength", "maxLength", "pattern"},
    "number": {"minimum", "exclusiveMinimum", "maximum", "exclusiveMaximum", "multipleOf"},
    "object": {"additionalProperties", "unevaluatedProperties", "propertyNames", "required",
               "properties", "minProperties", "maxProperties", "patternProperties"},
    "array": {"items", "minItems", "maxItems", "prefixItems", "contains", "minContains",
              "maxContains", "unevaluatedItems", "additionalItems"},
    "boolean": set(),
    "null": set()
}


def incompatibleProps(st: str) -> set[str]:
    """Return the keywords that are specific to any type other than *st*.

    The result is a fresh set on each call, so callers may mutate it without
    affecting ``TYPED_PROPS``.  An unknown *st* yields the union of all
    type-specific keywords.
    """
    # one union call instead of a comprehension evaluated for its side effect
    return set().union(*(props for tname, props in TYPED_PROPS.items() if tname != st))
29
+
30
+
31
+ # string-specific predefined formats
32
+ # NOTE some extensions use other formats, eg for "int32" for numbers
33
STRING_FORMATS: set[str] = {
    # dates and times
    "date",
    "date-time",
    "time",
    "duration",
    # email addresses
    "email",
    "idn-email",
    # hosts and network addresses
    "hostname",
    "idn-hostname",
    "ipv4",
    "ipv6",
    # resource identifiers
    "uri",
    "uri-reference",
    "uri-template",
    "iri",
    "iri-reference",
    "uuid",
    # JSON pointers
    "json-pointer",
    "relative-json-pointer",
    # regular expressions
    "regex",
}
43
+
44
+
45
def counts(lv: list[Any]) -> dict[Any, int]:
    """Count occurrences of each value in *lv*.

    Values must be hashable.  Keys of the returned plain ``dict`` appear in
    order of first occurrence.
    """
    # local import: the standard library already provides the counting
    from collections import Counter

    return dict(Counter(lv))
51
+
52
+
53
def getEnum(ls: list[JsonSchema], is_one: bool) -> list[Any]|None:
    """Attempt to extract a list of constants.

    Each item of *ls* must be an object schema holding a ``const`` or an
    ``enum`` (``const`` wins when both are present, other keywords are not
    looked at); otherwise ``None`` is returned.

    :param ls: list of sub-schemas, e.g. the value of ``oneOf`` or ``anyOf``.
    :param is_one: ``oneOf`` semantics: a value offered by several sub-schemas
        is dropped altogether; otherwise (``anyOf``) only repetitions are
        dropped.
    :return: the values in order of first appearance, or ``None``.
    """
    assert isinstance(ls, list)

    def key(v: Any):
        """Hashable, type-aware identity of a JSON value."""
        # booleans are kept apart from numbers: in Python True == 1, in JSON not
        if isinstance(v, bool):
            return ("boolean", v)
        # arrays and objects are unhashable, build a structural key instead
        if isinstance(v, (list, tuple)):
            return ("array", tuple(key(i) for i in v))
        if isinstance(v, dict):
            return ("object", frozenset((n, key(i)) for n, i in v.items()))
        # None, strings and numbers (1 and 1.0 still compare equal)
        return ("scalar", v)

    found: list[tuple[Any, Any]] = []  # (key, value) in order of appearance
    cnt: dict[Any, int] = {}

    for s in ls:
        if not isinstance(s, dict):
            return None
        if "const" in s:
            values = [s["const"]]
        elif "enum" in s:
            assert isinstance(s["enum"], list)
            values = s["enum"]
        else:
            return None
        # a value repeated inside one enum counts once for this sub-schema
        local = set()
        for v in values:
            k = key(v)
            if k in local:
                continue
            local.add(k)
            found.append((k, v))
            cnt[k] = cnt.get(k, 0) + 1

    if is_one:
        # fully remove duplicates
        return [v for k, v in found if cnt[k] == 1]

    # only remove duplicates
    result, seen = [], set()
    for k, v in found:
        if k not in seen:
            seen.add(k)
            result.append(v)
    return result
76
+
77
+
78
+ def _typeCompat(t: str, v: Any) -> bool:
79
+ """Check JSON type / value compatibility."""
80
+ return ((t == "null" and v is None) or
81
+ (t == "boolean" and isinstance(v, bool)) or
82
+ (t == "number" and isinstance(v, (int, float))) or
83
+ (t == "string" and isinstance(v, str)) or
84
+ (t == "array" and isinstance(v, (list, tuple))) or
85
+ (t == "object" and isinstance(v, dict)))
86
+
87
+ _IGNORABLE = (
88
+ # core
89
+ "$schema", "$id", "$comment", "$vocabulary", "$anchor", "$dynamicAnchor",
90
+ # metadata
91
+ "description", "title", "readOnly", "writeOnly", "default", "examples", "deprecated",
92
+ # namespace
93
+ "definitions", "$defs",
94
+ )
95
+
96
+ def _ignored(schema: JsonSchema) -> JsonSchema:
97
+ """Remove preperties dans do not need to be considered."""
98
+ if isinstance(schema, bool):
99
+ return schema
100
+ schema = copy.deepcopy(schema)
101
+ for keyword in _IGNORABLE:
102
+ if keyword in schema:
103
+ del schema[keyword]
104
+ return schema
105
+
106
def same(s1: JsonSchema, s2: JsonSchema) -> bool:
    """Tell whether two schemas are equal once ignorable keywords are removed."""
    left = _ignored(s1)
    right = _ignored(s2)
    return left == right
108
+
109
def simplifySchema(schema: JsonSchema, url: str):
    """Simplify a JSON Schema with various rules.

    The rules live in the nested ``rwtSimpler`` rewriter, which is handed to
    ``recurseSchema`` (presumably applied to each sub-schema together with its
    path; see the ``.recurse`` module).  Rules, in order:

    - on drafts up to 07, drop keywords adjacent to ``$ref``;
    - turn ``$dynamicRef`` to the root ``$dynamicAnchor`` into ``$ref: "#"``;
    - merge ``anyOf``/``oneOf``/``allOf`` of one object schema into its parent;
    - turn ``oneOf``/``anyOf`` of ``const``/``enum`` into a plain ``enum``;
    - remove ``then``/``else`` which only repeat the enclosing schema, and
      dangling ``if``/``then``/``else``; replace ``if not X`` by ``if X``;
    - shorten one-item ``type`` lists, drop keywords which do not apply to the
      declared type, check ``const``/``enum`` against the type;
    - remove always-true ``items``/``additionalProperties``-like keywords;
    - turn ``propertyNames`` + ``additionalProperties`` into
      ``patternProperties``;
    - reconcile ``const`` and ``enum``.

    :param schema: schema to simplify, dictionaries are updated in place.
    :param url: passed through to ``recurseSchema``.
    :return: whatever ``recurseSchema`` returns.
    """

    # schema version for $ref aggressive pruning
    version: int
    if isinstance(schema, dict) and "$schema" in schema and isinstance(schema["$schema"], str):
        ds = schema["$schema"]
        version = \
            9 if "2020-12" in ds else \
            8 if "2019-09" in ds else \
            7 if "draft-07" in ds else \
            6 if "draft-06" in ds else \
            4 if "draft-04" in ds else \
            3 if "draft-03" in ds else \
            2 if "draft-02" in ds else \
            1 if "draft-01" in ds else \
            9
    else:
        version = 9  # 2020-12

    # TODO more generic dynamicAnchor removal
    # TODO anchor removal?
    # FIXME check that there is only one dynamicAnchor of this name?!
    dynroot: str|None = None
    if isinstance(schema, dict) and "$dynamicAnchor" in schema:
        dynroot = schema["$dynamicAnchor"]
        del schema["$dynamicAnchor"]

    def rwtSimpler(schema: JsonSchema, path: list[str]) -> JsonSchema:
        """Simplify one schema, return it or ``False`` if nothing can match."""

        # path segments may not all be strings, convert before joining
        lpath = ".".join(map(str, path)) if path else "."

        if isinstance(schema, bool):
            return schema
        assert isinstance(schema, dict)

        # references
        if "$ref" in schema and version <= 7:
            # https://json-schema.org/draft-07/draft-handrews-json-schema-01#rfc.section.8.3
            keep = {p: v for p, v in schema.items() if p in _IGNORABLE or p == "$ref"}
            if len(keep) != len(schema):
                log.warning(f"dropping all props adjacent to $ref on old schemas at {path}")
            return keep

        if isinstance(dynroot, str):
            if path and "$dynamicAnchor" in schema and schema["$dynamicAnchor"] == dynroot:
                log.error(f"Ooops: multiple root dynamic anchor: {dynroot}")
                raise Exception("FIXME!")

            if "$dynamicRef" in schema:
                dref = schema["$dynamicRef"]
                if dref == "#" + dynroot:
                    log.info(f"replacing root $dynamicAnchor with simple $ref at {path}")
                    del schema["$dynamicRef"]
                    schema["$ref"] = "#"

        # TODO anyOf/oneOf/allOf of length 0?
        # anyOf/oneOf/allOf of length 1
        for prop in ("anyOf", "oneOf", "allOf"):
            if not (isinstance(schema, dict) and prop in schema):
                continue
            subs = schema[prop]
            if not isinstance(subs, list) or len(subs) != 1:
                continue
            sub = subs[0]
            if not isinstance(sub, dict):
                # boolean sub-schema: nothing to merge, leave it alone
                continue
            try:
                nschema = copy.deepcopy(schema)
                for p, v in sub.items():
                    nschema = mergeProperty(nschema, p, v)
                # success!
                schema = nschema
                if isinstance(schema, dict):
                    del schema[prop]
            except JSUError as e:
                log.debug(e)
                log.warning(f"{prop} of one merge failed")

        if isinstance(schema, bool):
            return schema
        assert isinstance(schema, dict)

        # TODO detect inconsistent allOf?

        # switch oneOf/anyOf const/enum to enum/const
        for prop in ("oneOf", "anyOf"):
            if prop in schema:
                val = schema[prop]
                assert isinstance(val, list)
                lv = getEnum(val, prop == "oneOf")  # pyright: ignore
                if lv is not None:
                    del schema[prop]
                    log.info(f"{prop} to enum/const/false at {lpath}")
                    if len(lv) == 0:
                        # FIXME check
                        return False
                    if "enum" in schema:
                        lev = schema["enum"]
                        assert isinstance(lev, list)
                        # intersect in initial order
                        schema["enum"] = [v for v in lev if v in lv]
                    else:
                        schema["enum"] = lv

        # void condition application
        for kw in ("then", "else"):
            subs = schema.get(kw)
            if not isinstance(subs, dict):
                # absent, or a boolean schema which cannot be inspected
                continue
            compat = True
            for k, v in subs.items():
                if k in _IGNORABLE:
                    continue
                if k not in schema:
                    # a constraint the enclosing schema does not have
                    compat = False
                elif v == schema[k]:
                    pass
                elif k == "required" and isinstance(v, list) and \
                        isinstance(schema[k], list):
                    # special case, check for inclusion
                    if any(n not in schema[k] for n in v):
                        compat = False
                else:
                    # same keyword with another value: keep the branch
                    compat = False
            if compat:
                log.info(f"removing ineffective {kw}")
                del schema[kw]

        # if/then/else
        if "if" not in schema:
            for kw in ("then", "else"):
                if kw in schema:
                    log.info(f"removing {kw} without if")
                    del schema[kw]
        if "if" in schema and not ("then" in schema or "else" in schema):
            log.info(f"removing lone if at {path}")
            del schema["if"]

        # simplify condition if possible
        if "if" in schema:
            cond = schema["if"]
            # the condition may be a boolean schema, which has no "not"
            if isinstance(cond, dict) and "not" in cond and only(cond, "not", *_IGNORABLE):
                log.info("simplifying if not")
                schema["if"] = cond["not"]
                sthen = schema.get("then", None)
                selse = schema.get("else", None)
                # swap branches
                if sthen is not None:
                    schema["else"] = sthen
                    if selse is not None:
                        schema["then"] = selse
                    else:
                        del schema["then"]
                else:
                    assert selse is not None
                    schema["then"] = selse
                    del schema["else"]

        # short type list
        if "type" in schema and isinstance(schema["type"], list):
            types = schema["type"]
            if len(types) == 0:
                return False
            elif len(types) == 1:
                schema["type"] = types[0]

        # type/props…
        if "type" in schema and isinstance(schema["type"], str):
            stype = schema["type"]
            if stype == "number":
                if "multipleOf" in schema and schema["multipleOf"] == 1:
                    schema["type"] = "integer"
                    del schema["multipleOf"]
            if stype == "integer":
                if "multipleOf" in schema and schema["multipleOf"] == 1:
                    del schema["multipleOf"]
                # use this for later type-related checks
                stype = "number"
            # remember integers, values are checked more strictly below
            integral = schema["type"] == "integer"

            def fits(v: Any) -> bool:
                """Tell whether constant *v* is an instance of the declared type."""
                if not _typeCompat(stype, v):
                    return False
                if stype == "number":
                    # Python booleans are ints, JSON booleans are not numbers
                    if isinstance(v, bool):
                        return False
                    if integral and isinstance(v, float) and not v.is_integer():
                        return False
                return True

            # remove type-specific properties, for known JSON types only
            if stype in TYPED_PROPS:
                for p in incompatibleProps(stype):
                    if p in schema:
                        log.info(f"unused property {p} for {stype} at {lpath}")
                        del schema[p]
                fmt = schema.get("format")
                if stype != "string" and isinstance(fmt, str) and fmt in STRING_FORMATS:
                    log.info(f"unused string format on {stype}: {schema['format']}")
                    del schema["format"]
            # type/const
            if "const" in schema:
                cst = schema["const"]
                if fits(cst):
                    log.info(f"removing redundant type with const at {lpath}")
                    del schema["type"]
                else:
                    log.info(f"incompatible type {stype} for {cst} at {lpath}")
                    return False
            # type/enum
            if "enum" in schema:
                vals = schema["enum"]
                assert isinstance(vals, list)
                nvals = [v for v in vals if fits(v)]
                if len(vals) != len(nvals):
                    log.info(f"removing {len(vals) - len(nvals)} incompatible values "
                             f"from enum at {lpath}")
                    schema["enum"] = nvals
                # type may already be gone if there was a const as well
                schema.pop("type", None)
            # simplify any array
            if stype == "array":
                simpler = _ignored(schema)
                assert isinstance(simpler, dict)  # pyright hint
                if len(simpler) == 2 and "type" in schema:
                    # lone keyword
                    for kw in ("items", "additionalItems", "unevaluatedItems"):
                        if kw in schema:
                            subschema = _ignored(schema[kw])  # pyright: ignore
                            if subschema in (True, {}):
                                log.info(f"removing useless {kw} keyword at {lpath}")
                                del schema[kw]
            # simplify any object
            if stype == "object":
                simpler = _ignored(schema)
                assert isinstance(simpler, dict)  # pyright hint
                if len(simpler) == 2 and "type" in schema:
                    # lone keyword
                    for kw in ("additionalProperties", "unevaluatedProperties"):
                        if kw in schema:
                            subschema = _ignored(schema[kw])  # pyright: ignore
                            if subschema in (True, {}):
                                log.info(f"removing useless {kw} keyword at {lpath}")
                                del schema[kw]

        # simplify propertyNames + additionalProperties to patternProperties
        # NOTE(review): patternProperties alone does not reject property names
        # which do not match the pattern, whereas propertyNames does; confirm
        # that this relaxation is intended.
        if "propertyNames" in schema and "additionalProperties" in schema and \
                "properties" not in schema and "patternProperties" not in schema:
            pn = schema["propertyNames"]
            ap = schema["additionalProperties"]
            # propertyNames may be a boolean schema
            if isinstance(pn, dict) and "pattern" in pn and \
                    only(pn, "pattern", "type", *_IGNORABLE):
                log.info(f"switching propertyNames and additionalProperties to patternProperties at {lpath}")
                del schema["propertyNames"]
                del schema["additionalProperties"]
                schema["patternProperties"] = {pn["pattern"]: ap}

        # const/enum
        if "const" in schema and "enum" in schema:
            log.info(f"const/enum at {lpath}")
            assert isinstance(schema["enum"], list)
            if schema["const"] in schema["enum"]:
                del schema["enum"]
            else:
                return False
        elif "enum" in schema:
            assert isinstance(schema["enum"], list)
            nenum = len(schema["enum"])
            if nenum == 0:
                log.info(f"empty enum at {lpath}")
                return False
            elif nenum == 1:
                log.info(f"enum of one at {lpath}")
                schema["const"] = schema["enum"][0]
                del schema["enum"]

        return schema

    return recurseSchema(schema, url, rwt=rwtSimpler)
372
+
373
+ #
374
+ # move definitions at the root and resolve ids
375
+ #
376
+ from urllib.parse import quote, unquote
377
+
378
+ def _defId(schema) -> tuple[str|None, str|None]:
379
+ """return name of definitions and id properties."""
380
+ if not isinstance(schema, dict):
381
+ return (None, None)
382
+ defn = "$defs" if "$defs" in schema else \
383
+ "definitions" if "definitions" in schema else \
384
+ None
385
+ idn = "$id" if "$id" in schema else \
386
+ "id" if "id" in schema else \
387
+ None
388
+ return (defn, idn)
389
+
390
_SUBCOUNT: int = 0

# TODO handle arbitrary path references


def _scopeSubDefs(schema: JsonSchema, defs: dict[str, JsonSchema], rootdef: str,
                  moved: dict[str, str], ids: dict[str, str],
                  delete: list[tuple[Any, ...]],
                  path: list[str|int]|None = None):
    """Register the nested definitions of one sub-schema into the root ones.

    Nothing is done for the root schema (empty *path*) or when the schema has
    no ``$defs``/``definitions``.  Otherwise definitions are added to *defs*
    under new names built from the module-wide ``_SUBCOUNT`` counter:

    - without an identifier, definitions are renamed and old/new reference
      paths are recorded in *moved*;
    - with ``$id``/``id``, the identifier is removed, local references inside
      the schema are rewritten, and the identifier is recorded in *ids*.

    The nested definitions themselves are only scheduled for removal through
    *delete*.

    :param schema: sub-schema to process, updated in place.
    :param defs: root definitions, receives the moved definitions.
    :param rootdef: reference path of the root definitions, e.g. ``#/$defs``.
    :param moved: old reference to new reference mapping, updated in place.
    :param ids: identifier to new reference mapping, updated in place.
    :param delete: receives ``(schema, definitions keyword, prefix, new
        reference, identifier)`` tuples; the last three are ``None`` when
        there is no identifier.
    :param path: location of *schema* within the root schema.
    """
    # no shared mutable default
    path = [] if path is None else path

    log.debug(f"handing $ids/$defs at {path}")

    global _SUBCOUNT
    defn, idn = _defId(schema)

    if defn is None:
        return

    if path and defn and not idn:
        # nested definitions, move them up

        prefix = f"_defs_{_SUBCOUNT}_"
        _SUBCOUNT += 1

        # segments may be list indexes, hence the conversion
        here = "/".join(map(str, path))

        for name, sschema in schema[defn].items():
            # FIXME name may be quite ugly… eg a full URL
            if "/" not in name:  # reuse name if simple
                new_name = prefix + name
                old_name = name
            else:
                new_name = f"_dsub_{_SUBCOUNT}_"
                _SUBCOUNT += 1
                old_name = quote(name).replace("~", "~0").replace("/", "~1")
            npath = rootdef + "/" + new_name
            opath = f"#/{here}/{defn}/{old_name}"
            # a definition may be a boolean schema, which cannot hold a comment
            if isinstance(sschema, dict):
                sschema["$comment"] = f"origin: {opath}"
            moved[opath] = npath
            defs[new_name] = sschema

        schema["$comment"] = f"{defn} {_SUBCOUNT} moved"

        delete.append((schema, defn, None, None, None))

    elif path and defn and idn:
        # if we have a nested id, we move definitions to defs and rewrite local refs

        sid = schema[idn]
        assert isinstance(sid, str)

        del schema[idn]
        if "id" in schema:  # WTF: both $id and id…
            del schema["id"]

        # keep track of changes
        schema["$comment"] = f"{idn} {_SUBCOUNT}: {sid}"

        prefix = f"_id_{_SUBCOUNT}_"
        _SUBCOUNT += 1

        # to remap long references later
        # we have a local path for an external url
        ids[sid] = rootdef + "/" + prefix
        iddefs = f"#/{defn}/"
        # id's defs will be there
        moved[sid + iddefs] = rootdef + "/" + prefix

        # remap all sub-schema local references
        def rwtRef(schema, lpath):
            if isinstance(schema, dict) and "$ref" in schema:
                dest = schema["$ref"]
                assert isinstance(dest, str)
                if dest.startswith(iddefs):  # local ref
                    schema["$ref"] = rootdef + "/" + prefix + dest[len(iddefs):]
                elif dest in ("#", "#/"):  # myself, will have to be made consistent later!
                    schema["$ref"] = ids[sid]
            return schema

        recurseSchema(schema, "", rwt=rwtRef)

        # move local definitions as global
        for name, sschem in schema[defn].items():
            pname = prefix + name
            assert pname not in defs
            defs[pname] = sschem

        # we need to keep the schema in place for handling arbitrary url
        # whole object will be moved later
        delete.append((schema, defn, prefix, ids[sid], sid))
479
+
480
+
481
def scopeDefs(schema: JsonSchema):
    """Move internal definitions/$defs to root schema, possibly handling nested $id.

    The schema is updated in place, in four steps:

    1. collect the non-root sub-schemas holding an identifier or definitions;
    2. register their definitions at the root (see ``_scopeSubDefs``);
    3. copy the target of each other ``#/...`` reference which is not a plain
       root definition into a new root definition, then rewrite all
       references, including those using nested identifiers;
    4. remove the nested definitions, and replace each identified sub-schema
       with a ``$ref`` to its new root definition.

    Nothing is changed if no nested identifier or definitions are found.

    :raises KeyError: when a local reference path does not exist.
    """

    # collect $id/id and $defs/definitions
    todo_ids, todo_defs = [], []

    def fltDefs(schema, path):
        if path and isinstance(schema, dict):
            defn, idn = _defId(schema)
            if idn is not None:
                todo_ids.append((schema, path))
            elif defn is not None:
                todo_defs.append((schema, path))
        return True

    recurseSchema(schema, "", flt=fltDefs)

    if not todo_ids and not todo_defs:
        return

    # ensure definitions root
    defn, idn = _defId(schema)

    if defn is None:
        defn = "$defs"
        schema[defn] = {}

    # do internal renamings
    rootdef, moved, ids, delete = f"#/{defn}", {}, {}, []

    # identified sub-schemas first, then anonymous definitions
    for s, p in todo_ids:
        _scopeSubDefs(s, schema[defn], rootdef, moved, ids, delete, p)

    for s, p in todo_defs:
        _scopeSubDefs(s, schema[defn], rootdef, moved, ids, delete, p)

    # move arbitrary references
    def mvRef(rschema, path):
        global _SUBCOUNT
        if isinstance(rschema, dict) and "$ref" in rschema:
            dest = rschema["$ref"]
            # log.debug(f"found {dest} at {path}")
            # a non-string $ref is left for later checks to report
            if isinstance(dest, str) and dest.startswith("#/") and dest not in moved:
                dpath = dest[2:].split("/")
                if len(dpath) != 2 or dpath[0] != defn:
                    # not a simple name, follow path
                    jdest = schema
                    for segment in dpath:
                        if isinstance(jdest, dict):
                            # try the raw segment first, then its unescaped form
                            if segment not in jdest and ("~" in segment or "%" in segment):
                                segment = unquote(segment).replace("~1", "/").replace("~0", "~")
                            if segment not in jdest:
                                # do not silently stay on the parent node
                                raise KeyError(f"cannot follow path {dpath} at {segment}")
                            jdest = jdest[segment]
                        elif isinstance(jdest, list):
                            jdest = jdest[int(segment)]  # TODO proper exception
                        else:
                            raise Exception(f"cannot follow path {dpath} at {segment}")
                    name = f"_psub_{_SUBCOUNT}_"
                    _SUBCOUNT += 1
                    ndest = f"#/{defn}/{name}"
                    # log.info(f"moving {dest} to {ndest}")
                    schema[defn][name] = copy.deepcopy(jdest)
                    rschema["$ref"] = ndest
                    moved[dest] = ndest  # for other identical references
                # TODO also rename ugly references?
        return rschema

    recurseSchema(schema, "", rwt=mvRef)

    # do full url renamings and other references renamings
    def rwtGref(schema, path):
        if isinstance(schema, dict) and "$ref" in schema:
            dest = schema["$ref"]
            assert isinstance(dest, str), f"str $ref at {path}"
            if dest in moved:
                schema["$ref"] = moved[dest]
            elif dest and dest[0] != "#":
                # inefficient
                for old, new in moved.items():
                    if dest.startswith(old):
                        # log.warning(f"dest={dest} old={old} new={new}")
                        schema["$ref"] = new + dest[len(old):]
                if dest in ids:
                    log.warning(f"rewriting raw url: {dest} as {ids[dest]}")
                    schema["$ref"] = ids[dest]
        return schema

    recurseSchema(schema, "", rwt=rwtGref)

    # cleanup internal definitions
    for j, n, prefix, dest, sid in delete:
        del j[n]
        if prefix is not None:
            # move whole id-ed object as global as well, replaced with a ref
            schema[defn][prefix] = dict(j)
            j.clear()
            j["$comment"] = f"{sid} moved as $def"
            j["$ref"] = dest