tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/metadata.py ADDED
@@ -0,0 +1,1147 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2020-2025 Tskit Developers
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+ """
23
+ Classes for metadata decoding, encoding and validation
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import abc
28
+ import builtins
29
+ import collections
30
+ import copy
31
+ import functools
32
+ import json
33
+ import pprint
34
+ import struct
35
+ import types
36
+ from collections.abc import Mapping
37
+ from itertools import islice
38
+ from typing import Any
39
+
40
+ import jsonschema
41
+ import numpy as np
42
+
43
+ import tskit
44
+ import tskit.exceptions as exceptions
45
+ import tskit.util as util
46
+
47
+ __builtins__object__setattr__ = builtins.object.__setattr__
48
+
49
+
50
def replace_root_refs(obj):
    """
    Recursively rewrite any ``{"$ref": "#"}`` self-reference in a JSON-schema
    fragment to point at ``#/definitions/root`` instead, returning a new
    structure.

    Only exact ``list`` and ``dict`` instances are recursed into; any other
    value (including subclasses) is returned unchanged.
    """
    if type(obj) is dict:
        rewritten = {key: replace_root_refs(value) for key, value in obj.items()}
        if rewritten.get("$ref") == "#":
            rewritten["$ref"] = "#/definitions/root"
        return rewritten
    if type(obj) is list:
        return [replace_root_refs(item) for item in obj]
    return obj
60
+
61
+
62
# Our schema is the Draft7Validator schema with added codec information.
TSKITMetadataSchemaValidator = jsonschema.validators.extend(
    jsonschema.validators.Draft7Validator
)
deref_meta_schema: Mapping[str, Any] = copy.deepcopy(
    TSKITMetadataSchemaValidator.META_SCHEMA
)
# We need a top-level only required property so we need to rewrite any reference
# to the top-level schema to a copy in a definition.
deref_meta_schema = replace_root_refs(deref_meta_schema)
deref_meta_schema["definitions"]["root"] = copy.deepcopy(deref_meta_schema)
deref_meta_schema["codec"] = {"type": "string"}
deref_meta_schema["required"] = ["codec"]
# For interoperability reasons, force the top-level to be an object or union
# of object and null
deref_meta_schema["properties"]["type"] = {"enum": ["object", ["object", "null"]]}
# Change the schema URL to avoid jsonschema's cache.
# Fixed: the URL previously read "draft-o=07", a corruption of "draft-07".
deref_meta_schema["$schema"] = "http://json-schema.org/draft-07/schema#tskit"
TSKITMetadataSchemaValidator.META_SCHEMA = deref_meta_schema
81
+
82
+
83
class AbstractMetadataCodec(metaclass=abc.ABCMeta):
    """
    Superclass of all MetadataCodecs.

    Concrete codecs are constructed from a schema mapping and must implement
    :meth:`encode` and :meth:`decode`.
    """

    def __init__(self, schema: Mapping[str, Any]) -> None:
        raise NotImplementedError  # pragma: no cover

    @classmethod
    def modify_schema(cls, schema: Mapping) -> Mapping:
        # Hook for codecs to rewrite the schema before validation/storage
        # (e.g. to inject defaults); the base implementation is a no-op.
        # Fixed: first parameter of a classmethod is conventionally ``cls``,
        # not ``self`` (PEP 8).
        return schema

    @classmethod
    def is_schema_trivial(cls, schema: Mapping) -> bool:
        # True when the schema is so permissive that validation can never
        # fail, allowing callers to skip validation for speed.
        return False

    @abc.abstractmethod
    def encode(self, obj: Any) -> bytes:
        raise NotImplementedError  # pragma: no cover

    @abc.abstractmethod
    def decode(self, encoded: bytes) -> Any:
        raise NotImplementedError  # pragma: no cover

    def numpy_dtype(self, schema) -> Any:
        # Codecs that can map their schema to a NumPy structured dtype
        # override this; by default the conversion is unsupported.
        raise NotImplementedError
109
+
110
+
111
# Module-level mapping from codec identifier strings to codec classes.
codec_registry = {}


def register_metadata_codec(
    codec_cls: type[AbstractMetadataCodec], codec_id: str
) -> None:
    """
    Register a metadata codec class.
    This function maintains a mapping from metadata codec identifiers used in schemas
    to codec classes. When a codec class is registered, it will replace any class
    previously registered under the same codec identifier, if present.

    :param str codec_id: String to use to refer to the codec in the schema.
    """
    codec_registry.update({codec_id: codec_cls})
126
+
127
+
128
class JSONCodec(AbstractMetadataCodec):
    """
    Codec that stores metadata as UTF-8 encoded canonical JSON.

    Top-level property defaults are filled in on decode; defaults at deeper
    levels are rejected by the schema validator.
    """

    def default_validator(validator, types, instance, schema):
        # For json codec defaults must be at the top level
        if validator.is_type(instance, "object"):
            for v in instance.get("properties", {}).values():
                for v2 in v.get("properties", {}).values():
                    if "default" in v2:
                        yield jsonschema.ValidationError(
                            "Defaults can only be specified at the top level"
                            " for JSON codec"
                        )

    schema_validator = jsonschema.validators.extend(
        TSKITMetadataSchemaValidator, {"default": default_validator}
    )

    @classmethod
    def is_schema_trivial(cls, schema: Mapping) -> bool:
        # With no properties constrained, validation can never fail.
        # Fixed: classmethod first parameter renamed ``self`` -> ``cls``.
        return len(schema.get("properties", {})) == 0

    def __init__(self, schema: Mapping[str, Any]) -> None:
        try:
            self.schema_validator.check_schema(schema)
        except jsonschema.exceptions.SchemaError as ve:
            raise exceptions.MetadataSchemaValidationError(str(ve)) from ve

        # Find default values to fill in on decode, top level only
        self.defaults = {
            key: prop["default"]
            for key, prop in schema.get("properties", {}).items()
            if "default" in prop
        }

    def encode(self, obj: Any) -> bytes:
        try:
            return tskit.canonical_json(obj).encode()
        except TypeError as e:
            # Fixed: chain the original TypeError so the unencodable value
            # remains traceable (previously raised without ``from e``).
            raise exceptions.MetadataEncodingError(
                f"Could not encode metadata of type {str(e).split()[3]}"
            ) from e

    def decode(self, encoded: bytes) -> Any:
        # Empty buffer decodes to an empty object rather than a JSON error.
        if len(encoded) == 0:
            result = {}
        else:
            result = json.loads(encoded.decode())

        # Assign default values
        if isinstance(result, dict):
            return dict(self.defaults, **result)
        else:
            return result


register_metadata_codec(JSONCodec, "json")
183
+
184
+
185
class NOOPCodec(AbstractMetadataCodec):
    """
    Pass-through codec: metadata is stored and returned as raw bytes.
    """

    def __init__(self, schema: Mapping[str, Any]) -> None:
        # No schema information is needed for raw bytes.
        pass

    def encode(self, data: bytes) -> bytes:
        """Return *data* unchanged."""
        return data

    def decode(self, data: bytes) -> bytes:
        """Return *data* unchanged."""
        return data
194
+
195
+
196
def binary_format_validator(validator, types, instance, schema):
    """
    jsonschema keyword hook for ``type`` used by the struct codec meta-schema.

    Performs the normal ``type`` validation, then additionally requires that
    every non-composite property declares ``binaryFormat`` and that a ``null``
    property's format (if any) is struct padding.
    """
    # We're hooking into jsonschemas validation code here, which works by creating
    # generators of exceptions, hence the yielding

    # Make sure the normal type validation gets done
    try:
        yield from jsonschema._validators.type(validator, types, instance, schema)
    except AttributeError:
        # Needed since jsonschema==4.19.1
        # NOTE(review): relies on jsonschema private modules (_validators /
        # _keywords); both branches are needed across library versions.
        yield from jsonschema._keywords.type(validator, types, instance, schema)

    # Non-composite types must have a binaryFormat
    if validator.is_type(instance, "object"):
        for v in instance.values():
            if (
                isinstance(v, dict)
                and v.get("type")
                not in (None, "object", "array", "null", ["object", "null"])
                and "binaryFormat" not in v
            ):
                yield jsonschema.ValidationError(
                    f"{v['type']} type must have binaryFormat set"
                )
    # null type must be padding
    # A "null" property may only use a padding format such as "5x" (last
    # character "x" in struct format syntax).
    if (
        validator.is_type(instance, "object")
        and "null" in instance
        and instance["null"].get("type") == "null"
        and "binaryFormat" in instance["null"]
        and instance["null"]["binaryFormat"][-1] != "x"
    ):
        yield jsonschema.ValidationError(
            'null type binaryFormat must be padding ("x") if set'
        )
230
+
231
+
232
def array_length_validator(validator, types, instance, schema):
    """
    jsonschema keyword hook for ``properties`` used by the struct codec
    meta-schema.

    Runs the standard ``properties`` validation, then checks that array
    properties do not combine ``length`` with ``noLengthEncodingExhaustBuffer``
    or ``arrayLengthFormat``, and that any fixed ``length`` is non-negative.
    """
    # Validate that array schema doesn't have both length and
    # noLengthEncodingExhaustBuffer set. Also ensure that arrayLengthFormat
    # is not set when length is set.

    # Call the normal properties validator first
    try:
        yield from jsonschema._validators.properties(validator, types, instance, schema)
    except AttributeError:
        # Needed since jsonschema==4.19.1
        yield from jsonschema._keywords.properties(validator, types, instance, schema)
    # NOTE(review): assumes "properties" is present on the instance being
    # checked; this hook only fires when the keyword exists in the schema.
    for prop, sub_schema in instance["properties"].items():
        if sub_schema.get("type") == "array":
            has_length = "length" in sub_schema
            has_exhaust = sub_schema.get("noLengthEncodingExhaustBuffer", False)

            if has_length and has_exhaust:
                yield jsonschema.ValidationError(
                    f"{prop} array cannot have both 'length' and "
                    "'noLengthEncodingExhaustBuffer' set"
                )

            if has_length and "arrayLengthFormat" in sub_schema:
                yield jsonschema.ValidationError(
                    f"{prop} fixed-length array should not specify 'arrayLengthFormat'"
                )

            if has_length and sub_schema["length"] < 0:
                yield jsonschema.ValidationError(
                    f"{prop} array length must be non-negative, got "
                    f"{sub_schema['length']}"
                )
264
+
265
+
266
def required_validator(validator, required, instance, schema):
    """
    jsonschema keyword hook for ``required`` used by the struct codec
    meta-schema.

    Runs the standard ``required`` validation, then enforces the struct-codec
    rule that any property not listed in ``required`` must supply a
    ``default`` value (so a fixed binary layout can always be produced).
    """
    # Do the normal validation
    try:
        yield from jsonschema._validators.required(
            validator, required, instance, schema
        )
    except AttributeError:
        # Needed since jsonschema==4.19.1
        yield from jsonschema._keywords.required(validator, required, instance, schema)

    # For struct codec if a property is not required, then it must have a default
    for prop, sub_schema in instance["properties"].items():
        if prop not in instance["required"] and "default" not in sub_schema:
            yield jsonschema.ValidationError(
                f"Optional property '{prop}' must have" f" a default value"
            )
282
+
283
+
284
# Validator for struct-codec schemas: the base tskit validator with extra
# keyword hooks enforcing struct-specific constraints.
StructCodecSchemaValidator = jsonschema.validators.extend(
    TSKITMetadataSchemaValidator,
    {
        "type": binary_format_validator,
        "required": required_validator,
        "properties": array_length_validator,
    },
)
struct_meta_schema: Mapping[str, Any] = copy.deepcopy(
    StructCodecSchemaValidator.META_SCHEMA
)
# No union types
struct_meta_schema["definitions"]["root"]["properties"]["type"] = {
    "$ref": "#/definitions/simpleTypes"
}
# No heterogeneous arrays
struct_meta_schema["properties"]["items"] = {"$ref": "#/definitions/root"}
struct_meta_schema["definitions"]["root"]["properties"]["items"] = struct_meta_schema[
    "properties"
]["items"]
# binaryFormat matches regex (single struct format char, or counted
# s/p/x forms such as "10s")
struct_meta_schema["properties"]["binaryFormat"] = {
    "type": "string",
    "pattern": r"^([cbB\?hHiIlLqQfd]|\d*[spx])$",
}
struct_meta_schema["definitions"]["root"]["properties"]["binaryFormat"] = (
    struct_meta_schema["properties"]["binaryFormat"]
)
# arrayLengthFormat matches regex and has default
struct_meta_schema["properties"]["arrayLengthFormat"] = {
    "type": "string",
    "pattern": r"^[BHILQ]$",
    "default": "L",
}
struct_meta_schema["definitions"]["root"]["properties"]["arrayLengthFormat"] = (
    struct_meta_schema["properties"]["arrayLengthFormat"]
)
# index is numeric
struct_meta_schema["properties"]["index"] = {"type": "number"}
struct_meta_schema["definitions"]["root"]["properties"]["index"] = struct_meta_schema[
    "properties"
]["index"]
# stringEncoding is string and has default
struct_meta_schema["properties"]["stringEncoding"] = {
    "type": "string",
    "default": "utf-8",
}
struct_meta_schema["definitions"]["root"]["properties"]["stringEncoding"] = (
    struct_meta_schema["properties"]["stringEncoding"]
)
# nullTerminated is a boolean
struct_meta_schema["properties"]["nullTerminated"] = {"type": "boolean"}
struct_meta_schema["definitions"]["root"]["properties"]["nullTerminated"] = (
    struct_meta_schema["properties"]["nullTerminated"]
)
# noLengthEncodingExhaustBuffer is a boolean
struct_meta_schema["properties"]["noLengthEncodingExhaustBuffer"] = {"type": "boolean"}
struct_meta_schema["definitions"]["root"]["properties"][
    "noLengthEncodingExhaustBuffer"
] = struct_meta_schema["properties"]["noLengthEncodingExhaustBuffer"]

# length is numeric (for fixed-length arrays)
struct_meta_schema["properties"]["length"] = {"type": "integer"}
struct_meta_schema["definitions"]["root"]["properties"]["length"] = struct_meta_schema[
    "properties"
]["length"]

StructCodecSchemaValidator.META_SCHEMA = struct_meta_schema
352
+
353
+
354
class StructCodec(AbstractMetadataCodec):
    """
    Codec that encodes data using struct. Note that this codec has extra restrictions
    Namely that object keys must be fixed (all present and no extra); each entry should
    have a binaryFormat; that arrays are homogeneous and that types are not unions.

    All multi-byte values use little-endian byte order (the "<" struct prefix)
    so encoded metadata is identical across platforms.
    """

    @classmethod
    def order_by_index(cls, obj, do_sort=False):
        """
        Take a schema and recursively convert any dict that is under the key
        name ``properties`` to an OrderedDict, ordered by each property's
        ``index`` value with ties broken by property name.
        """
        if isinstance(obj, collections.abc.Mapping):
            items = obj.items()
            if do_sort:
                # Python sort is stable so we can do the sorts in reverse priority
                items = sorted(items, key=lambda k_v: k_v[0])
                items = sorted(items, key=lambda k_v: k_v[1].get("index", 0))
            items = ((k, cls.order_by_index(v, k == "properties")) for k, v in items)
            if do_sort:
                return collections.OrderedDict(items)
            else:
                return dict(items)
        elif isinstance(obj, list) or isinstance(obj, tuple):
            return [cls.order_by_index(v, False) for v in obj]
        else:
            return obj

    @classmethod
    def make_decode(cls, sub_schema):
        """
        Create a function that can decode objects of this schema
        """
        # The object/null union is only legal at the top level; all other
        # types dispatch on the single "type" string.
        if set(sub_schema["type"]) == {"object", "null"}:
            return StructCodec.make_object_or_null_decode(sub_schema)
        else:
            return {
                "array": StructCodec.make_array_decode,
                "object": StructCodec.make_object_decode,
                "string": StructCodec.make_string_decode,
                "null": StructCodec.make_null_decode,
                "number": StructCodec.make_numeric_decode,
                "integer": StructCodec.make_numeric_decode,
                "boolean": StructCodec.make_numeric_decode,
            }[sub_schema["type"]](sub_schema)

    @classmethod
    def make_array_decode(cls, sub_schema):
        # Returns one of three decoders depending on how the array length is
        # represented: fixed "length", exhaust-the-buffer, or a length prefix.
        element_decoder = StructCodec.make_decode(sub_schema["items"])
        fixed_length = sub_schema.get("length")
        array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L")
        array_length_size = struct.calcsize(array_length_f)
        exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False)

        def array_decode(buffer):
            # Length-prefixed: read the element count, then that many elements.
            array_length = struct.unpack(
                array_length_f, bytes(islice(buffer, array_length_size))
            )[0]
            return [element_decoder(buffer) for _ in range(array_length)]

        def array_decode_exhaust(buffer):
            # Keep decoding elements until struct reports buffer exhaustion.
            ret = []
            while True:
                try:
                    ret.append(element_decoder(buffer))
                except struct.error as e:
                    if "unpack requires a buffer" in str(e):
                        break
                    else:
                        raise e
            return ret

        def array_decode_fixed_length(buffer):
            return [element_decoder(buffer) for _ in range(fixed_length)]

        if fixed_length is not None:
            return array_decode_fixed_length
        elif exhaust_buffer:
            return array_decode_exhaust
        else:
            return array_decode

    @classmethod
    def make_object_decode(cls, sub_schema):
        # Properties are decoded in schema order (see order_by_index).
        sub_decoders = {
            key: StructCodec.make_decode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        return lambda buffer: {
            key: sub_decoder(buffer) for key, sub_decoder in sub_decoders.items()
        }

    @classmethod
    def make_object_or_null_decode(cls, sub_schema):
        # Top-level object/null union: an empty buffer means None.
        sub_decoders = {
            key: StructCodec.make_decode(prop)
            for key, prop in sub_schema["properties"].items()
        }

        def decode_object_or_null(buffer):
            # We have to check the buffer length for null, as the islices in
            # sub-decoders won't raise StopIteration
            buffer = list(buffer)
            if len(buffer) == 0:
                return None
            else:
                buffer = iter(buffer)
                return {
                    key: sub_decoder(buffer)
                    for key, sub_decoder in sub_decoders.items()
                }

        return decode_object_or_null

    @classmethod
    def make_string_decode(cls, sub_schema):
        f = "<" + sub_schema["binaryFormat"]
        size = struct.calcsize(f)
        encoding = sub_schema.get("stringEncoding", "utf-8")
        null_terminated = sub_schema.get("nullTerminated", False)
        if not null_terminated:
            return lambda buffer: struct.unpack(f, bytes(islice(buffer, size)))[
                0
            ].decode(encoding)
        else:

            def decode_string(buffer):
                # Truncate at the first NUL character if one is present.
                s = struct.unpack(f, bytes(islice(buffer, size)))[0].decode(encoding)
                i = s.find("\x00")
                if i == -1:
                    return s
                return s[:i]

            return decode_string

    @classmethod
    def make_null_decode(cls, sub_schema):
        if sub_schema.get("binaryFormat") is not None:
            # Padding format: consume the bytes but produce no value.
            f = sub_schema["binaryFormat"]
            size = struct.calcsize(f)

            def padding_decode(buffer):
                struct.unpack(f, bytes(islice(buffer, size)))

            return padding_decode
        else:
            return lambda _: None

    @classmethod
    def make_numeric_decode(cls, sub_schema):
        f = "<" + sub_schema["binaryFormat"]
        size = struct.calcsize(f)
        return lambda buffer: struct.unpack(f, bytes(islice(buffer, size)))[0]

    @classmethod
    def make_encode(cls, sub_schema):
        """
        Create a function that can encode objects of this schema
        """
        if set(sub_schema["type"]) == {"object", "null"}:
            return StructCodec.make_object_or_null_encode(sub_schema)
        else:
            return {
                "array": StructCodec.make_array_encode,
                "object": StructCodec.make_object_encode,
                "string": StructCodec.make_string_encode,
                "null": StructCodec.make_null_encode,
                "number": StructCodec.make_numeric_encode,
                "integer": StructCodec.make_numeric_encode,
                "boolean": StructCodec.make_numeric_encode,
            }[sub_schema["type"]](sub_schema)

    @classmethod
    def make_array_encode(cls, sub_schema):
        # Mirror of make_array_decode: fixed length, exhaust, or prefixed.
        element_encoder = StructCodec.make_encode(sub_schema["items"])
        fixed_length = sub_schema.get("length")
        array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L")
        exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False)

        def array_encode_fixed_length(array):
            if len(array) != fixed_length:
                raise ValueError(
                    f"Array length {len(array)} does not match schema"
                    f" fixed length {fixed_length}"
                )
            return b"".join(element_encoder(ele) for ele in array)

        def array_encode_exhaust(array):
            return b"".join(element_encoder(ele) for ele in array)

        def array_encode_with_length(array):
            try:
                packed_length = struct.pack(array_length_f, len(array))
            except struct.error:
                raise ValueError(
                    "Couldn't pack array size - it is likely too long"
                    " for the specified arrayLengthFormat"
                )
            return packed_length + b"".join(element_encoder(ele) for ele in array)

        if fixed_length is not None:
            return array_encode_fixed_length
        elif exhaust_buffer:
            return array_encode_exhaust
        else:
            return array_encode_with_length

    @classmethod
    def make_object_encode(cls, sub_schema):
        sub_encoders = {
            key: StructCodec.make_encode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        # Optional properties fall back to their schema defaults.
        defaults = {
            key: prop["default"]
            for key, prop in sub_schema["properties"].items()
            if "default" in prop
        }

        def object_encode(obj):
            values = []
            for key, sub_encoder in sub_encoders.items():
                try:
                    values.append(sub_encoder(obj[key]))
                except KeyError:
                    values.append(sub_encoder(defaults[key]))
            return b"".join(values)

        return object_encode

    @classmethod
    def make_object_or_null_encode(cls, sub_schema):
        # As make_object_encode, but None encodes to the empty byte string.
        sub_encoders = {
            key: StructCodec.make_encode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        defaults = {
            key: prop["default"]
            for key, prop in sub_schema["properties"].items()
            if "default" in prop
        }

        def object_encode(obj):
            values = []
            if obj is not None:
                for key, sub_encoder in sub_encoders.items():
                    try:
                        values.append(sub_encoder(obj[key]))
                    except KeyError:
                        values.append(sub_encoder(defaults[key]))
            return b"".join(values)

        return object_encode

    @classmethod
    def make_string_encode(cls, sub_schema):
        encoding = sub_schema.get("stringEncoding", "utf-8")
        return lambda string: struct.pack(
            "<" + sub_schema["binaryFormat"], string.encode(encoding)
        )

    @classmethod
    def make_null_encode(cls, sub_schema):
        # "0x" packs zero bytes when no explicit padding format is given.
        return lambda _: struct.pack(sub_schema.get("binaryFormat", "0x"))

    @classmethod
    def make_numeric_encode(cls, sub_schema):
        return struct.Struct("<" + sub_schema["binaryFormat"]).pack

    @classmethod
    def modify_schema(cls, schema: Mapping) -> Mapping:
        # This codec requires that additional properties are
        # not allowed. Rather than get schema authors to repeat that everywhere
        # we add it here, sadly we can't do this in the metaschema as "default" isn't
        # used by the validator.
        def enforce_fixed_properties(obj):
            if type(obj) is list:
                return [enforce_fixed_properties(j) for j in obj]
            elif type(obj) is dict:
                ret = {k: enforce_fixed_properties(v) for k, v in obj.items()}
                if "object" in ret.get("type", []):
                    # Fixed: the JSON-Schema keyword is "additionalProperties";
                    # the previous snake_case "additional_properties" check
                    # could never match a real schema.
                    if ret.get("additionalProperties"):
                        raise ValueError(
                            "Struct codec does not support additionalProperties"
                        )
                    # To prevent authors having to list required properties the default
                    # is that all without a default are required.
                    if "required" not in ret:
                        ret["required"] = [
                            prop
                            for prop, sub_schema in ret.get("properties", {}).items()
                            if "default" not in sub_schema
                        ]
                    ret["additionalProperties"] = False
                return ret
            else:
                return obj

        schema = enforce_fixed_properties(schema)

        # We also give the schema an explicit ordering
        return StructCodec.order_by_index(schema)

    def __init__(self, schema: Mapping[str, Any]) -> None:
        try:
            StructCodecSchemaValidator.check_schema(schema)
        except jsonschema.exceptions.SchemaError as ve:
            raise exceptions.MetadataSchemaValidationError(str(ve)) from ve

        # Build the concrete encode/decode callables once; these shadow the
        # placeholder methods below on the instance.
        self.encode = StructCodec.make_encode(schema)
        decoder = StructCodec.make_decode(schema)
        self.decode = lambda buffer: decoder(iter(buffer))

    def encode(self, obj: Any) -> bytes:
        # Set by __init__
        pass  # pragma: nocover

    def decode(self, encoded: bytes) -> Any:
        # Set by __init__
        pass  # pragma: nocover

    def numpy_dtype(self, schema):
        """
        Convert a struct-codec schema to an equivalent NumPy structured dtype.
        Raises ValueError for constructs a dtype cannot express (pascal
        strings, variable-length arrays, top-level object/null unions).
        """
        # Mapping from struct format characters to NumPy dtype strings
        # Note: All are little-endian as enforced by the struct codec
        # This means they will be the standard size across platforms
        FORMAT_TO_DTYPE = {
            # Boolean
            "?": "?",
            # Integers
            "b": "i1",
            "B": "u1",
            "h": "i2",
            "H": "u2",
            "i": "i4",
            "I": "u4",
            "l": "i4",
            "L": "u4",
            "q": "i8",
            "Q": "u8",
            # Floats
            "f": "f4",
            "d": "f8",
            # Single character
            "c": "S1",
        }

        def _convert_binary_format(fmt):
            if fmt.endswith("x"):
                if fmt == "x":
                    return "V1"
                n = int(fmt[:-1])
                return f"V{n}"

            if fmt.endswith("s"):
                if fmt == "s":
                    return "S1"
                n = int(fmt[:-1])
                return f"S{n}"

            if fmt.endswith("p"):
                raise ValueError(
                    "Pascal string format ('p') is not supported by NumPy dtypes."
                )

            if fmt in FORMAT_TO_DTYPE:
                return FORMAT_TO_DTYPE[fmt]

            # As schemas are validated on __init__ this should never happen
            raise ValueError(f"Unsupported binary format: {fmt}")  # pragma: no cover

        def _process_schema_node(node):
            # The null type with union can only occur at the top-level
            if set(node.get("type", [])) == {"object", "null"}:
                raise ValueError("Top level object/null union not supported")
            elif node.get("type") == "object":
                fields = []
                for prop_name, prop_schema in node.get("properties", {}).items():
                    fields.append((prop_name, _process_schema_node(prop_schema)))
                return fields

            elif node.get("type") == "array":
                if "length" not in node:
                    raise ValueError(
                        "Only fixed-length arrays are supported for NumPy dtype"
                        " conversion. Variable-length arrays cannot be represented"
                        " in a structured dtype."
                    )

                length = node["length"]
                item_dtype = _process_schema_node(node["items"])

                # Return the item dtype with shape information
                return (item_dtype, (length,))

            elif node.get("type") in ("number", "integer", "boolean", "string", "null"):
                # NOTE(review): a "null" node without binaryFormat would raise
                # KeyError here — confirm such schemas cannot reach this point.
                fmt = node["binaryFormat"]
                dtype_str = _convert_binary_format(fmt)

                if dtype_str[0] not in "VSU?":
                    # Don't add endianness to void, string, unicode or bool types
                    dtype_str = "<" + dtype_str

                return dtype_str

        dtype_spec = _process_schema_node(schema)
        return np.dtype(dtype_spec)


register_metadata_codec(StructCodec, "struct")
764
+
765
+
766
def validate_bytes(data: bytes | None) -> None:
    """
    Raise ``TypeError`` unless *data* is ``bytes`` or ``None``.

    Used as the row validator when no schema (and hence no encoding) is set.
    """
    if data is None or isinstance(data, bytes):
        return
    raise TypeError(
        f"If no encoding is set metadata should be bytes, found {type(data)}"
    )
771
+
772
+
773
class MetadataSchema:
    """
    Class for validating, encoding and decoding metadata.

    :param dict schema: A dict containing a valid JSONSchema object.
    """

    def __init__(self, schema: Mapping[str, Any] | None) -> None:
        # _schema may be rewritten by the codec below; _unmodified_schema keeps
        # the caller's original for the public ``schema`` property.
        self._schema = schema
        self._unmodified_schema = schema
        self._bypass_validation = False

        if schema is None:
            # Null schema: metadata is raw bytes, passed through untouched.
            self._string = ""
            self._validate_row = validate_bytes
            self.encode_row = NOOPCodec({}).encode
            self.decode_row = NOOPCodec({}).decode
            self.empty_value = b""
            self.codec_instance = NOOPCodec({})
        else:
            try:
                TSKITMetadataSchemaValidator.check_schema(schema)
            except jsonschema.exceptions.SchemaError as ve:
                raise exceptions.MetadataSchemaValidationError(str(ve)) from ve
            try:
                codec_cls = codec_registry[schema["codec"]]
            except KeyError:
                raise exceptions.MetadataSchemaValidationError(
                    f"Unrecognised metadata codec '{schema['codec']}'. "
                    f"Valid options are {str(list(codec_registry.keys()))}."
                )
            # Codecs can modify the schema, for example to set defaults as the validator
            # does not.
            self._schema = codec_cls.modify_schema(schema)
            self.codec_instance = codec_cls(self._schema)
            self._string = tskit.canonical_json(self._schema)
            self._validate_row = TSKITMetadataSchemaValidator(self._schema).validate
            self._bypass_validation = codec_cls.is_schema_trivial(schema)
            # These instance attributes shadow the placeholder encode_row /
            # decode_row methods defined further down on the class.
            self.encode_row = self.codec_instance.encode
            self.decode_row = self.codec_instance.decode

            # If None is allowed by the schema as the top-level type, it gets used even
            # in the presence of default and required values.
            if "type" in self._schema and "null" in self._schema["type"]:
                self.empty_value = None
            else:
                self.empty_value = {}

    def __repr__(self) -> str:
        # The canonical JSON string of the (codec-modified) schema;
        # empty string for the null schema.
        return self._string

    def __str__(self) -> str:
        if isinstance(self._schema, collections.OrderedDict):
            s = pprint.pformat(dict(self._schema))
        else:
            s = pprint.pformat(self._schema)
        if "\n" in s:
            return f"tskit.MetadataSchema(\n{s}\n)"
        else:
            return f"tskit.MetadataSchema({s})"

    def __eq__(self, other) -> bool:
        # Equality is defined on the canonical string form.
        # NOTE(review): assumes ``other`` is a MetadataSchema; comparing with
        # an arbitrary object raises AttributeError rather than returning
        # NotImplemented.
        return self._string == other._string

    @property
    def schema(self) -> Mapping[str, Any] | None:
        # Return a copy to avoid unintentional mutation
        return copy.deepcopy(self._unmodified_schema)

    def asdict(self) -> Mapping[str, Any] | None:
        """
        Returns a dict representation of this schema. One possible use of this is to
        modify this dict and then pass it to the ``MetadataSchema`` constructor to create
        a similar schema.
        """
        return self.schema

    def validate_and_encode_row(self, row: Any) -> bytes:
        """
        Validate a row (dict) of metadata against this schema and return the encoded
        representation (bytes) using the codec specified in the schema.
        """
        # If the schema is permissive then validation can't fail
        if not self._bypass_validation:
            try:
                self._validate_row(row)
            except jsonschema.exceptions.ValidationError as ve:
                raise exceptions.MetadataValidationError(str(ve)) from ve
        return self.encode_row(row)

    def decode_row(self, row: bytes) -> Any:
        """
        Decode an encoded row (bytes) of metadata, using the codec specifed in the schema
        and return a python dict. Note that no validation of the metadata against the
        schema is performed.
        """
        # Set by __init__
        pass  # pragma: no cover

    def encode_row(self, row: Any) -> bytes:
        """
        Encode a row (dict) of metadata to its binary representation (bytes)
        using the codec specified in the schema. Note that unlike
        :meth:`validate_and_encode_row` no validation against the schema is performed.
        This should only be used for performance if a validation check is not needed.
        """
        # Set by __init__
        pass  # pragma: no cover

    def numpy_dtype(self) -> Any:
        # Delegate to the codec; raises NotImplementedError for codecs that
        # cannot express their schema as a structured dtype.
        return self.codec_instance.numpy_dtype(self._schema)

    def structured_array_from_buffer(self, buffer: Any) -> Any:
        """
        Convert a buffer of metadata into a structured NumPy array.
        """
        dtype = self.numpy_dtype()
        return np.frombuffer(buffer, dtype=dtype)

    @staticmethod
    def permissive_json():
        """
        The simplest, permissive JSON schema. Only specifies the JSON codec and has
        no constraints on the properties.
        """
        return MetadataSchema({"codec": "json"})

    @staticmethod
    def null():
        """
        The null schema which defines no properties and results in raw bytes
        being returned on accessing metadata column.
        """
        return MetadataSchema(None)
907
+
908
+
909
# Often many replicate tree sequences are processed with identical schemas, so cache them
@functools.lru_cache(maxsize=128)
def parse_metadata_schema(encoded_schema: str) -> MetadataSchema:
    """
    Create a schema object from its string encoding.

    :param str encoded_schema: The string encoded schema. The empty string is
        interpreted as the null schema.
    :return: A :class:`MetadataSchema` instance.
    :raises ValueError: if ``encoded_schema`` is non-empty but not valid JSON.
    """
    if encoded_schema == "":
        return MetadataSchema.null()
    try:
        # OrderedDict preserves the key order of the original document so that
        # round-tripping a schema through repr() reproduces it exactly.
        decoded = json.loads(
            encoded_schema, object_pairs_hook=collections.OrderedDict
        )
    except json.decoder.JSONDecodeError as err:
        # Chain the cause explicitly so the JSON parse error is visible.
        raise ValueError(
            f"Metadata schema is not JSON, found {encoded_schema}"
        ) from err
    return MetadataSchema(decoded)
+
931
+ class _CachedMetadata:
932
+ """
933
+ Descriptor for lazy decoding of metadata on attribute access.
934
+ """
935
+
936
+ def __get__(self, row, owner):
937
+ if row._metadata_decoder is not None:
938
+ # Some classes that use this are frozen so we need to directly setattr.
939
+ __builtins__object__setattr__(
940
+ row, "_metadata", row._metadata_decoder(row._metadata)
941
+ )
942
+ # Decoder being None indicates that metadata is decoded
943
+ __builtins__object__setattr__(row, "_metadata_decoder", None)
944
+ return row._metadata
945
+
946
+ def __set__(self, row, value):
947
+ __builtins__object__setattr__(row, "_metadata", value)
948
+
949
+
950
def lazy_decode(own_init=False):
    def _lazy_decode(cls):
        """
        Modifies a dataclass such that it lazily decodes metadata, if it is encoded.
        If the metadata passed to the constructor is encoded a `metadata_decoder`
        parameter must be also be passed.
        """
        if not own_init:
            wrapped_init = cls.__init__

            # Intercept the init to record the decoder on the instance.
            def new_init(self, *args, metadata_decoder=None, **kwargs):
                # Direct setattr as the class may be a frozen dataclass.
                object.__setattr__(
                    self, "_metadata_decoder", metadata_decoder
                )
                wrapped_init(self, *args, **kwargs)

            cls.__init__ = new_init

        # Add a descriptor to the class to decode and cache metadata
        cls.metadata = _CachedMetadata()

        # Add the slots the descriptor needs. Slots cannot be added to an
        # existing class, so rebuild the class with the extended slot list,
        # skipping the stale member descriptors of the old class.
        slots = cls.__slots__
        slots.extend(["_metadata", "_metadata_decoder"])
        dict_ = dict()
        sloted_members = dict()
        for k, v in cls.__dict__.items():
            if k not in slots:
                dict_[k] = v
            elif not isinstance(v, types.MemberDescriptorType):
                sloted_members[k] = v
        new_cls = type(cls.__name__, cls.__bases__, dict_)
        for k, v in sloted_members.items():
            setattr(new_cls, k, v)
        return new_cls

    return _lazy_decode
+
990
class MetadataProvider:
    """
    Abstract superclass of container objects that provide metadata.
    """

    def __init__(self, ll_object):
        self._ll_object = ll_object

    @property
    def metadata_schema(self) -> MetadataSchema:
        """
        The :class:`tskit.MetadataSchema` for this object.
        """
        return parse_metadata_schema(self._ll_object.metadata_schema)

    @metadata_schema.setter
    def metadata_schema(self, schema: MetadataSchema) -> None:
        # Round-trip through the string form to confirm the schema is valid
        # before handing it to the low-level object.
        as_text = repr(schema)
        parse_metadata_schema(as_text)
        self._ll_object.metadata_schema = as_text

    @property
    def metadata(self) -> Any:
        """
        The decoded metadata for this object.
        """
        return self.metadata_schema.decode_row(self.metadata_bytes)

    @metadata.setter
    def metadata(self, metadata: bytes | dict | None) -> None:
        self._ll_object.metadata = self.metadata_schema.validate_and_encode_row(
            metadata
        )

    @property
    def metadata_bytes(self) -> Any:
        """
        The raw bytes of metadata for this TableCollection
        """
        return self._ll_object.metadata

    @property
    def nbytes(self) -> int:
        # Total storage used by both the metadata and its schema string.
        ll = self._ll_object
        return len(ll.metadata) + len(ll.metadata_schema)

    def assert_equals(self, other: MetadataProvider):
        # Compare schemas first so a schema mismatch is reported in
        # preference to a (likely consequent) metadata mismatch.
        mine, theirs = self.metadata_schema, other.metadata_schema
        if mine != theirs:
            raise AssertionError(
                f"Metadata schemas differ: self={mine} other={theirs}"
            )
        if self.metadata != other.metadata:
            raise AssertionError(
                f"Metadata differs: self={self.metadata} other={other.metadata}"
            )
+
1047
# Unique sentinel distinguishing "no default supplied" from any real value
# (including None); compare against it with ``is``.
NOTSET = object() # Sentinel for unset default values
+
1050
class TableMetadataReader:
    # Mixin for table classes that expose decoded metadata

    @property
    def metadata_schema(self) -> MetadataSchema:
        """
        The :class:`tskit.MetadataSchema` for this table.
        """
        # This isn't as inefficient as it looks because we're using an LRU cache on
        # the parse_metadata_schema function. Thus, we're really only incurring the
        # cost of creating the unicode string from the low-level schema and looking
        # up the functools cache.
        return parse_metadata_schema(self.ll_table.metadata_schema)

    def metadata_vector(self, key, *, dtype=None, default_value=NOTSET):
        """
        Returns a numpy array of metadata values obtained by extracting ``key``
        from each metadata entry, and using ``default_value`` if the key is
        not present. ``key`` may be a list, in which case nested values are returned.
        For instance, ``key = ["a", "x"]`` will return an array of
        ``row.metadata["a"]["x"]`` values, iterated over rows in this table.

        :param str key: The name, or a list of names, of metadata entries.
        :param str dtype: The dtype of the result (can usually be omitted).
        :param object default_value: The value to be inserted if the metadata key
            is not present. Note that for numeric columns, a default value of None
            will result in a non-numeric array. The default behaviour is to raise
            ``KeyError`` on missing entries.
        """
        # Identity comparison: the sentinel must be tested with ``is``, as
        # ``==`` would invoke an arbitrary __eq__ on default_value (e.g. a
        # numpy array default would make the truth test raise).
        if default_value is NOTSET:

            def getter(d, k):
                return d[k]

        else:

            def getter(d, k):
                # Non-mapping intermediate values (e.g. a missing nested dict)
                # also fall back to the default.
                return (
                    d.get(k, default_value) if isinstance(d, Mapping) else default_value
                )

        if isinstance(key, list):
            # Walk the nested keys with reduce, starting from each row's metadata.
            out = np.array(
                [functools.reduce(getter, key, row.metadata) for row in self],
                dtype=dtype,
            )
        else:
            out = np.array(
                [getter(row.metadata, key) for row in self],
                dtype=dtype,
            )
        return out

    def _make_row(self, *args):
        # Rows decode metadata lazily via the table's schema.
        return self.row_class(*args, metadata_decoder=self.metadata_schema.decode_row)
+
1109
class TableMetadataWriter(TableMetadataReader):
    # Mixin for table classes that can also modify metadata

    @TableMetadataReader.metadata_schema.setter
    def metadata_schema(self, schema: MetadataSchema) -> None:
        # Reject anything that is not an actual schema object up front.
        if not isinstance(schema, MetadataSchema):
            raise TypeError(
                "Only instances of tskit.MetadataSchema can be assigned to "
                f"metadata_schema, not {type(schema)}"
            )
        self.ll_table.metadata_schema = repr(schema)

    def packset_metadata(self, metadatas):
        """
        Packs the specified list of metadata values and updates the ``metadata``
        and ``metadata_offset`` columns. The length of the metadatas array
        must be equal to the number of rows in the table.

        :param list metadatas: A list of metadata bytes values.
        """
        packed_column, offset_column = util.pack_bytes(metadatas)
        columns = self.asdict()
        columns["metadata"] = packed_column
        columns["metadata_offset"] = offset_column
        self.set_columns(**columns)

    def drop_metadata(self, *, keep_schema=False):
        """
        Drops all metadata in this table. By default, the schema is also cleared,
        except if ``keep_schema`` is True.

        :param bool keep_schema: True if the current schema should be kept intact.
        """
        columns = self.asdict()
        columns["metadata"] = []
        # Zero offsets give every row an empty metadata entry.
        columns["metadata_offset"][:] = 0
        self.set_columns(**columns)
        if not keep_schema:
            self.metadata_schema = MetadataSchema.null()