tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _tskit.cpython-314-darwin.so +0 -0
- tskit/__init__.py +92 -0
- tskit/__main__.py +4 -0
- tskit/_version.py +4 -0
- tskit/cli.py +273 -0
- tskit/combinatorics.py +1522 -0
- tskit/drawing.py +2809 -0
- tskit/exceptions.py +70 -0
- tskit/genotypes.py +410 -0
- tskit/intervals.py +601 -0
- tskit/jit/__init__.py +0 -0
- tskit/jit/numba.py +674 -0
- tskit/metadata.py +1147 -0
- tskit/provenance.py +150 -0
- tskit/provenance.schema.json +72 -0
- tskit/stats.py +165 -0
- tskit/tables.py +4858 -0
- tskit/text_formats.py +456 -0
- tskit/trees.py +11457 -0
- tskit/util.py +901 -0
- tskit/vcf.py +219 -0
- tskit-1.0.1.dist-info/METADATA +105 -0
- tskit-1.0.1.dist-info/RECORD +27 -0
- tskit-1.0.1.dist-info/WHEEL +5 -0
- tskit-1.0.1.dist-info/entry_points.txt +2 -0
- tskit-1.0.1.dist-info/licenses/LICENSE +21 -0
- tskit-1.0.1.dist-info/top_level.txt +2 -0
tskit/metadata.py
ADDED
|
@@ -0,0 +1,1147 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
#
|
|
3
|
+
# Copyright (c) 2020-2025 Tskit Developers
|
|
4
|
+
#
|
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
# furnished to do so, subject to the following conditions:
|
|
11
|
+
#
|
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
# copies or substantial portions of the Software.
|
|
14
|
+
#
|
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
# SOFTWARE.
|
|
22
|
+
"""
|
|
23
|
+
Classes for metadata decoding, encoding and validation
|
|
24
|
+
"""
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import abc
|
|
28
|
+
import builtins
|
|
29
|
+
import collections
|
|
30
|
+
import copy
|
|
31
|
+
import functools
|
|
32
|
+
import json
|
|
33
|
+
import pprint
|
|
34
|
+
import struct
|
|
35
|
+
import types
|
|
36
|
+
from collections.abc import Mapping
|
|
37
|
+
from itertools import islice
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
import jsonschema
|
|
41
|
+
import numpy as np
|
|
42
|
+
|
|
43
|
+
import tskit
|
|
44
|
+
import tskit.exceptions as exceptions
|
|
45
|
+
import tskit.util as util
|
|
46
|
+
|
|
47
|
+
__builtins__object__setattr__ = builtins.object.__setattr__
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def replace_root_refs(obj):
    """
    Recursively rewrite JSON-schema references to the document root.

    Any ``{"$ref": "#"}`` found in a (possibly nested) dict is changed to
    point at ``#/definitions/root`` instead; lists and dicts are rebuilt,
    while every other value is returned untouched. Note the deliberate
    exact-type checks: subclasses of ``dict``/``list`` are not descended into.
    """
    if type(obj) is dict:
        rewritten = {key: replace_root_refs(value) for key, value in obj.items()}
        if rewritten.get("$ref") == "#":
            rewritten["$ref"] = "#/definitions/root"
        return rewritten
    if type(obj) is list:
        return [replace_root_refs(item) for item in obj]
    return obj
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Our schema is the Draft7Validator schema with added codec information.
TSKITMetadataSchemaValidator = jsonschema.validators.extend(
    jsonschema.validators.Draft7Validator
)
deref_meta_schema: Mapping[str, Any] = copy.deepcopy(
    TSKITMetadataSchemaValidator.META_SCHEMA
)
# We need a top-level only required property so we need to rewrite any reference
# to the top-level schema to a copy in a definition.
deref_meta_schema = replace_root_refs(deref_meta_schema)
deref_meta_schema["definitions"]["root"] = copy.deepcopy(deref_meta_schema)
# The codec identifier must be declared under "properties" for the meta-schema
# to actually enforce its type: a bare top-level "codec" key is an unknown
# keyword that jsonschema silently ignores.
deref_meta_schema["properties"]["codec"] = {"type": "string"}
deref_meta_schema["required"] = ["codec"]
# For interoperability reasons, force the top-level to be an object or union
# of object and null
deref_meta_schema["properties"]["type"] = {"enum": ["object", ["object", "null"]]}
# Change the schema URL to avoid jsonschema's cache; was garbled to
# "draft-o=07" in the original — the draft-07 URI with a distinct fragment.
deref_meta_schema["$schema"] = "http://json-schema.org/draft-07/schema#tskit"
TSKITMetadataSchemaValidator.META_SCHEMA = deref_meta_schema
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class AbstractMetadataCodec(metaclass=abc.ABCMeta):
    """
    Base class that every metadata codec must implement.

    Concrete codecs provide ``encode``/``decode``; the classmethod hooks
    below have pass-through defaults that subclasses may override.
    """

    @classmethod
    def modify_schema(self, schema: Mapping) -> Mapping:
        # Hook for codecs that need to rewrite the user-supplied schema
        # before use; the default leaves it untouched.
        return schema

    @classmethod
    def is_schema_trivial(self, schema: Mapping) -> bool:
        # Codecs may report True to indicate validation can be skipped.
        return False

    def __init__(self, schema: Mapping[str, Any]) -> None:
        raise NotImplementedError  # pragma: no cover

    @abc.abstractmethod
    def encode(self, obj: Any) -> bytes:
        raise NotImplementedError  # pragma: no cover

    @abc.abstractmethod
    def decode(self, encoded: bytes) -> Any:
        raise NotImplementedError  # pragma: no cover

    def numpy_dtype(self, schema) -> Any:
        # Optional: codecs that can map their schema to a structured numpy
        # dtype override this.
        raise NotImplementedError
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Global registry mapping codec identifiers (as used in schemas) to classes.
codec_registry = {}


def register_metadata_codec(
    codec_cls: type[AbstractMetadataCodec], codec_id: str
) -> None:
    """
    Register a metadata codec class.

    This function maintains a mapping from metadata codec identifiers used in schemas
    to codec classes. When a codec class is registered, it will replace any class
    previously registered under the same codec identifier, if present.

    :param str codec_id: String to use to refer to the codec in the schema.
    """
    codec_registry[codec_id] = codec_cls
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class JSONCodec(AbstractMetadataCodec):
    """Codec that stores metadata as canonical JSON text."""

    def default_validator(validator, types, instance, schema):
        # For the JSON codec "default" is only honoured at the top level of
        # the schema, so reject any default found one level deeper.
        if not validator.is_type(instance, "object"):
            return
        for prop in instance.get("properties", {}).values():
            for nested in prop.get("properties", {}).values():
                if "default" in nested:
                    yield jsonschema.ValidationError(
                        "Defaults can only be specified at the top level"
                        " for JSON codec"
                    )

    schema_validator = jsonschema.validators.extend(
        TSKITMetadataSchemaValidator, {"default": default_validator}
    )

    @classmethod
    def is_schema_trivial(self, schema: Mapping) -> bool:
        # A schema declaring no properties places no constraints on rows.
        return len(schema.get("properties", {})) == 0

    def __init__(self, schema: Mapping[str, Any]) -> None:
        try:
            self.schema_validator.check_schema(schema)
        except jsonschema.exceptions.SchemaError as ve:
            raise exceptions.MetadataSchemaValidationError(str(ve)) from ve

        # Top-level default values, filled in for missing keys on decode.
        self.defaults = {
            key: prop["default"]
            for key, prop in schema.get("properties", {}).items()
            if "default" in prop
        }

    def encode(self, obj: Any) -> bytes:
        """Serialise ``obj`` to canonical JSON, encoded as bytes."""
        try:
            return tskit.canonical_json(obj).encode()
        except TypeError as e:
            # json's TypeError message names the offending type as its
            # fourth word ("Object of type X is not JSON serializable").
            raise exceptions.MetadataEncodingError(
                f"Could not encode metadata of type {str(e).split()[3]}"
            )

    def decode(self, encoded: bytes) -> Any:
        """Parse JSON bytes; an empty buffer decodes as an empty object."""
        result = json.loads(encoded.decode()) if len(encoded) > 0 else {}
        # Merge in top-level defaults; explicitly-present keys win.
        if isinstance(result, dict):
            return dict(self.defaults, **result)
        return result
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Make the JSON codec available to schemas under the identifier "json".
register_metadata_codec(JSONCodec, "json")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class NOOPCodec(AbstractMetadataCodec):
    """Pass-through codec: metadata is stored and returned as raw bytes."""

    def __init__(self, schema: Mapping[str, Any]) -> None:
        # No schema information is needed to pass bytes through unchanged.
        pass

    def encode(self, data: bytes) -> bytes:
        # Identity: callers supply the final on-disk bytes themselves.
        return data

    def decode(self, data: bytes) -> bytes:
        # Identity: the raw stored bytes are handed back to the caller.
        return data
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def binary_format_validator(validator, types, instance, schema):
    """
    Extended "type" keyword validator for the struct codec meta-schema.

    Runs jsonschema's standard type validation, then additionally requires
    that every non-composite property declares a ``binaryFormat`` and that a
    ``null``-typed property only uses a padding format.
    """
    # We're hooking into jsonschemas validation code here, which works by creating
    # generators of exceptions, hence the yielding

    # Make sure the normal type validation gets done
    try:
        yield from jsonschema._validators.type(validator, types, instance, schema)
    except AttributeError:
        # Needed since jsonschema==4.19.1 (the private module was renamed
        # from _validators to _keywords).
        yield from jsonschema._keywords.type(validator, types, instance, schema)

    # Non-composite types must have a binaryFormat
    if validator.is_type(instance, "object"):
        for v in instance.values():
            if (
                isinstance(v, dict)
                and v.get("type")
                not in (None, "object", "array", "null", ["object", "null"])
                and "binaryFormat" not in v
            ):
                yield jsonschema.ValidationError(
                    f"{v['type']} type must have binaryFormat set"
                )
    # null type must be padding, i.e. a struct format ending in "x"
    if (
        validator.is_type(instance, "object")
        and "null" in instance
        and instance["null"].get("type") == "null"
        and "binaryFormat" in instance["null"]
        and instance["null"]["binaryFormat"][-1] != "x"
    ):
        yield jsonschema.ValidationError(
            'null type binaryFormat must be padding ("x") if set'
        )
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def array_length_validator(validator, types, instance, schema):
    """
    Extended "properties" keyword validator for the struct codec meta-schema.

    After the standard properties validation, checks array-typed properties:
    ``length`` (fixed-size) is mutually exclusive with both
    ``noLengthEncodingExhaustBuffer`` and ``arrayLengthFormat``, and must be
    non-negative.
    """
    # Validate that array schema doesn't have both length and
    # noLengthEncodingExhaustBuffer set. Also ensure that arrayLengthFormat
    # is not set when length is set.

    # Call the normal properties validator first
    try:
        yield from jsonschema._validators.properties(validator, types, instance, schema)
    except AttributeError:
        # Needed since jsonschema==4.19.1 (private module renamed to _keywords)
        yield from jsonschema._keywords.properties(validator, types, instance, schema)
    for prop, sub_schema in instance["properties"].items():
        if sub_schema.get("type") == "array":
            has_length = "length" in sub_schema
            has_exhaust = sub_schema.get("noLengthEncodingExhaustBuffer", False)

            if has_length and has_exhaust:
                yield jsonschema.ValidationError(
                    f"{prop} array cannot have both 'length' and "
                    "'noLengthEncodingExhaustBuffer' set"
                )

            # A fixed-length array stores no length prefix, so a length
            # format would be meaningless.
            if has_length and "arrayLengthFormat" in sub_schema:
                yield jsonschema.ValidationError(
                    f"{prop} fixed-length array should not specify 'arrayLengthFormat'"
                )

            if has_length and sub_schema["length"] < 0:
                yield jsonschema.ValidationError(
                    f"{prop} array length must be non-negative, got "
                    f"{sub_schema['length']}"
                )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def required_validator(validator, required, instance, schema):
    """
    Extended "required" keyword validator for the struct codec meta-schema.

    Runs the standard required-property validation, then enforces the struct
    codec rule that every optional property must declare a default value
    (since a struct layout has no way to omit a field).
    """
    # Do the normal validation
    try:
        yield from jsonschema._validators.required(
            validator, required, instance, schema
        )
    except AttributeError:
        # Needed since jsonschema==4.19.1 (private module renamed to _keywords)
        yield from jsonschema._keywords.required(validator, required, instance, schema)

    # For struct codec if a property is not required, then it must have a default
    for prop, sub_schema in instance["properties"].items():
        if prop not in instance["required"] and "default" not in sub_schema:
            yield jsonschema.ValidationError(
                f"Optional property '{prop}' must have" f" a default value"
            )
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# Meta-schema validator for the struct codec: the base tskit validator plus
# the struct-specific keyword hooks defined above.
StructCodecSchemaValidator = jsonschema.validators.extend(
    TSKITMetadataSchemaValidator,
    {
        "type": binary_format_validator,
        "required": required_validator,
        "properties": array_length_validator,
    },
)
struct_meta_schema: Mapping[str, Any] = copy.deepcopy(
    StructCodecSchemaValidator.META_SCHEMA
)
# Every struct-codec keyword has to be declared in two places: the top-level
# meta-schema and the self-referential "root" definition used for nested
# schemas. Alias both property maps and mirror in one pass.
_top = struct_meta_schema["properties"]
_root = struct_meta_schema["definitions"]["root"]["properties"]

# No union types (except the top-level object/null handled elsewhere).
_root["type"] = {"$ref": "#/definitions/simpleTypes"}
# No hetrogeneous arrays: "items" is a single schema, not a list of schemas.
_top["items"] = {"$ref": "#/definitions/root"}
# binaryFormat must be a single struct format character, or a count followed
# by s/p/x.
_top["binaryFormat"] = {
    "type": "string",
    "pattern": r"^([cbB\?hHiIlLqQfd]|\d*[spx])$",
}
# arrayLengthFormat is an unsigned struct integer format, defaulting to "L".
_top["arrayLengthFormat"] = {
    "type": "string",
    "pattern": r"^[BHILQ]$",
    "default": "L",
}
# index gives an explicit field ordering for the binary layout.
_top["index"] = {"type": "number"}
# stringEncoding names the text codec used for string fields.
_top["stringEncoding"] = {
    "type": "string",
    "default": "utf-8",
}
_top["nullTerminated"] = {"type": "boolean"}
_top["noLengthEncodingExhaustBuffer"] = {"type": "boolean"}
# length is the element count of a fixed-length array.
_top["length"] = {"type": "integer"}

# Mirror the struct keywords into the root definition so they are accepted
# at any nesting depth (shared by reference, as in the original layout).
for _key in (
    "items",
    "binaryFormat",
    "arrayLengthFormat",
    "index",
    "stringEncoding",
    "nullTerminated",
    "noLengthEncodingExhaustBuffer",
    "length",
):
    _root[_key] = _top[_key]

StructCodecSchemaValidator.META_SCHEMA = struct_meta_schema
del _top, _root, _key
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
class StructCodec(AbstractMetadataCodec):
    """
    Codec that encodes data using struct. Note that this codec has extra
    restrictions, namely that object keys must be fixed (all present and no
    extra); each entry should have a binaryFormat; that arrays are homogeneous
    and that types are not unions.

    Encoding and decoding work by compiling the schema into a tree of
    closures (one per schema node) at ``__init__`` time; decoders consume
    bytes from a shared iterator via ``islice`` so nested decoders naturally
    read consecutive fields.
    """

    @classmethod
    def order_by_index(cls, obj, do_sort=False):
        """
        Take a schema and recursively convert any dict that is under the key
        name ``properties`` to an OrderedDict, ordered by the "index" key
        (ties broken alphabetically) so the binary field layout is explicit.
        """
        if isinstance(obj, collections.abc.Mapping):
            items = obj.items()
            if do_sort:
                # Python sort is stable so we can do the sorts in reverse priority
                items = sorted(items, key=lambda k_v: k_v[0])
                items = sorted(items, key=lambda k_v: k_v[1].get("index", 0))
            # Only the children of a "properties" dict get sorted.
            items = ((k, cls.order_by_index(v, k == "properties")) for k, v in items)
            if do_sort:
                return collections.OrderedDict(items)
            else:
                return dict(items)
        elif isinstance(obj, list) or isinstance(obj, tuple):
            return [cls.order_by_index(v, False) for v in obj]
        else:
            return obj

    @classmethod
    def make_decode(cls, sub_schema):
        """
        Create a function that can decode objects of this schema
        """
        # The object/null union is a special case; otherwise dispatch on the
        # single JSON type name.
        if set(sub_schema["type"]) == {"object", "null"}:
            return StructCodec.make_object_or_null_decode(sub_schema)
        else:
            return {
                "array": StructCodec.make_array_decode,
                "object": StructCodec.make_object_decode,
                "string": StructCodec.make_string_decode,
                "null": StructCodec.make_null_decode,
                "number": StructCodec.make_numeric_decode,
                "integer": StructCodec.make_numeric_decode,
                "boolean": StructCodec.make_numeric_decode,
            }[sub_schema["type"]](sub_schema)

    @classmethod
    def make_array_decode(cls, sub_schema):
        """
        Build a decoder for an array node: fixed length, length-prefixed, or
        buffer-exhausting, depending on the schema.
        """
        element_decoder = StructCodec.make_decode(sub_schema["items"])
        fixed_length = sub_schema.get("length")
        # Always little-endian, as for all struct formats in this codec.
        array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L")
        array_length_size = struct.calcsize(array_length_f)
        exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False)

        def array_decode(buffer):
            # Length-prefixed: read the element count, then that many elements.
            array_length = struct.unpack(
                array_length_f, bytes(islice(buffer, array_length_size))
            )[0]
            return [element_decoder(buffer) for _ in range(array_length)]

        def array_decode_exhaust(buffer):
            # No stored length: keep decoding elements until struct reports
            # the buffer has run out.
            ret = []
            while True:
                try:
                    ret.append(element_decoder(buffer))
                except struct.error as e:
                    if "unpack requires a buffer" in str(e):
                        break
                    else:
                        raise e
            return ret

        def array_decode_fixed_length(buffer):
            return [element_decoder(buffer) for _ in range(fixed_length)]

        if fixed_length is not None:
            return array_decode_fixed_length
        elif exhaust_buffer:
            return array_decode_exhaust
        else:
            return array_decode

    @classmethod
    def make_object_decode(cls, sub_schema):
        # Decode each property in schema order from the shared byte iterator.
        sub_decoders = {
            key: StructCodec.make_decode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        return lambda buffer: {
            key: sub_decoder(buffer) for key, sub_decoder in sub_decoders.items()
        }

    @classmethod
    def make_object_or_null_decode(cls, sub_schema):
        """
        Build a decoder for the top-level object/null union: an empty buffer
        decodes to ``None``, anything else decodes as an object.
        """
        sub_decoders = {
            key: StructCodec.make_decode(prop)
            for key, prop in sub_schema["properties"].items()
        }

        def decode_object_or_null(buffer):
            # We have to check the buffer length for null, as the islices in
            # sub-decoders won't raise StopIteration
            buffer = list(buffer)
            if len(buffer) == 0:
                return None
            else:
                buffer = iter(buffer)
                return {
                    key: sub_decoder(buffer)
                    for key, sub_decoder in sub_decoders.items()
                }

        return decode_object_or_null

    @classmethod
    def make_string_decode(cls, sub_schema):
        """
        Build a decoder for a string field, optionally truncating at the
        first NUL byte when ``nullTerminated`` is set.
        """
        f = "<" + sub_schema["binaryFormat"]
        size = struct.calcsize(f)
        encoding = sub_schema.get("stringEncoding", "utf-8")
        null_terminated = sub_schema.get("nullTerminated", False)
        if not null_terminated:
            return lambda buffer: struct.unpack(f, bytes(islice(buffer, size)))[
                0
            ].decode(encoding)
        else:

            def decode_string(buffer):
                s = struct.unpack(f, bytes(islice(buffer, size)))[0].decode(encoding)
                # Truncate at the first NUL; return the whole field if none.
                i = s.find("\x00")
                if i == -1:
                    return s
                return s[:i]

            return decode_string

    @classmethod
    def make_null_decode(cls, sub_schema):
        """
        Build a decoder for a null field: consume any padding bytes declared
        by binaryFormat, always yielding ``None``.
        """
        if sub_schema.get("binaryFormat") is not None:
            f = sub_schema["binaryFormat"]
            size = struct.calcsize(f)

            def padding_decode(buffer):
                # Consume (and discard) the padding; unpack returns ().
                struct.unpack(f, bytes(islice(buffer, size)))

            return padding_decode
        else:
            return lambda _: None

    @classmethod
    def make_numeric_decode(cls, sub_schema):
        # Numbers, integers and booleans all decode as a single struct value.
        f = "<" + sub_schema["binaryFormat"]
        size = struct.calcsize(f)
        return lambda buffer: struct.unpack(f, bytes(islice(buffer, size)))[0]

    @classmethod
    def make_encode(cls, sub_schema):
        """
        Create a function that can encode objects of this schema
        """
        if set(sub_schema["type"]) == {"object", "null"}:
            return StructCodec.make_object_or_null_encode(sub_schema)
        else:
            return {
                "array": StructCodec.make_array_encode,
                "object": StructCodec.make_object_encode,
                "string": StructCodec.make_string_encode,
                "null": StructCodec.make_null_encode,
                "number": StructCodec.make_numeric_encode,
                "integer": StructCodec.make_numeric_encode,
                "boolean": StructCodec.make_numeric_encode,
            }[sub_schema["type"]](sub_schema)

    @classmethod
    def make_array_encode(cls, sub_schema):
        """
        Build an encoder for an array node, mirroring the three decoder
        strategies: fixed length, buffer-exhausting, or length-prefixed.
        """
        element_encoder = StructCodec.make_encode(sub_schema["items"])
        fixed_length = sub_schema.get("length")
        array_length_f = "<" + sub_schema.get("arrayLengthFormat", "L")
        exhaust_buffer = sub_schema.get("noLengthEncodingExhaustBuffer", False)

        def array_encode_fixed_length(array):
            if len(array) != fixed_length:
                raise ValueError(
                    f"Array length {len(array)} does not match schema"
                    f" fixed length {fixed_length}"
                )
            return b"".join(element_encoder(ele) for ele in array)

        def array_encode_exhaust(array):
            return b"".join(element_encoder(ele) for ele in array)

        def array_encode_with_length(array):
            try:
                packed_length = struct.pack(array_length_f, len(array))
            except struct.error:
                raise ValueError(
                    "Couldn't pack array size - it is likely too long"
                    " for the specified arrayLengthFormat"
                )
            return packed_length + b"".join(element_encoder(ele) for ele in array)

        if fixed_length is not None:
            return array_encode_fixed_length
        elif exhaust_buffer:
            return array_encode_exhaust
        else:
            return array_encode_with_length

    @classmethod
    def make_object_encode(cls, sub_schema):
        """
        Build an encoder for an object node; missing keys fall back to the
        schema-declared defaults.
        """
        sub_encoders = {
            key: StructCodec.make_encode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        defaults = {
            key: prop["default"]
            for key, prop in sub_schema["properties"].items()
            if "default" in prop
        }

        def object_encode(obj):
            values = []
            for key, sub_encoder in sub_encoders.items():
                try:
                    values.append(sub_encoder(obj[key]))
                except KeyError:
                    # Key absent from the instance: encode its default.
                    values.append(sub_encoder(defaults[key]))
            return b"".join(values)

        return object_encode

    @classmethod
    def make_object_or_null_encode(cls, sub_schema):
        """
        Build an encoder for the top-level object/null union; ``None``
        encodes to zero bytes (matching the empty-buffer decode).
        """
        sub_encoders = {
            key: StructCodec.make_encode(prop)
            for key, prop in sub_schema["properties"].items()
        }
        defaults = {
            key: prop["default"]
            for key, prop in sub_schema["properties"].items()
            if "default" in prop
        }

        def object_encode(obj):
            values = []
            if obj is not None:
                for key, sub_encoder in sub_encoders.items():
                    try:
                        values.append(sub_encoder(obj[key]))
                    except KeyError:
                        values.append(sub_encoder(defaults[key]))
            return b"".join(values)

        return object_encode

    @classmethod
    def make_string_encode(cls, sub_schema):
        encoding = sub_schema.get("stringEncoding", "utf-8")
        return lambda string: struct.pack(
            "<" + sub_schema["binaryFormat"], string.encode(encoding)
        )

    @classmethod
    def make_null_encode(cls, sub_schema):
        # With no binaryFormat, "0x" packs zero padding bytes.
        return lambda _: struct.pack(sub_schema.get("binaryFormat", "0x"))

    @classmethod
    def make_numeric_encode(cls, sub_schema):
        return struct.Struct("<" + sub_schema["binaryFormat"]).pack

    @classmethod
    def modify_schema(cls, schema: Mapping) -> Mapping:
        """
        Rewrite the user schema for struct use: forbid additional properties,
        default-require all properties without a default, and give the schema
        an explicit field ordering.
        """
        # This codec requires that additional properties are
        # not allowed. Rather than get schema authors to repeat that everywhere
        # we add it here, sadly we can't do this in the metaschema as "default" isn't
        # used by the validator.
        def enforce_fixed_properties(obj):
            if type(obj) is list:
                return [enforce_fixed_properties(j) for j in obj]
            elif type(obj) is dict:
                ret = {k: enforce_fixed_properties(v) for k, v in obj.items()}
                if "object" in ret.get("type", []):
                    if ret.get("additional_properties"):
                        raise ValueError(
                            "Struct codec does not support additional_properties"
                        )
                    # To prevent authors having to list required properties the default
                    # is that all without a default are required.
                    if "required" not in ret:
                        ret["required"] = [
                            prop
                            for prop, sub_schema in ret.get("properties", {}).items()
                            if "default" not in sub_schema
                        ]
                    ret["additionalProperties"] = False
                return ret
            else:
                return obj

        schema = enforce_fixed_properties(schema)

        # We also give the schema an explicit ordering
        return StructCodec.order_by_index(schema)

    def __init__(self, schema: Mapping[str, Any]) -> None:
        try:
            StructCodecSchemaValidator.check_schema(schema)
        except jsonschema.exceptions.SchemaError as ve:
            raise exceptions.MetadataSchemaValidationError(str(ve)) from ve

        # Compile the schema into encode/decode closures; these instance
        # attributes shadow the placeholder methods below.
        self.encode = StructCodec.make_encode(schema)
        decoder = StructCodec.make_decode(schema)
        self.decode = lambda buffer: decoder(iter(buffer))

    def encode(self, obj: Any) -> bytes:
        # Set by __init__
        pass  # pragma: nocover

    def decode(self, encoded: bytes) -> Any:
        # Set by __init__
        pass  # pragma: nocover

    def numpy_dtype(self, schema):
        """
        Convert a struct-codec schema to an equivalent numpy structured
        dtype. Raises ``ValueError`` for constructs numpy cannot represent
        (variable-length arrays, pascal strings, the object/null union).
        """
        # Mapping from struct format characters to NumPy dtype strings
        # Note: All are little-endian as enforced by the struct codec
        # This means they will be the standard size across platforms
        FORMAT_TO_DTYPE = {
            # Boolean
            "?": "?",
            # Integers
            "b": "i1",
            "B": "u1",
            "h": "i2",
            "H": "u2",
            "i": "i4",
            "I": "u4",
            "l": "i4",
            "L": "u4",
            "q": "i8",
            "Q": "u8",
            # Floats
            "f": "f4",
            "d": "f8",
            # Single character
            "c": "S1",
        }

        def _convert_binary_format(fmt):
            # Padding ("Nx") becomes a numpy void field of N bytes.
            if fmt.endswith("x"):
                if fmt == "x":
                    return "V1"
                n = int(fmt[:-1])
                return f"V{n}"

            # Fixed-size byte strings ("Ns") become "SN".
            if fmt.endswith("s"):
                if fmt == "s":
                    return "S1"
                n = int(fmt[:-1])
                return f"S{n}"

            if fmt.endswith("p"):
                raise ValueError(
                    "Pascal string format ('p') is not supported by NumPy dtypes."
                )

            if fmt in FORMAT_TO_DTYPE:
                return FORMAT_TO_DTYPE[fmt]

            # As schemas are validated on __init__ this should never happen
            raise ValueError(f"Unsupported binary format: {fmt}")  # pragma: no cover

        def _process_schema_node(node):
            # The null type with union can only occur at the top-level
            if set(node.get("type", [])) == {"object", "null"}:
                raise ValueError("Top level object/null union not supported")
            elif node.get("type") == "object":
                # Objects become lists of (name, dtype) fields.
                fields = []
                for prop_name, prop_schema in node.get("properties", {}).items():
                    fields.append((prop_name, _process_schema_node(prop_schema)))
                return fields

            elif node.get("type") == "array":
                if "length" not in node:
                    raise ValueError(
                        "Only fixed-length arrays are supported for NumPy dtype"
                        " conversion. Variable-length arrays cannot be represented"
                        " in a structured dtype."
                    )

                length = node["length"]
                item_dtype = _process_schema_node(node["items"])

                # Return the item dtype with shape information
                return (item_dtype, (length,))

            elif node.get("type") in ("number", "integer", "boolean", "string", "null"):
                fmt = node["binaryFormat"]
                dtype_str = _convert_binary_format(fmt)

                if dtype_str[0] not in "VSU?":
                    # Don't add endianness to void, string, unicode or bool types
                    dtype_str = "<" + dtype_str

                return dtype_str

        dtype_spec = _process_schema_node(schema)
        return np.dtype(dtype_spec)
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
# Make the struct codec available to schemas under the identifier "struct".
register_metadata_codec(StructCodec, "struct")
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def validate_bytes(data: bytes | None) -> None:
    """
    Check that schema-less (raw) metadata is ``bytes`` or ``None``.

    :param data: The metadata value to validate.
    :raises TypeError: If ``data`` is neither ``None`` nor ``bytes``.
    """
    # Guard-clause form: accept the two valid cases up front.
    if data is None or isinstance(data, bytes):
        return
    raise TypeError(
        f"If no encoding is set metadata should be bytes, found {type(data)}"
    )
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
class MetadataSchema:
    """
    Class for validating, encoding and decoding metadata.

    :param dict schema: A dict containing a valid JSONSchema object.
    """

    def __init__(self, schema: Mapping[str, Any] | None) -> None:
        self._schema = schema
        # Keep the caller's original schema so `.schema` round-trips exactly,
        # even though codecs may modify the working copy below.
        self._unmodified_schema = schema
        self._bypass_validation = False

        if schema is None:
            # Null schema: metadata is raw bytes, passed through untouched.
            self._string = ""
            self._validate_row = validate_bytes
            self.encode_row = NOOPCodec({}).encode
            self.decode_row = NOOPCodec({}).decode
            self.empty_value = b""
            self.codec_instance = NOOPCodec({})
        else:
            try:
                TSKITMetadataSchemaValidator.check_schema(schema)
            except jsonschema.exceptions.SchemaError as ve:
                raise exceptions.MetadataSchemaValidationError(str(ve)) from ve
            try:
                codec_cls = codec_registry[schema["codec"]]
            except KeyError:
                raise exceptions.MetadataSchemaValidationError(
                    f"Unrecognised metadata codec '{schema['codec']}'. "
                    f"Valid options are {str(list(codec_registry.keys()))}."
                )
            # Codecs can modify the schema, for example to set defaults as the validator
            # does not.
            self._schema = codec_cls.modify_schema(schema)
            self.codec_instance = codec_cls(self._schema)
            self._string = tskit.canonical_json(self._schema)
            self._validate_row = TSKITMetadataSchemaValidator(self._schema).validate
            # Trivial schemas (e.g. bare {"codec": "json"}) can never fail
            # validation, so skip it for speed in validate_and_encode_row.
            self._bypass_validation = codec_cls.is_schema_trivial(schema)
            # These instance attributes shadow the placeholder methods below.
            self.encode_row = self.codec_instance.encode
            self.decode_row = self.codec_instance.decode

            # If None is allowed by the schema as the top-level type, it gets used even
            # in the presence of default and required values.
            if "type" in self._schema and "null" in self._schema["type"]:
                self.empty_value = None
            else:
                self.empty_value = {}

    def __repr__(self) -> str:
        # The canonical JSON string; "" for the null schema.
        return self._string

    def __str__(self) -> str:
        if isinstance(self._schema, collections.OrderedDict):
            s = pprint.pformat(dict(self._schema))
        else:
            s = pprint.pformat(self._schema)
        if "\n" in s:
            return f"tskit.MetadataSchema(\n{s}\n)"
        else:
            return f"tskit.MetadataSchema({s})"

    def __eq__(self, other) -> bool:
        # FIX: the previous implementation dereferenced other._string
        # unconditionally, so comparing against any non-MetadataSchema object
        # (e.g. `schema == None`) raised AttributeError. Returning
        # NotImplemented lets Python fall back to its default comparison.
        if not isinstance(other, MetadataSchema):
            return NotImplemented
        # Canonical JSON encodings are equal iff the schemas are equivalent.
        return self._string == other._string

    @property
    def schema(self) -> Mapping[str, Any] | None:
        # Return a copy to avoid unintentional mutation
        return copy.deepcopy(self._unmodified_schema)

    def asdict(self) -> Mapping[str, Any] | None:
        """
        Returns a dict representation of this schema. One possible use of this is to
        modify this dict and then pass it to the ``MetadataSchema`` constructor to create
        a similar schema.
        """
        return self.schema

    def validate_and_encode_row(self, row: Any) -> bytes:
        """
        Validate a row (dict) of metadata against this schema and return the encoded
        representation (bytes) using the codec specified in the schema.
        """
        # If the schema is permissive then validation can't fail
        if not self._bypass_validation:
            try:
                self._validate_row(row)
            except jsonschema.exceptions.ValidationError as ve:
                raise exceptions.MetadataValidationError(str(ve)) from ve
        return self.encode_row(row)

    def decode_row(self, row: bytes) -> Any:
        """
        Decode an encoded row (bytes) of metadata, using the codec specifed in the schema
        and return a python dict. Note that no validation of the metadata against the
        schema is performed.
        """
        # Set by __init__ (instance attribute shadows this method)
        pass  # pragma: no cover

    def encode_row(self, row: Any) -> bytes:
        """
        Encode a row (dict) of metadata to its binary representation (bytes)
        using the codec specified in the schema. Note that unlike
        :meth:`validate_and_encode_row` no validation against the schema is performed.
        This should only be used for performance if a validation check is not needed.
        """
        # Set by __init__ (instance attribute shadows this method)
        pass  # pragma: no cover

    def numpy_dtype(self) -> Any:
        # Delegates to the codec; only codecs with fixed-size layouts
        # (e.g. struct) can supply a dtype.
        return self.codec_instance.numpy_dtype(self._schema)

    def structured_array_from_buffer(self, buffer: Any) -> Any:
        """
        Convert a buffer of metadata into a structured NumPy array.
        """
        dtype = self.numpy_dtype()
        return np.frombuffer(buffer, dtype=dtype)

    @staticmethod
    def permissive_json():
        """
        The simplest, permissive JSON schema. Only specifies the JSON codec and has
        no constraints on the properties.
        """
        return MetadataSchema({"codec": "json"})

    @staticmethod
    def null():
        """
        The null schema which defines no properties and results in raw bytes
        being returned on accessing metadata column.
        """
        return MetadataSchema(None)
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
# Often many replicate tree sequences are processed with identical schemas, so cache them
@functools.lru_cache(maxsize=128)
def parse_metadata_schema(encoded_schema: str) -> MetadataSchema:
    """
    Create a schema object from its string encoding. The exact class returned is
    determined by the ``encoding`` specification in the string.

    :param str encoded_schema: The string encoded schema.
    :return: A subclass of AbstractMetadataSchema.
    :raises ValueError: If ``encoded_schema`` is non-empty but not valid JSON.
    """
    if encoded_schema == "":
        # The empty string denotes the null (raw bytes) schema.
        return MetadataSchema.null()
    else:
        try:
            # OrderedDict preserves key order so the schema round-trips exactly.
            decoded = json.loads(
                encoded_schema, object_pairs_hook=collections.OrderedDict
            )
        except json.decoder.JSONDecodeError as e:
            # FIX: chain the original decode error (`from e`) so the cause is
            # preserved, consistent with the `from ve` chaining used elsewhere
            # in this module.
            raise ValueError(
                f"Metadata schema is not JSON, found {encoded_schema}"
            ) from e
        return MetadataSchema(decoded)
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
class _CachedMetadata:
    """
    Descriptor for lazy decoding of metadata on attribute access.

    The owning instance stores raw metadata in ``_metadata`` and a decoder
    callable in ``_metadata_decoder``. The first read decodes in place and
    caches the result; subsequent reads return the cached value.
    """

    def __get__(self, row, owner):
        if row._metadata_decoder is not None:
            # Some classes that use this are frozen so we need to directly setattr.
            # FIX: the previous code called `__builtins__object__setattr__`,
            # which is not a defined name and would raise NameError; the
            # intended call is object.__setattr__ (bypasses frozen-dataclass
            # __setattr__ guards).
            object.__setattr__(row, "_metadata", row._metadata_decoder(row._metadata))
            # Decoder being None indicates that metadata is decoded
            object.__setattr__(row, "_metadata_decoder", None)
        return row._metadata

    def __set__(self, row, value):
        # FIX: same `__builtins__object__setattr__` -> object.__setattr__ repair.
        object.__setattr__(row, "_metadata", value)
|
|
948
|
+
|
|
949
|
+
|
|
950
|
+
def lazy_decode(own_init=False):
    """
    Return a class decorator that adds lazy metadata decoding to a slotted
    dataclass.

    :param bool own_init: If True, the class provides its own ``__init__``
        that records the decoder, so the wrapper init is not installed.
    """

    def _lazy_decode(cls):
        """
        Modifies a dataclass such that it lazily decodes metadata, if it is encoded.
        If the metadata passed to the constructor is encoded a `metadata_decoder`
        parameter must be also be passed.
        """
        if not own_init:
            wrapped_init = cls.__init__

            # Intercept the init to record the decoder
            def new_init(self, *args, metadata_decoder=None, **kwargs):
                # FIX: the previous code called `__builtins__object__setattr__`,
                # which is not a defined name and would raise NameError; the
                # intended call is object.__setattr__ (works on frozen classes).
                object.__setattr__(self, "_metadata_decoder", metadata_decoder)
                wrapped_init(self, *args, **kwargs)

            cls.__init__ = new_init

        # Add a descriptor to the class to decode and cache metadata
        cls.metadata = _CachedMetadata()

        # Add slots needed to the class
        slots = cls.__slots__
        slots.extend(["_metadata", "_metadata_decoder"])
        # Rebuild the class so the extended __slots__ take effect: copy the
        # class dict minus the stale slot member-descriptors, then re-attach
        # any attributes that share a name with a slot (e.g. defaults).
        dict_ = dict()
        sloted_members = dict()
        for k, v in cls.__dict__.items():
            if k not in slots:
                dict_[k] = v
            elif not isinstance(v, types.MemberDescriptorType):
                sloted_members[k] = v
        new_cls = type(cls.__name__, cls.__bases__, dict_)
        for k, v in sloted_members.items():
            setattr(new_cls, k, v)
        return new_cls

    return _lazy_decode
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
class MetadataProvider:
    """
    Abstract superclass of container objects that provide metadata.
    """

    def __init__(self, ll_object):
        self._ll_object = ll_object

    @property
    def metadata_schema(self) -> MetadataSchema:
        """
        The :class:`tskit.MetadataSchema` for this object.
        """
        return parse_metadata_schema(self._ll_object.metadata_schema)

    @metadata_schema.setter
    def metadata_schema(self, schema: MetadataSchema) -> None:
        # Round-trip the string form through the parser as a validity check
        # before storing it on the low-level object.
        schema_text = repr(schema)
        parse_metadata_schema(schema_text)
        self._ll_object.metadata_schema = schema_text

    @property
    def metadata(self) -> Any:
        """
        The decoded metadata for this object.
        """
        return self.metadata_schema.decode_row(self.metadata_bytes)

    @metadata.setter
    def metadata(self, metadata: bytes | dict | None) -> None:
        # Validate against the current schema, then store the encoded bytes.
        self._ll_object.metadata = self.metadata_schema.validate_and_encode_row(
            metadata
        )

    @property
    def metadata_bytes(self) -> Any:
        """
        The raw bytes of metadata for this TableCollection
        """
        return self._ll_object.metadata

    @property
    def nbytes(self) -> int:
        # Total storage: encoded metadata plus the schema string.
        return len(self._ll_object.metadata) + len(self._ll_object.metadata_schema)

    def assert_equals(self, other: MetadataProvider):
        if self.metadata_schema != other.metadata_schema:
            raise AssertionError(
                f"Metadata schemas differ: self={self.metadata_schema} "
                f"other={other.metadata_schema}"
            )
        if self.metadata != other.metadata:
            raise AssertionError(
                f"Metadata differs: self={self.metadata} other={other.metadata}"
            )
|
|
1045
|
+
|
|
1046
|
+
|
|
1047
|
+
NOTSET = object()  # Sentinel for unset default values


class TableMetadataReader:
    # Mixin for table classes that expose decoded metadata

    @property
    def metadata_schema(self) -> MetadataSchema:
        """
        The :class:`tskit.MetadataSchema` for this table.
        """
        # This isn't as inefficient as it looks because we're using an LRU cache on
        # the parse_metadata_schema function. Thus, we're really only incurring the
        # cost of creating the unicode string from the low-level schema and looking
        # up the functools cache.
        return parse_metadata_schema(self.ll_table.metadata_schema)

    def metadata_vector(self, key, *, dtype=None, default_value=NOTSET):
        """
        Returns a numpy array of metadata values obtained by extracting ``key``
        from each metadata entry, and using ``default_value`` if the key is
        not present. ``key`` may be a list, in which case nested values are returned.
        For instance, ``key = ["a", "x"]`` will return an array of
        ``row.metadata["a"]["x"]`` values, iterated over rows in this table.

        :param str key: The name, or a list of names, of metadata entries.
        :param str dtype: The dtype of the result (can usually be omitted).
        :param object default_value: The value to be inserted if the metadata key
            is not present. Note that for numeric columns, a default value of None
            will result in a non-numeric array. The default behaviour is to raise
            ``KeyError`` on missing entries.
        """
        from collections.abc import Mapping

        # FIX: sentinel check must use identity (`is`), not equality. With
        # `==`, a user-supplied default with a non-standard __eq__ (e.g. a
        # numpy array) would make this comparison return a non-bool and the
        # `if` would raise, or could spuriously match the sentinel.
        if default_value is NOTSET:

            def getter(d, k):
                return d[k]

        else:

            def getter(d, k):
                # Non-Mapping intermediate values also fall back to the default.
                return (
                    d.get(k, default_value) if isinstance(d, Mapping) else default_value
                )

        if isinstance(key, list):
            # Walk the nested keys left-to-right for each row.
            out = np.array(
                [functools.reduce(getter, key, row.metadata) for row in self],
                dtype=dtype,
            )
        else:
            out = np.array(
                [getter(row.metadata, key) for row in self],
                dtype=dtype,
            )
        return out

    def _make_row(self, *args):
        # Rows decode their metadata lazily using this table's schema.
        return self.row_class(*args, metadata_decoder=self.metadata_schema.decode_row)
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
class TableMetadataWriter(TableMetadataReader):
    # Mixin adding metadata-writing operations on top of TableMetadataReader

    @TableMetadataReader.metadata_schema.setter
    def metadata_schema(self, schema: MetadataSchema) -> None:
        if isinstance(schema, MetadataSchema):
            self.ll_table.metadata_schema = repr(schema)
        else:
            raise TypeError(
                "Only instances of tskit.MetadataSchema can be assigned to "
                f"metadata_schema, not {type(schema)}"
            )

    def packset_metadata(self, metadatas):
        """
        Packs the specified list of metadata values and updates the ``metadata``
        and ``metadata_offset`` columns. The length of the metadatas array
        must be equal to the number of rows in the table.

        :param list metadatas: A list of metadata bytes values.
        """
        packed_bytes, offsets = util.pack_bytes(metadatas)
        columns = self.asdict()
        columns.update(metadata=packed_bytes, metadata_offset=offsets)
        self.set_columns(**columns)

    def drop_metadata(self, *, keep_schema=False):
        """
        Drops all metadata in this table. By default, the schema is also cleared,
        except if ``keep_schema`` is True.

        :param bool keep_schema: True if the current schema should be kept intact.
        """
        columns = self.asdict()
        columns["metadata"] = []
        columns["metadata_offset"][:] = 0
        self.set_columns(**columns)
        if not keep_schema:
            self.metadata_schema = MetadataSchema.null()
|