bio2zarr 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +125 -24
- bio2zarr/core.py +13 -3
- bio2zarr/vcf.py +568 -328
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/METADATA +1 -1
- bio2zarr-0.0.6.dist-info/RECORD +16 -0
- bio2zarr-0.0.5.dist-info/RECORD +0 -16
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/top_level.txt +0 -0
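The vcf.py changes below replace the single-pass Zarr encoder with a partitioned, resumable writer driven by three module-level entry points (encode_init, encode_partition, encode_finalise). A minimal sketch of how these could be called from Python, inferred from the signatures in the diff; the file names are hypothetical:

# Minimal sketch of the staged encode API added in vcf.py (paths are hypothetical).
from bio2zarr import vcf

# Plan the conversion: writes the WIP metadata and empty arrays, and returns
# the number of partitions and the per-partition memory estimate.
num_partitions, per_partition_memory = vcf.encode_init(
    "sample.icf", "sample.vcz", target_num_partitions=4
)

# Each partition can then be encoded independently (e.g. one per cluster job).
for partition in range(num_partitions):
    vcf.encode_partition("sample.vcz", partition)

# Once every partition has been written, assemble the final Zarr store.
vcf.encode_finalise("sample.vcz", show_progress=True)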
bio2zarr/vcf.py
CHANGED
@@ -111,9 +111,6 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"

-    # TODO add method here to choose a good set compressor and
-    # filters default here for this field.
-
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ class VcfField:
         if self.vcf_type == "Float":
            ret = "f4"
         elif self.vcf_type == "Integer":
-
-
-
-
-
-
-
+            if not math.isfinite(s.max_value):
+                # All missing values; use i1. Note we should have some API to
+                # check more explicitly for missingness:
+                # https://github.com/sgkit-dev/bio2zarr/issues/131
+                ret = "i1"
+            else:
+                ret = core.min_int_dtype(s.min_value, s.max_value)
         elif self.vcf_type == "Flag":
             ret = "bool"
         elif self.vcf_type == "Character":
@@ -152,6 +149,10 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )

+# TODO refactor this to have embedded Contig dataclass, Filters
+# and Samples dataclasses to allow for more information to be
+# retained and forward compatibility.
+

 @dataclasses.dataclass
 class IcfMetadata:
@@ -183,6 +184,14 @@ class IcfMetadata:
                 fields.append(field)
         return fields

+    @property
+    def num_contigs(self):
+        return len(self.contig_names)
+
+    @property
+    def num_filters(self):
+        return len(self.filters)
+
     @property
     def num_records(self):
         return sum(self.contig_record_counts.values())
@@ -1242,6 +1251,50 @@ class ZarrColumnSpec:
         spec._choose_compressor_settings()
         return spec

+    @staticmethod
+    def from_field(
+        vcf_field,
+        *,
+        num_variants,
+        num_samples,
+        variants_chunk_size,
+        samples_chunk_size,
+        variable_name=None,
+    ):
+        shape = [num_variants]
+        prefix = "variant_"
+        dimensions = ["variants"]
+        chunks = [variants_chunk_size]
+        if vcf_field.category == "FORMAT":
+            prefix = "call_"
+            shape.append(num_samples)
+            chunks.append(samples_chunk_size)
+            dimensions.append("samples")
+        if variable_name is None:
+            variable_name = prefix + vcf_field.name
+        # TODO make an option to add in the empty extra dimension
+        if vcf_field.summary.max_number > 1:
+            shape.append(vcf_field.summary.max_number)
+            # TODO we should really be checking this to see if the named dimensions
+            # are actually correct.
+            if vcf_field.vcf_number == "R":
+                dimensions.append("alleles")
+            elif vcf_field.vcf_number == "A":
+                dimensions.append("alt_alleles")
+            elif vcf_field.vcf_number == "G":
+                dimensions.append("genotypes")
+            else:
+                dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+        return ZarrColumnSpec.new(
+            vcf_field=vcf_field.full_name,
+            name=variable_name,
+            dtype=vcf_field.smallest_dtype(),
+            shape=shape,
+            chunks=chunks,
+            dimensions=dimensions,
+            description=vcf_field.description,
+        )
+
     def _choose_compressor_settings(self):
         """
         Choose compressor and filter settings based on the size and
@@ -1250,17 +1303,32 @@ class ZarrColumnSpec:

         See https://github.com/pystatgen/bio2zarr/discussions/74
         """
-        dt = np.dtype(self.dtype)
         # Default is to not shuffle, because autoshuffle isn't recognised
         # by many Zarr implementations, and shuffling can lead to worse
         # performance in some cases anyway. Turning on shuffle should be a
         # deliberate choice.
         shuffle = numcodecs.Blosc.NOSHUFFLE
-        if
-        #
+        if self.name == "call_genotype" and self.dtype == "i1":
+            # call_genotype gets BITSHUFFLE by default as it gets
+            # significantly better compression (at a cost of slower
+            # decoding)
             shuffle = numcodecs.Blosc.BITSHUFFLE
+        elif self.dtype == "bool":
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+
         self.compressor["shuffle"] = shuffle

+    @property
+    def variant_chunk_nbytes(self):
+        """
+        Returns the nbytes for a single variant chunk of this array.
+        """
+        chunk_items = self.chunks[0]
+        for size in self.shape[1:]:
+            chunk_items *= size
+        dt = np.dtype(self.dtype)
+        return chunk_items * dt.itemsize
+

 ZARR_SCHEMA_FORMAT_VERSION = "0.2"

@@ -1313,6 +1381,16 @@ class VcfZarrSchema:
             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )

+        def spec_from_field(field, variable_name=None):
+            return ZarrColumnSpec.from_field(
+                field,
+                num_samples=n,
+                num_variants=m,
+                samples_chunk_size=samples_chunk_size,
+                variants_chunk_size=variants_chunk_size,
+                variable_name=variable_name,
+            )
+
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
@@ -1328,95 +1406,56 @@ class VcfZarrSchema:

         alt_col = icf.columns["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
-        num_filters = len(icf.metadata.filters)

-        # # FIXME get dtype from lookup table
         colspecs = [
             fixed_field_spec(
                 name="variant_contig",
-                dtype=
+                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
             ),
             fixed_field_spec(
                 name="variant_filter",
                 dtype="bool",
-                shape=(m, num_filters),
+                shape=(m, icf.metadata.num_filters),
                 dimensions=["variants", "filters"],
             ),
             fixed_field_spec(
                 name="variant_allele",
                 dtype="str",
-                shape=
+                shape=(m, max_alleles),
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
-                vcf_field="POS",
-                name="variant_position",
-                dtype="i4",
-            ),
-            fixed_field_spec(
-                vcf_field=None,
                 name="variant_id",
                 dtype="str",
             ),
             fixed_field_spec(
-                vcf_field=None,
                 name="variant_id_mask",
                 dtype="bool",
             ),
-            fixed_field_spec(
-                vcf_field="QUAL",
-                name="variant_quality",
-                dtype="f4",
-            ),
         ]
+        name_map = {field.full_name: field for field in icf.metadata.fields}
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        colspecs.extend(
+            [
+                spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
+                spec_from_field(name_map["POS"], variable_name="variant_position"),
+            ]
+        )
+        colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])

         gt_field = None
-        for field in icf.metadata.
-            if field.category == "fixed":
-                continue
+        for field in icf.metadata.format_fields:
             if field.name == "GT":
                 gt_field = field
                 continue
-
-            prefix = "variant_"
-            dimensions = ["variants"]
-            chunks = [variants_chunk_size]
-            if field.category == "FORMAT":
-                prefix = "call_"
-                shape.append(n)
-                chunks.append(samples_chunk_size)
-                dimensions.append("samples")
-            # TODO make an option to add in the empty extra dimension
-            if field.summary.max_number > 1:
-                shape.append(field.summary.max_number)
-            # TODO we should really be checking this to see if the named dimensions
-            # are actually correct.
-            if field.vcf_number == "R":
-                dimensions.append("alleles")
-            elif field.vcf_number == "A":
-                dimensions.append("alt_alleles")
-            elif field.vcf_number == "G":
-                dimensions.append("genotypes")
-            else:
-                dimensions.append(f"{field.category}_{field.name}_dim")
-            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec.new(
-                vcf_field=field.full_name,
-                name=variable_name,
-                dtype=field.smallest_dtype(),
-                shape=shape,
-                chunks=chunks,
-                dimensions=dimensions,
-                description=field.description,
-            )
-            colspecs.append(colspec)
+            colspecs.append(spec_from_field(field))

         if gt_field is not None:
             ploidy = gt_field.summary.max_number - 1
             shape = [m, n]
             chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]
-
             colspecs.append(
                 ZarrColumnSpec.new(
                     vcf_field=None,
@@ -1498,15 +1537,6 @@ class VcfZarr:
         return data


-@dataclasses.dataclass
-class EncodingWork:
-    func: callable = dataclasses.field(repr=False)
-    start: int
-    stop: int
-    columns: list[str]
-    memory: int = 0
-
-
 def parse_max_memory(max_memory):
     if max_memory is None:
         # Effectively unbounded
@@ -1517,67 +1547,299 @@ def parse_max_memory(max_memory):
     return max_memory


+@dataclasses.dataclass
+class VcfZarrPartition:
+    start_index: int
+    stop_index: int
+    start_chunk: int
+    stop_chunk: int
+
+    @staticmethod
+    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+        num_chunks = int(np.ceil(num_records / chunk_size))
+        if max_chunks is not None:
+            num_chunks = min(num_chunks, max_chunks)
+        partitions = []
+        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+        for chunk_slice in splits:
+            start_chunk = int(chunk_slice[0])
+            stop_chunk = int(chunk_slice[-1]) + 1
+            start_index = start_chunk * chunk_size
+            stop_index = min(stop_chunk * chunk_size, num_records)
+            partitions.append(
+                VcfZarrPartition(start_index, stop_index, start_chunk, stop_chunk)
+            )
+        return partitions
+
+
+VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+@dataclasses.dataclass
+class VcfZarrWriterMetadata:
+    format_version: str
+    icf_path: str
+    schema: VcfZarrSchema
+    dimension_separator: str
+    partitions: list
+    provenance: dict
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "VcfZarrWriter format version mismatch: "
+                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrWriterMetadata(**d)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+        return ret
+
+
 class VcfZarrWriter:
-    def __init__(self, path
+    def __init__(self, path):
         self.path = pathlib.Path(path)
+        self.wip_path = self.path / "wip"
+        self.arrays_path = self.wip_path / "arrays"
+        self.partitions_path = self.wip_path / "partitions"
+        self.metadata = None
+        self.icf = None
+
+    @property
+    def schema(self):
+        return self.metadata.schema
+
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
+    #######################
+    # init
+    #######################
+
+    def init(
+        self,
+        icf,
+        *,
+        target_num_partitions,
+        schema,
+        dimension_separator=None,
+        max_variant_chunks=None,
+    ):
         self.icf = icf
-        self.
+        if self.path.exists():
+            raise ValueError("Zarr path already exists")  # NEEDS TEST
+        partitions = VcfZarrPartition.generate_partitions(
+            self.icf.num_records,
+            schema.variants_chunk_size,
+            target_num_partitions,
+            max_chunks=max_variant_chunks,
+        )
         # Default to using nested directories following the Zarr v3 default.
         # This seems to require version 2.17+ to work properly
-
+        dimension_separator = (
             "/" if dimension_separator is None else dimension_separator
         )
+        self.metadata = VcfZarrWriterMetadata(
+            format_version=VZW_METADATA_FORMAT_VERSION,
+            icf_path=str(self.icf.path),
+            schema=schema,
+            dimension_separator=dimension_separator,
+            partitions=partitions,
+            # Bare minimum here for provenance - see comments above
+            provenance={"source": f"bio2zarr-{provenance.__version__}"},
+        )
+
+        self.path.mkdir()
         store = zarr.DirectoryStore(self.path)
-
+        root = zarr.group(store=store)
+        root.attrs.update(
+            {
+                "vcf_zarr_version": "0.2",
+                "vcf_header": self.icf.vcf_header,
+                "source": f"bio2zarr-{provenance.__version__}",
+            }
+        )
+        # Doing this syncronously - this is fine surely
+        self.encode_samples(root)
+        self.encode_filter_id(root)
+        self.encode_contig_id(root)
+
+        self.wip_path.mkdir()
+        self.arrays_path.mkdir()
+        self.partitions_path.mkdir()
+        store = zarr.DirectoryStore(self.arrays_path)
+        root = zarr.group(store=store)
+
+        for column in self.schema.columns.values():
+            self.init_array(root, column, partitions[-1].stop_index)
+
+        logger.info("Writing WIP metadata")
+        with open(self.wip_path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        return len(partitions)
+
+    def encode_samples(self, root):
+        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
+            raise ValueError(
+                "Subsetting or reordering samples not supported currently"
+            )  # NEEDS TEST
+        array = root.array(
+            "sample_id",
+            self.schema.sample_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            chunks=(self.schema.samples_chunk_size,),
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+        logger.debug("Samples done")

-    def
+    def encode_contig_id(self, root):
+        array = root.array(
+            "contig_id",
+            self.schema.contig_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+        if self.schema.contig_length is not None:
+            array = root.array(
+                "contig_length",
+                self.schema.contig_length,
+                dtype=np.int64,
+                compressor=DEFAULT_ZARR_COMPRESSOR,
+            )
+            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+
+    def encode_filter_id(self, root):
+        array = root.array(
+            "filter_id",
+            self.schema.filter_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
+
+    def init_array(self, root, variable, variants_dim_size):
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
-
-
-
+        shape = list(variable.shape)
+        # Truncate the variants dimension is max_variant_chunks was specified
+        shape[0] = variants_dim_size
+        a = root.empty(
+            variable.name,
+            shape=shape,
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
-            dimension_separator=self.dimension_separator,
+            dimension_separator=self.metadata.dimension_separator,
+        )
+        a.attrs.update(
+            {
+                "description": variable.description,
+                # Dimension names are part of the spec in Zarr v3
+                "_ARRAY_DIMENSIONS": variable.dimensions,
+            }
         )
-
-        a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
+        logger.debug(f"Initialised {a}")

-
-
+    #######################
+    # encode_partition
+    #######################

-    def
-
-
+    def load_metadata(self):
+        if self.metadata is None:
+            with open(self.wip_path / "metadata.json") as f:
+                self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
+            self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
+
+    def partition_path(self, partition_index):
+        return self.partitions_path / f"p{partition_index}"
+
+    def wip_partition_array_path(self, partition_index, name):
+        return self.partition_path(partition_index) / f"wip_{name}"
+
+    def partition_array_path(self, partition_index, name):
+        return self.partition_path(partition_index) / name
+
+    def encode_partition(self, partition_index):
+        self.load_metadata()
+        partition_path = self.partition_path(partition_index)
+        partition_path.mkdir(exist_ok=True)
+        logger.info(f"Encoding partition {partition_index} to {partition_path}")
+
+        self.encode_alleles_partition(partition_index)
+        self.encode_id_partition(partition_index)
+        self.encode_filters_partition(partition_index)
+        self.encode_contig_partition(partition_index)
+        for col in self.schema.columns.values():
+            if col.vcf_field is not None:
+                self.encode_array_partition(col, partition_index)
+        if "call_genotype" in self.schema.columns:
+            self.encode_genotypes_partition(partition_index)
+
+    def init_partition_array(self, partition_index, name):
+        wip_path = self.wip_partition_array_path(partition_index, name)
+        # Create an empty array like the definition
+        src = self.arrays_path / name
+        # Overwrite any existing WIP files
+        shutil.copytree(src, wip_path, dirs_exist_ok=True)
+        array = zarr.open(wip_path)
+        logger.debug(f"Opened empty array {array} @ {wip_path}")
+        return array
+
+    def finalise_partition_array(self, partition_index, name):
+        wip_path = self.wip_partition_array_path(partition_index, name)
+        final_path = self.partition_array_path(partition_index, name)
+        if final_path.exists():
+            # NEEDS TEST
+            logger.warning(f"Removing existing {final_path}")
+            shutil.rmtree(final_path)
         # Atomic swap
-        os.rename(
-        logger.
+        os.rename(wip_path, final_path)
+        logger.debug(f"Encoded {name} partition {partition_index}")
+
+    def encode_array_partition(self, column, partition_index):
+        array = self.init_partition_array(partition_index, column.name)

-
+        partition = self.metadata.partitions[partition_index]
+        ba = core.BufferedArray(array, partition.start_index)
         source_col = self.icf.columns[column.vcf_field]
-        array = self.get_array(column.name)
-        ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)

-        for value in source_col.iter_values(
+        for value in source_col.iter_values(
+            partition.start_index, partition.stop_index
+        ):
             # We write directly into the buffer in the sanitiser function
             # to make it easier to reason about dimension padding
             j = ba.next_buffer_row()
             sanitiser(ba.buff, j, value)
         ba.flush()
-
+        self.finalise_partition_array(partition_index, column.name)

-    def
-
-
-
-
+    def encode_genotypes_partition(self, partition_index):
+        gt_array = self.init_partition_array(partition_index, "call_genotype")
+        gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
+        gt_phased_array = self.init_partition_array(
+            partition_index, "call_genotype_phased"
+        )
+
+        partition = self.metadata.partitions[partition_index]
+        gt = core.BufferedArray(gt_array, partition.start_index)
+        gt_mask = core.BufferedArray(gt_mask_array, partition.start_index)
+        gt_phased = core.BufferedArray(gt_phased_array, partition.start_index)

-
+        source_col = self.icf.columns["FORMAT/GT"]
+        for value in source_col.iter_values(
+            partition.start_index, partition.stop_index
+        ):
             j = gt.next_buffer_row()
             sanitise_value_int_2d(gt.buff, j, value[:, :-1])
             j = gt_phased.next_buffer_row()
@@ -1589,29 +1851,40 @@ class VcfZarrWriter:
         gt.flush()
         gt_phased.flush()
         gt_mask.flush()
-        logger.debug(f"Encoded GT slice {start}:{stop}")

+        self.finalise_partition_array(partition_index, "call_genotype")
+        self.finalise_partition_array(partition_index, "call_genotype_mask")
+        self.finalise_partition_array(partition_index, "call_genotype_phased")
+
+    def encode_alleles_partition(self, partition_index):
+        array_name = "variant_allele"
+        alleles_array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        alleles = core.BufferedArray(alleles_array, partition.start_index)
         ref_col = self.icf.columns["REF"]
         alt_col = self.icf.columns["ALT"]
-        alleles = core.BufferedArray(self.get_array("variant_allele"), start)

         for ref, alt in zip(
-            ref_col.iter_values(
+            ref_col.iter_values(partition.start_index, partition.stop_index),
+            alt_col.iter_values(partition.start_index, partition.stop_index),
         ):
             j = alleles.next_buffer_row()
             alleles.buff[j, :] = STR_FILL
             alleles.buff[j, 0] = ref[0]
             alleles.buff[j, 1 : 1 + len(alt)] = alt
         alleles.flush()
-        logger.debug(f"Encoded alleles slice {start}:{stop}")

+        self.finalise_partition_array(partition_index, array_name)
+
+    def encode_id_partition(self, partition_index):
+        vid_array = self.init_partition_array(partition_index, "variant_id")
+        vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
+        partition = self.metadata.partitions[partition_index]
+        vid = core.BufferedArray(vid_array, partition.start_index)
+        vid_mask = core.BufferedArray(vid_mask_array, partition.start_index)
         col = self.icf.columns["ID"]
-        vid = core.BufferedArray(self.get_array("variant_id"), start)
-        vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)

-        for value in col.iter_values(
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
@@ -1623,13 +1896,19 @@ class VcfZarrWriter:
                 vid_mask.buff[j] = True
         vid.flush()
         vid_mask.flush()
-        logger.debug(f"Encoded ID slice {start}:{stop}")

-
-
-        var_filter = core.BufferedArray(self.get_array("variant_filter"), start)
+        self.finalise_partition_array(partition_index, "variant_id")
+        self.finalise_partition_array(partition_index, "variant_id_mask")

-
+    def encode_filters_partition(self, partition_index):
+        lookup = {filt: index for index, filt in enumerate(self.schema.filter_id)}
+        array_name = "variant_filter"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        var_filter = core.BufferedArray(array, partition.start_index)
+
+        col = self.icf.columns["FILTERS"]
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
             for f in value:
@@ -1637,16 +1916,21 @@ class VcfZarrWriter:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
                     raise ValueError(
-                        f"Filter '{f}' was not defined
+                        f"Filter '{f}' was not defined in the header."
                     ) from None
         var_filter.flush()
-        logger.debug(f"Encoded FILTERS slice {start}:{stop}")

-
+        self.finalise_partition_array(partition_index, array_name)
+
+    def encode_contig_partition(self, partition_index):
+        lookup = {contig: index for index, contig in enumerate(self.schema.contig_id)}
+        array_name = "variant_contig"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        contig = core.BufferedArray(array, partition.start_index)
         col = self.icf.columns["CHROM"]
-        contig = core.BufferedArray(self.get_array("variant_contig"), start)

-        for value in col.iter_values(
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = contig.next_buffer_row()
             # Note: because we are using the indexes to define the lookups
             # and we always have an index, it seems that we the contig lookup
@@ -1654,161 +1938,120 @@ class VcfZarrWriter:
             # here, please do open an issue with a reproducible example!
             contig.buff[j] = lookup[value[0]]
         contig.flush()
-        logger.debug(f"Encoded CHROM slice {start}:{stop}")
-
-    def encode_samples(self):
-        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
-            raise ValueError(
-                "Subsetting or reordering samples not supported currently"
-            )  # NEEDS TEST
-        array = self.root.array(
-            "sample_id",
-            self.schema.sample_id,
-            dtype="str",
-            compressor=DEFAULT_ZARR_COMPRESSOR,
-            chunks=(self.schema.samples_chunk_size,),
-        )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-        logger.debug("Samples done")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.finalise_partition_array(partition_index, array_name)
+
+    #######################
+    # finalise
+    #######################
+
+    def finalise_array(self, name):
+        logger.info(f"Finalising {name}")
+        final_path = self.path / name
+        if final_path.exists():
+            # NEEDS TEST
+            raise ValueError(f"Array {name} already exists")
+        for partition in range(len(self.metadata.partitions)):
+            # Move all the files in partition dir to dest dir
+            src = self.partition_array_path(partition, name)
+            if not src.exists():
+                # Needs test
+                raise ValueError(f"Partition {partition} of {name} does not exist")
+            dest = self.arrays_path / name
+            # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
+            chunk_files = [
+                path for path in src.iterdir() if not path.name.startswith(".")
+            ]
+            # TODO check for a count of then number of files. If we require a
+            # dimension_separator of "/" then we could make stronger assertions
+            # here, as we'd always have num_variant_chunks
+            logger.debug(
+                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
             )
-
-
+            for chunk_file in chunk_files:
+                os.rename(chunk_file, dest / chunk_file.name)
+        # Finally, once all the chunks have moved into the arrays dir,
+        # we move it out of wip
+        os.rename(self.arrays_path / name, self.path / name)
+        core.update_progress(1)

-    def
-
-
-
-
-
+    def finalise(self, show_progress=False):
+        self.load_metadata()
+
+        progress_config = core.ProgressConfig(
+            total=len(self.schema.columns),
+            title="Finalise",
+            units="array",
+            show=show_progress,
         )
-
-
+        # NOTE: it's not clear that adding more workers will make this quicker,
+        # as it's just going to be causing contention on the file system.
+        # Something to check empirically in some deployments.
+        # FIXME we're just using worker_processes=0 here to hook into the
+        # SynchronousExecutor which is intended for testing purposes so
+        # that we get test coverage. Should fix this either by allowing
+        # for multiple workers, or making a standard wrapper for tqdm
+        # that allows us to have a consistent look and feel.
+        with core.ParallelWorkManager(0, progress_config) as pwm:
+            for name in self.schema.columns:
+                pwm.submit(self.finalise_array, name)
+        zarr.consolidate_metadata(self.path)

-
-
-
-        self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
-        for column in self.schema.columns.values():
-            self.init_array(column)
+    ######################
+    # encode_all_partitions
+    ######################

-    def
-
+    def get_max_encoding_memory(self):
+        """
+        Return the approximate maximum memory used to encode a variant chunk.
+        """
+        max_encoding_mem = max(
+            col.variant_chunk_nbytes for col in self.schema.columns.values()
+        )
+        gt_mem = 0
+        if "call_genotype" in self.schema.columns:
+            encoded_together = [
+                "call_genotype",
+                "call_genotype_phased",
+                "call_genotype_mask",
+            ]
+            gt_mem = sum(
+                self.schema.columns[col].variant_chunk_nbytes
+                for col in encoded_together
+            )
+        return max(max_encoding_mem, gt_mem)

-    def
-        self,
-        worker_processes=1,
-        max_v_chunks=None,
-        show_progress=False,
-        max_memory=None,
+    def encode_all_partitions(
+        self, *, worker_processes=1, show_progress=False, max_memory=None
     ):
         max_memory = parse_max_memory(max_memory)
-
-
-
-
-
-
-
+        self.load_metadata()
+        num_partitions = self.num_partitions
+        per_worker_memory = self.get_max_encoding_memory()
+        logger.info(
+            f"Encoding Zarr over {num_partitions} partitions with "
+            f"{worker_processes} workers and {display_size(per_worker_memory)} "
+            "per worker"
        )
-
-
-
-
-
-
-
-        total_bytes = 0
-        encoding_memory_requirements = {}
-        for col in self.schema.columns.values():
-            array = self.get_array(col.name)
-            # NOTE!! this is bad, we're potentially creating quite a large
-            # numpy array for basically nothing. We can compute this.
-            variant_chunk_size = array.blocks[0].nbytes
-            encoding_memory_requirements[col.name] = variant_chunk_size
-            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} "
-                f"per worker"
+        # Each partition requires per_worker_memory bytes, so to prevent more that
+        # max_memory being used, we clamp the number of workers
+        max_num_workers = max_memory // per_worker_memory
+        if max_num_workers < worker_processes:
+            logger.warning(
+                f"Limiting number of workers to {max_num_workers} to "
+                f"keep within specified memory budget of {display_size(max_memory)}"
            )
-
-
-
-
-
-        work = []
-        for start, stop in slices:
-            for col in self.schema.columns.values():
-                if col.vcf_field is not None:
-                    f = functools.partial(self.encode_array_slice, col)
-                    work.append(
-                        EncodingWork(
-                            f,
-                            start,
-                            stop,
-                            [col.name],
-                            encoding_memory_requirements[col.name],
-                        )
-                    )
-            work.append(
-                EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
-            )
-            work.append(
-                EncodingWork(
-                    self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
-                )
-            )
-            work.append(
-                EncodingWork(
-                    functools.partial(self.encode_filters_slice, filter_id_map),
-                    start,
-                    stop,
-                    ["variant_filter"],
-                )
-            )
-            work.append(
-                EncodingWork(
-                    functools.partial(self.encode_contig_slice, contig_id_map),
-                    start,
-                    stop,
-                    ["variant_contig"],
-                )
+        if max_num_workers <= 0:
+            raise ValueError(
+                f"Insufficient memory to encode a partition:"
+                f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
             )
-
-            variables = [
-                "call_genotype",
-                "call_genotype_phased",
-                "call_genotype_mask",
-            ]
-            gt_memory = sum(
-                encoding_memory_requirements[name] for name in variables
-            )
-            work.append(
-                EncodingWork(
-                    self.encode_genotypes_slice, start, stop, variables, gt_memory
-                )
-            )
+        num_workers = min(max_num_workers, worker_processes)

-
-        for
-
-
-            f"Insufficient memory for {wp.columns}: "
-            f"{display_size(wp.memory)} > {display_size(max_memory)}"
-        )
+        total_bytes = 0
+        for col in self.schema.columns.values():
+            # Open the array definition to get the total size
+            total_bytes += zarr.open(self.arrays_path / col.name).nbytes

         progress_config = core.ProgressConfig(
             total=total_bytes,
@@ -1816,54 +2059,9 @@ class VcfZarrWriter:
             units="B",
             show=show_progress,
         )
-
-
-
-        # below doesn't really work.
-        max_queued = 4 * max(1, worker_processes)
-        encoded_slices = collections.Counter()
-
-        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            future = pwm.submit(self.encode_samples)
-            future_to_work = {future: EncodingWork(None, 0, 0, [])}
-
-            def service_completed_futures():
-                nonlocal used_memory
-
-                completed = pwm.wait_for_completed()
-                for future in completed:
-                    wp_done = future_to_work.pop(future)
-                    used_memory -= wp_done.memory
-                    logger.debug(
-                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
-                    )
-                    for column in wp_done.columns:
-                        encoded_slices[column] += 1
-                        if encoded_slices[column] == len(slices):
-                            # Do this syncronously for simplicity. Should be
-                            # fine as the workers will probably be busy with
-                            # large encode tasks most of the time.
-                            self.finalise_array(column)
-
-            for wp in work:
-                while (
-                    used_memory + wp.memory > max_memory
-                    or len(future_to_work) > max_queued
-                ):
-                    logger.debug(
-                        f"Wait: mem_required={used_memory + wp.memory} "
-                        f"max_mem={max_memory} queued={len(future_to_work)} "
-                        f"max_queued={max_queued}"
-                    )
-                    service_completed_futures()
-                future = pwm.submit(wp.func, wp.start, wp.stop)
-                used_memory += wp.memory
-                logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
-                future_to_work[future] = wp
-
-            logger.debug("All work submitted")
-            while len(future_to_work) > 0:
-                service_completed_futures()
+        with core.ParallelWorkManager(num_workers, progress_config) as pwm:
+            for partition_index in range(num_partitions):
+                pwm.submit(self.encode_partition, partition_index)


 def mkschema(if_path, out):
@@ -1878,13 +2076,48 @@ def encode(
     schema_path=None,
     variants_chunk_size=None,
     samples_chunk_size=None,
-
+    max_variant_chunks=None,
     dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
-
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        if_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=1,
+    show_progress=False,
+):
+    icf = IntermediateColumnarFormat(icf_path)
     if schema_path is None:
         schema = VcfZarrSchema.generate(
             icf,
@@ -1900,18 +2133,25 @@ def encode(
         with open(schema_path) as f:
             schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
-
-
-
-
-
-
-
-        worker_processes=worker_processes,
-        max_memory=max_memory,
-        show_progress=show_progress,
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.init(
+        icf,
+        target_num_partitions=target_num_partitions,
+        schema=schema,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
     )
-    vzw.
+    return vzw.num_partitions, vzw.get_max_encoding_memory()
+
+
+def encode_partition(zarr_path, partition):
+    writer = VcfZarrWriter(zarr_path)
+    writer.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer = VcfZarrWriter(zarr_path)
+    writer.finalise(show_progress=show_progress)


 def convert(
@@ -2121,7 +2361,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
     if show_progress:
-        iterator = tqdm.tqdm(vcf, desc="
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
     else:
         iterator = vcf
     for j, row in enumerate(iterator, start_index):