bio2zarr 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

bio2zarr/vcf.py CHANGED
@@ -111,9 +111,6 @@ class VcfField:
111
111
  return self.name
112
112
  return f"{self.category}/{self.name}"
113
113
 
114
- # TODO add method here to choose a good set compressor and
115
- # filters default here for this field.
116
-
117
114
  def smallest_dtype(self):
118
115
  """
119
116
  Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ class VcfField:
123
120
  if self.vcf_type == "Float":
124
121
  ret = "f4"
125
122
  elif self.vcf_type == "Integer":
126
- dtype = "i4"
127
- for a_dtype in ["i1", "i2"]:
128
- info = np.iinfo(a_dtype)
129
- if info.min <= s.min_value and s.max_value <= info.max:
130
- dtype = a_dtype
131
- break
132
- ret = dtype
123
+ if not math.isfinite(s.max_value):
124
+ # All missing values; use i1. Note we should have some API to
125
+ # check more explicitly for missingness:
126
+ # https://github.com/sgkit-dev/bio2zarr/issues/131
127
+ ret = "i1"
128
+ else:
129
+ ret = core.min_int_dtype(s.min_value, s.max_value)
133
130
  elif self.vcf_type == "Flag":
134
131
  ret = "bool"
135
132
  elif self.vcf_type == "Character":
@@ -152,6 +149,10 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
152
149
  cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
153
150
  )
154
151
 
152
+ # TODO refactor this to have embedded Contig dataclass, Filters
153
+ # and Samples dataclasses to allow for more information to be
154
+ # retained and forward compatibility.
155
+
155
156
 
156
157
  @dataclasses.dataclass
157
158
  class IcfMetadata:
@@ -183,6 +184,14 @@ class IcfMetadata:
183
184
  fields.append(field)
184
185
  return fields
185
186
 
187
+ @property
188
+ def num_contigs(self):
189
+ return len(self.contig_names)
190
+
191
+ @property
192
+ def num_filters(self):
193
+ return len(self.filters)
194
+
186
195
  @property
187
196
  def num_records(self):
188
197
  return sum(self.contig_record_counts.values())
@@ -1242,6 +1251,50 @@ class ZarrColumnSpec:
1242
1251
  spec._choose_compressor_settings()
1243
1252
  return spec
1244
1253
 
1254
+ @staticmethod
1255
+ def from_field(
1256
+ vcf_field,
1257
+ *,
1258
+ num_variants,
1259
+ num_samples,
1260
+ variants_chunk_size,
1261
+ samples_chunk_size,
1262
+ variable_name=None,
1263
+ ):
1264
+ shape = [num_variants]
1265
+ prefix = "variant_"
1266
+ dimensions = ["variants"]
1267
+ chunks = [variants_chunk_size]
1268
+ if vcf_field.category == "FORMAT":
1269
+ prefix = "call_"
1270
+ shape.append(num_samples)
1271
+ chunks.append(samples_chunk_size)
1272
+ dimensions.append("samples")
1273
+ if variable_name is None:
1274
+ variable_name = prefix + vcf_field.name
1275
+ # TODO make an option to add in the empty extra dimension
1276
+ if vcf_field.summary.max_number > 1:
1277
+ shape.append(vcf_field.summary.max_number)
1278
+ # TODO we should really be checking this to see if the named dimensions
1279
+ # are actually correct.
1280
+ if vcf_field.vcf_number == "R":
1281
+ dimensions.append("alleles")
1282
+ elif vcf_field.vcf_number == "A":
1283
+ dimensions.append("alt_alleles")
1284
+ elif vcf_field.vcf_number == "G":
1285
+ dimensions.append("genotypes")
1286
+ else:
1287
+ dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
1288
+ return ZarrColumnSpec.new(
1289
+ vcf_field=vcf_field.full_name,
1290
+ name=variable_name,
1291
+ dtype=vcf_field.smallest_dtype(),
1292
+ shape=shape,
1293
+ chunks=chunks,
1294
+ dimensions=dimensions,
1295
+ description=vcf_field.description,
1296
+ )
1297
+
1245
1298
  def _choose_compressor_settings(self):
1246
1299
  """
1247
1300
  Choose compressor and filter settings based on the size and
@@ -1250,17 +1303,32 @@ class ZarrColumnSpec:
1250
1303
 
1251
1304
  See https://github.com/pystatgen/bio2zarr/discussions/74
1252
1305
  """
1253
- dt = np.dtype(self.dtype)
1254
1306
  # Default is to not shuffle, because autoshuffle isn't recognised
1255
1307
  # by many Zarr implementations, and shuffling can lead to worse
1256
1308
  # performance in some cases anyway. Turning on shuffle should be a
1257
1309
  # deliberate choice.
1258
1310
  shuffle = numcodecs.Blosc.NOSHUFFLE
1259
- if dt.itemsize == 1:
1260
- # Any 1 byte field gets BITSHUFFLE by default
1311
+ if self.name == "call_genotype" and self.dtype == "i1":
1312
+ # call_genotype gets BITSHUFFLE by default as it gets
1313
+ # significantly better compression (at a cost of slower
1314
+ # decoding)
1261
1315
  shuffle = numcodecs.Blosc.BITSHUFFLE
1316
+ elif self.dtype == "bool":
1317
+ shuffle = numcodecs.Blosc.BITSHUFFLE
1318
+
1262
1319
  self.compressor["shuffle"] = shuffle
1263
1320
 
1321
+ @property
1322
+ def variant_chunk_nbytes(self):
1323
+ """
1324
+ Returns the nbytes for a single variant chunk of this array.
1325
+ """
1326
+ chunk_items = self.chunks[0]
1327
+ for size in self.shape[1:]:
1328
+ chunk_items *= size
1329
+ dt = np.dtype(self.dtype)
1330
+ return chunk_items * dt.itemsize
1331
+
1264
1332
 
1265
1333
  ZARR_SCHEMA_FORMAT_VERSION = "0.2"
1266
1334
 
@@ -1313,6 +1381,16 @@ class VcfZarrSchema:
1313
1381
  f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
1314
1382
  )
1315
1383
 
1384
+ def spec_from_field(field, variable_name=None):
1385
+ return ZarrColumnSpec.from_field(
1386
+ field,
1387
+ num_samples=n,
1388
+ num_variants=m,
1389
+ samples_chunk_size=samples_chunk_size,
1390
+ variants_chunk_size=variants_chunk_size,
1391
+ variable_name=variable_name,
1392
+ )
1393
+
1316
1394
  def fixed_field_spec(
1317
1395
  name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
1318
1396
  ):
@@ -1328,95 +1406,56 @@ class VcfZarrSchema:
1328
1406
 
1329
1407
  alt_col = icf.columns["ALT"]
1330
1408
  max_alleles = alt_col.vcf_field.summary.max_number + 1
1331
- num_filters = len(icf.metadata.filters)
1332
1409
 
1333
- # # FIXME get dtype from lookup table
1334
1410
  colspecs = [
1335
1411
  fixed_field_spec(
1336
1412
  name="variant_contig",
1337
- dtype="i2", # FIXME
1413
+ dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
1338
1414
  ),
1339
1415
  fixed_field_spec(
1340
1416
  name="variant_filter",
1341
1417
  dtype="bool",
1342
- shape=(m, num_filters),
1418
+ shape=(m, icf.metadata.num_filters),
1343
1419
  dimensions=["variants", "filters"],
1344
1420
  ),
1345
1421
  fixed_field_spec(
1346
1422
  name="variant_allele",
1347
1423
  dtype="str",
1348
- shape=[m, max_alleles],
1424
+ shape=(m, max_alleles),
1349
1425
  dimensions=["variants", "alleles"],
1350
1426
  ),
1351
1427
  fixed_field_spec(
1352
- vcf_field="POS",
1353
- name="variant_position",
1354
- dtype="i4",
1355
- ),
1356
- fixed_field_spec(
1357
- vcf_field=None,
1358
1428
  name="variant_id",
1359
1429
  dtype="str",
1360
1430
  ),
1361
1431
  fixed_field_spec(
1362
- vcf_field=None,
1363
1432
  name="variant_id_mask",
1364
1433
  dtype="bool",
1365
1434
  ),
1366
- fixed_field_spec(
1367
- vcf_field="QUAL",
1368
- name="variant_quality",
1369
- dtype="f4",
1370
- ),
1371
1435
  ]
1436
+ name_map = {field.full_name: field for field in icf.metadata.fields}
1437
+
1438
+ # Only two of the fixed fields have a direct one-to-one mapping.
1439
+ colspecs.extend(
1440
+ [
1441
+ spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
1442
+ spec_from_field(name_map["POS"], variable_name="variant_position"),
1443
+ ]
1444
+ )
1445
+ colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
1372
1446
 
1373
1447
  gt_field = None
1374
- for field in icf.metadata.fields:
1375
- if field.category == "fixed":
1376
- continue
1448
+ for field in icf.metadata.format_fields:
1377
1449
  if field.name == "GT":
1378
1450
  gt_field = field
1379
1451
  continue
1380
- shape = [m]
1381
- prefix = "variant_"
1382
- dimensions = ["variants"]
1383
- chunks = [variants_chunk_size]
1384
- if field.category == "FORMAT":
1385
- prefix = "call_"
1386
- shape.append(n)
1387
- chunks.append(samples_chunk_size)
1388
- dimensions.append("samples")
1389
- # TODO make an option to add in the empty extra dimension
1390
- if field.summary.max_number > 1:
1391
- shape.append(field.summary.max_number)
1392
- # TODO we should really be checking this to see if the named dimensions
1393
- # are actually correct.
1394
- if field.vcf_number == "R":
1395
- dimensions.append("alleles")
1396
- elif field.vcf_number == "A":
1397
- dimensions.append("alt_alleles")
1398
- elif field.vcf_number == "G":
1399
- dimensions.append("genotypes")
1400
- else:
1401
- dimensions.append(f"{field.category}_{field.name}_dim")
1402
- variable_name = prefix + field.name
1403
- colspec = ZarrColumnSpec.new(
1404
- vcf_field=field.full_name,
1405
- name=variable_name,
1406
- dtype=field.smallest_dtype(),
1407
- shape=shape,
1408
- chunks=chunks,
1409
- dimensions=dimensions,
1410
- description=field.description,
1411
- )
1412
- colspecs.append(colspec)
1452
+ colspecs.append(spec_from_field(field))
1413
1453
 
1414
1454
  if gt_field is not None:
1415
1455
  ploidy = gt_field.summary.max_number - 1
1416
1456
  shape = [m, n]
1417
1457
  chunks = [variants_chunk_size, samples_chunk_size]
1418
1458
  dimensions = ["variants", "samples"]
1419
-
1420
1459
  colspecs.append(
1421
1460
  ZarrColumnSpec.new(
1422
1461
  vcf_field=None,
@@ -1498,15 +1537,6 @@ class VcfZarr:
1498
1537
  return data
1499
1538
 
1500
1539
 
1501
- @dataclasses.dataclass
1502
- class EncodingWork:
1503
- func: callable = dataclasses.field(repr=False)
1504
- start: int
1505
- stop: int
1506
- columns: list[str]
1507
- memory: int = 0
1508
-
1509
-
1510
1540
  def parse_max_memory(max_memory):
1511
1541
  if max_memory is None:
1512
1542
  # Effectively unbounded
@@ -1517,67 +1547,299 @@ def parse_max_memory(max_memory):
1517
1547
  return max_memory
1518
1548
 
1519
1549
 
1550
+ @dataclasses.dataclass
1551
+ class VcfZarrPartition:
1552
+ start_index: int
1553
+ stop_index: int
1554
+ start_chunk: int
1555
+ stop_chunk: int
1556
+
1557
+ @staticmethod
1558
+ def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
1559
+ num_chunks = int(np.ceil(num_records / chunk_size))
1560
+ if max_chunks is not None:
1561
+ num_chunks = min(num_chunks, max_chunks)
1562
+ partitions = []
1563
+ splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
1564
+ for chunk_slice in splits:
1565
+ start_chunk = int(chunk_slice[0])
1566
+ stop_chunk = int(chunk_slice[-1]) + 1
1567
+ start_index = start_chunk * chunk_size
1568
+ stop_index = min(stop_chunk * chunk_size, num_records)
1569
+ partitions.append(
1570
+ VcfZarrPartition(start_index, stop_index, start_chunk, stop_chunk)
1571
+ )
1572
+ return partitions
1573
+
1574
+
1575
+ VZW_METADATA_FORMAT_VERSION = "0.1"
1576
+
1577
+
1578
+ @dataclasses.dataclass
1579
+ class VcfZarrWriterMetadata:
1580
+ format_version: str
1581
+ icf_path: str
1582
+ schema: VcfZarrSchema
1583
+ dimension_separator: str
1584
+ partitions: list
1585
+ provenance: dict
1586
+
1587
+ def asdict(self):
1588
+ return dataclasses.asdict(self)
1589
+
1590
+ @staticmethod
1591
+ def fromdict(d):
1592
+ if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
1593
+ raise ValueError(
1594
+ "VcfZarrWriter format version mismatch: "
1595
+ f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
1596
+ )
1597
+ ret = VcfZarrWriterMetadata(**d)
1598
+ ret.schema = VcfZarrSchema.fromdict(ret.schema)
1599
+ ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
1600
+ return ret
1601
+
1602
+
1520
1603
  class VcfZarrWriter:
1521
- def __init__(self, path, icf, schema, dimension_separator=None):
1604
+ def __init__(self, path):
1522
1605
  self.path = pathlib.Path(path)
1606
+ self.wip_path = self.path / "wip"
1607
+ self.arrays_path = self.wip_path / "arrays"
1608
+ self.partitions_path = self.wip_path / "partitions"
1609
+ self.metadata = None
1610
+ self.icf = None
1611
+
1612
+ @property
1613
+ def schema(self):
1614
+ return self.metadata.schema
1615
+
1616
+ @property
1617
+ def num_partitions(self):
1618
+ return len(self.metadata.partitions)
1619
+
1620
+ #######################
1621
+ # init
1622
+ #######################
1623
+
1624
+ def init(
1625
+ self,
1626
+ icf,
1627
+ *,
1628
+ target_num_partitions,
1629
+ schema,
1630
+ dimension_separator=None,
1631
+ max_variant_chunks=None,
1632
+ ):
1523
1633
  self.icf = icf
1524
- self.schema = schema
1634
+ if self.path.exists():
1635
+ raise ValueError("Zarr path already exists") # NEEDS TEST
1636
+ partitions = VcfZarrPartition.generate_partitions(
1637
+ self.icf.num_records,
1638
+ schema.variants_chunk_size,
1639
+ target_num_partitions,
1640
+ max_chunks=max_variant_chunks,
1641
+ )
1525
1642
  # Default to using nested directories following the Zarr v3 default.
1526
1643
  # This seems to require version 2.17+ to work properly
1527
- self.dimension_separator = (
1644
+ dimension_separator = (
1528
1645
  "/" if dimension_separator is None else dimension_separator
1529
1646
  )
1647
+ self.metadata = VcfZarrWriterMetadata(
1648
+ format_version=VZW_METADATA_FORMAT_VERSION,
1649
+ icf_path=str(self.icf.path),
1650
+ schema=schema,
1651
+ dimension_separator=dimension_separator,
1652
+ partitions=partitions,
1653
+ # Bare minimum here for provenance - see comments above
1654
+ provenance={"source": f"bio2zarr-{provenance.__version__}"},
1655
+ )
1656
+
1657
+ self.path.mkdir()
1530
1658
  store = zarr.DirectoryStore(self.path)
1531
- self.root = zarr.group(store=store)
1659
+ root = zarr.group(store=store)
1660
+ root.attrs.update(
1661
+ {
1662
+ "vcf_zarr_version": "0.2",
1663
+ "vcf_header": self.icf.vcf_header,
1664
+ "source": f"bio2zarr-{provenance.__version__}",
1665
+ }
1666
+ )
1667
+ # Doing this synchronously - this is fine surely
1668
+ self.encode_samples(root)
1669
+ self.encode_filter_id(root)
1670
+ self.encode_contig_id(root)
1671
+
1672
+ self.wip_path.mkdir()
1673
+ self.arrays_path.mkdir()
1674
+ self.partitions_path.mkdir()
1675
+ store = zarr.DirectoryStore(self.arrays_path)
1676
+ root = zarr.group(store=store)
1677
+
1678
+ for column in self.schema.columns.values():
1679
+ self.init_array(root, column, partitions[-1].stop_index)
1680
+
1681
+ logger.info("Writing WIP metadata")
1682
+ with open(self.wip_path / "metadata.json", "w") as f:
1683
+ json.dump(self.metadata.asdict(), f, indent=4)
1684
+ return len(partitions)
1685
+
1686
+ def encode_samples(self, root):
1687
+ if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
1688
+ raise ValueError(
1689
+ "Subsetting or reordering samples not supported currently"
1690
+ ) # NEEDS TEST
1691
+ array = root.array(
1692
+ "sample_id",
1693
+ self.schema.sample_id,
1694
+ dtype="str",
1695
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1696
+ chunks=(self.schema.samples_chunk_size,),
1697
+ )
1698
+ array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
1699
+ logger.debug("Samples done")
1532
1700
 
1533
- def init_array(self, variable):
1701
+ def encode_contig_id(self, root):
1702
+ array = root.array(
1703
+ "contig_id",
1704
+ self.schema.contig_id,
1705
+ dtype="str",
1706
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1707
+ )
1708
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1709
+ if self.schema.contig_length is not None:
1710
+ array = root.array(
1711
+ "contig_length",
1712
+ self.schema.contig_length,
1713
+ dtype=np.int64,
1714
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1715
+ )
1716
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1717
+
1718
+ def encode_filter_id(self, root):
1719
+ array = root.array(
1720
+ "filter_id",
1721
+ self.schema.filter_id,
1722
+ dtype="str",
1723
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1724
+ )
1725
+ array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
1726
+
1727
+ def init_array(self, root, variable, variants_dim_size):
1534
1728
  object_codec = None
1535
1729
  if variable.dtype == "O":
1536
1730
  object_codec = numcodecs.VLenUTF8()
1537
- a = self.root.empty(
1538
- "wip_" + variable.name,
1539
- shape=variable.shape,
1731
+ shape = list(variable.shape)
1732
+ # Truncate the variants dimension if max_variant_chunks was specified
1733
+ shape[0] = variants_dim_size
1734
+ a = root.empty(
1735
+ variable.name,
1736
+ shape=shape,
1540
1737
  chunks=variable.chunks,
1541
1738
  dtype=variable.dtype,
1542
1739
  compressor=numcodecs.get_codec(variable.compressor),
1543
1740
  filters=[numcodecs.get_codec(filt) for filt in variable.filters],
1544
1741
  object_codec=object_codec,
1545
- dimension_separator=self.dimension_separator,
1742
+ dimension_separator=self.metadata.dimension_separator,
1743
+ )
1744
+ a.attrs.update(
1745
+ {
1746
+ "description": variable.description,
1747
+ # Dimension names are part of the spec in Zarr v3
1748
+ "_ARRAY_DIMENSIONS": variable.dimensions,
1749
+ }
1546
1750
  )
1547
- # Dimension names are part of the spec in Zarr v3
1548
- a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
1751
+ logger.debug(f"Initialised {a}")
1549
1752
 
1550
- def get_array(self, name):
1551
- return self.root["wip_" + name]
1753
+ #######################
1754
+ # encode_partition
1755
+ #######################
1552
1756
 
1553
- def finalise_array(self, variable_name):
1554
- source = self.path / ("wip_" + variable_name)
1555
- dest = self.path / variable_name
1757
+ def load_metadata(self):
1758
+ if self.metadata is None:
1759
+ with open(self.wip_path / "metadata.json") as f:
1760
+ self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
1761
+ self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
1762
+
1763
+ def partition_path(self, partition_index):
1764
+ return self.partitions_path / f"p{partition_index}"
1765
+
1766
+ def wip_partition_array_path(self, partition_index, name):
1767
+ return self.partition_path(partition_index) / f"wip_{name}"
1768
+
1769
+ def partition_array_path(self, partition_index, name):
1770
+ return self.partition_path(partition_index) / name
1771
+
1772
+ def encode_partition(self, partition_index):
1773
+ self.load_metadata()
1774
+ partition_path = self.partition_path(partition_index)
1775
+ partition_path.mkdir(exist_ok=True)
1776
+ logger.info(f"Encoding partition {partition_index} to {partition_path}")
1777
+
1778
+ self.encode_alleles_partition(partition_index)
1779
+ self.encode_id_partition(partition_index)
1780
+ self.encode_filters_partition(partition_index)
1781
+ self.encode_contig_partition(partition_index)
1782
+ for col in self.schema.columns.values():
1783
+ if col.vcf_field is not None:
1784
+ self.encode_array_partition(col, partition_index)
1785
+ if "call_genotype" in self.schema.columns:
1786
+ self.encode_genotypes_partition(partition_index)
1787
+
1788
+ def init_partition_array(self, partition_index, name):
1789
+ wip_path = self.wip_partition_array_path(partition_index, name)
1790
+ # Create an empty array like the definition
1791
+ src = self.arrays_path / name
1792
+ # Overwrite any existing WIP files
1793
+ shutil.copytree(src, wip_path, dirs_exist_ok=True)
1794
+ array = zarr.open(wip_path)
1795
+ logger.debug(f"Opened empty array {array} @ {wip_path}")
1796
+ return array
1797
+
1798
+ def finalise_partition_array(self, partition_index, name):
1799
+ wip_path = self.wip_partition_array_path(partition_index, name)
1800
+ final_path = self.partition_array_path(partition_index, name)
1801
+ if final_path.exists():
1802
+ # NEEDS TEST
1803
+ logger.warning(f"Removing existing {final_path}")
1804
+ shutil.rmtree(final_path)
1556
1805
  # Atomic swap
1557
- os.rename(source, dest)
1558
- logger.info(f"Finalised {variable_name}")
1806
+ os.rename(wip_path, final_path)
1807
+ logger.debug(f"Encoded {name} partition {partition_index}")
1808
+
1809
+ def encode_array_partition(self, column, partition_index):
1810
+ array = self.init_partition_array(partition_index, column.name)
1559
1811
 
1560
- def encode_array_slice(self, column, start, stop):
1812
+ partition = self.metadata.partitions[partition_index]
1813
+ ba = core.BufferedArray(array, partition.start_index)
1561
1814
  source_col = self.icf.columns[column.vcf_field]
1562
- array = self.get_array(column.name)
1563
- ba = core.BufferedArray(array, start)
1564
1815
  sanitiser = source_col.sanitiser_factory(ba.buff.shape)
1565
1816
 
1566
- for value in source_col.iter_values(start, stop):
1817
+ for value in source_col.iter_values(
1818
+ partition.start_index, partition.stop_index
1819
+ ):
1567
1820
  # We write directly into the buffer in the sanitiser function
1568
1821
  # to make it easier to reason about dimension padding
1569
1822
  j = ba.next_buffer_row()
1570
1823
  sanitiser(ba.buff, j, value)
1571
1824
  ba.flush()
1572
- logger.debug(f"Encoded {column.name} slice {start}:{stop}")
1825
+ self.finalise_partition_array(partition_index, column.name)
1573
1826
 
1574
- def encode_genotypes_slice(self, start, stop):
1575
- source_col = self.icf.columns["FORMAT/GT"]
1576
- gt = core.BufferedArray(self.get_array("call_genotype"), start)
1577
- gt_mask = core.BufferedArray(self.get_array("call_genotype_mask"), start)
1578
- gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)
1827
+ def encode_genotypes_partition(self, partition_index):
1828
+ gt_array = self.init_partition_array(partition_index, "call_genotype")
1829
+ gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
1830
+ gt_phased_array = self.init_partition_array(
1831
+ partition_index, "call_genotype_phased"
1832
+ )
1833
+
1834
+ partition = self.metadata.partitions[partition_index]
1835
+ gt = core.BufferedArray(gt_array, partition.start_index)
1836
+ gt_mask = core.BufferedArray(gt_mask_array, partition.start_index)
1837
+ gt_phased = core.BufferedArray(gt_phased_array, partition.start_index)
1579
1838
 
1580
- for value in source_col.iter_values(start, stop):
1839
+ source_col = self.icf.columns["FORMAT/GT"]
1840
+ for value in source_col.iter_values(
1841
+ partition.start_index, partition.stop_index
1842
+ ):
1581
1843
  j = gt.next_buffer_row()
1582
1844
  sanitise_value_int_2d(gt.buff, j, value[:, :-1])
1583
1845
  j = gt_phased.next_buffer_row()
@@ -1589,29 +1851,40 @@ class VcfZarrWriter:
1589
1851
  gt.flush()
1590
1852
  gt_phased.flush()
1591
1853
  gt_mask.flush()
1592
- logger.debug(f"Encoded GT slice {start}:{stop}")
1593
1854
 
1594
- def encode_alleles_slice(self, start, stop):
1855
+ self.finalise_partition_array(partition_index, "call_genotype")
1856
+ self.finalise_partition_array(partition_index, "call_genotype_mask")
1857
+ self.finalise_partition_array(partition_index, "call_genotype_phased")
1858
+
1859
+ def encode_alleles_partition(self, partition_index):
1860
+ array_name = "variant_allele"
1861
+ alleles_array = self.init_partition_array(partition_index, array_name)
1862
+ partition = self.metadata.partitions[partition_index]
1863
+ alleles = core.BufferedArray(alleles_array, partition.start_index)
1595
1864
  ref_col = self.icf.columns["REF"]
1596
1865
  alt_col = self.icf.columns["ALT"]
1597
- alleles = core.BufferedArray(self.get_array("variant_allele"), start)
1598
1866
 
1599
1867
  for ref, alt in zip(
1600
- ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
1868
+ ref_col.iter_values(partition.start_index, partition.stop_index),
1869
+ alt_col.iter_values(partition.start_index, partition.stop_index),
1601
1870
  ):
1602
1871
  j = alleles.next_buffer_row()
1603
1872
  alleles.buff[j, :] = STR_FILL
1604
1873
  alleles.buff[j, 0] = ref[0]
1605
1874
  alleles.buff[j, 1 : 1 + len(alt)] = alt
1606
1875
  alleles.flush()
1607
- logger.debug(f"Encoded alleles slice {start}:{stop}")
1608
1876
 
1609
- def encode_id_slice(self, start, stop):
1877
+ self.finalise_partition_array(partition_index, array_name)
1878
+
1879
+ def encode_id_partition(self, partition_index):
1880
+ vid_array = self.init_partition_array(partition_index, "variant_id")
1881
+ vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
1882
+ partition = self.metadata.partitions[partition_index]
1883
+ vid = core.BufferedArray(vid_array, partition.start_index)
1884
+ vid_mask = core.BufferedArray(vid_mask_array, partition.start_index)
1610
1885
  col = self.icf.columns["ID"]
1611
- vid = core.BufferedArray(self.get_array("variant_id"), start)
1612
- vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)
1613
1886
 
1614
- for value in col.iter_values(start, stop):
1887
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1615
1888
  j = vid.next_buffer_row()
1616
1889
  k = vid_mask.next_buffer_row()
1617
1890
  assert j == k
@@ -1623,13 +1896,19 @@ class VcfZarrWriter:
1623
1896
  vid_mask.buff[j] = True
1624
1897
  vid.flush()
1625
1898
  vid_mask.flush()
1626
- logger.debug(f"Encoded ID slice {start}:{stop}")
1627
1899
 
1628
- def encode_filters_slice(self, lookup, start, stop):
1629
- col = self.icf.columns["FILTERS"]
1630
- var_filter = core.BufferedArray(self.get_array("variant_filter"), start)
1900
+ self.finalise_partition_array(partition_index, "variant_id")
1901
+ self.finalise_partition_array(partition_index, "variant_id_mask")
1631
1902
 
1632
- for value in col.iter_values(start, stop):
1903
+ def encode_filters_partition(self, partition_index):
1904
+ lookup = {filt: index for index, filt in enumerate(self.schema.filter_id)}
1905
+ array_name = "variant_filter"
1906
+ array = self.init_partition_array(partition_index, array_name)
1907
+ partition = self.metadata.partitions[partition_index]
1908
+ var_filter = core.BufferedArray(array, partition.start_index)
1909
+
1910
+ col = self.icf.columns["FILTERS"]
1911
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1633
1912
  j = var_filter.next_buffer_row()
1634
1913
  var_filter.buff[j] = False
1635
1914
  for f in value:
@@ -1637,16 +1916,21 @@ class VcfZarrWriter:
1637
1916
  var_filter.buff[j, lookup[f]] = True
1638
1917
  except KeyError:
1639
1918
  raise ValueError(
1640
- f"Filter '{f}' was not defined " f"in the header."
1919
+ f"Filter '{f}' was not defined in the header."
1641
1920
  ) from None
1642
1921
  var_filter.flush()
1643
- logger.debug(f"Encoded FILTERS slice {start}:{stop}")
1644
1922
 
1645
- def encode_contig_slice(self, lookup, start, stop):
1923
+ self.finalise_partition_array(partition_index, array_name)
1924
+
1925
+ def encode_contig_partition(self, partition_index):
1926
+ lookup = {contig: index for index, contig in enumerate(self.schema.contig_id)}
1927
+ array_name = "variant_contig"
1928
+ array = self.init_partition_array(partition_index, array_name)
1929
+ partition = self.metadata.partitions[partition_index]
1930
+ contig = core.BufferedArray(array, partition.start_index)
1646
1931
  col = self.icf.columns["CHROM"]
1647
- contig = core.BufferedArray(self.get_array("variant_contig"), start)
1648
1932
 
1649
- for value in col.iter_values(start, stop):
1933
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1650
1934
  j = contig.next_buffer_row()
1651
1935
  # Note: because we are using the indexes to define the lookups
1652
1936
  # and we always have an index, it seems that we the contig lookup
@@ -1654,161 +1938,120 @@ class VcfZarrWriter:
1654
1938
  # here, please do open an issue with a reproducible example!
1655
1939
  contig.buff[j] = lookup[value[0]]
1656
1940
  contig.flush()
1657
- logger.debug(f"Encoded CHROM slice {start}:{stop}")
1658
-
1659
- def encode_samples(self):
1660
- if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
1661
- raise ValueError(
1662
- "Subsetting or reordering samples not supported currently"
1663
- ) # NEEDS TEST
1664
- array = self.root.array(
1665
- "sample_id",
1666
- self.schema.sample_id,
1667
- dtype="str",
1668
- compressor=DEFAULT_ZARR_COMPRESSOR,
1669
- chunks=(self.schema.samples_chunk_size,),
1670
- )
1671
- array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
1672
- logger.debug("Samples done")
1673
1941
 
1674
- def encode_contig_id(self):
1675
- array = self.root.array(
1676
- "contig_id",
1677
- self.schema.contig_id,
1678
- dtype="str",
1679
- compressor=DEFAULT_ZARR_COMPRESSOR,
1680
- )
1681
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1682
- if self.schema.contig_length is not None:
1683
- array = self.root.array(
1684
- "contig_length",
1685
- self.schema.contig_length,
1686
- dtype=np.int64,
1687
- compressor=DEFAULT_ZARR_COMPRESSOR,
1942
+ self.finalise_partition_array(partition_index, array_name)
1943
+
1944
+ #######################
1945
+ # finalise
1946
+ #######################
1947
+
1948
+ def finalise_array(self, name):
1949
+ logger.info(f"Finalising {name}")
1950
+ final_path = self.path / name
1951
+ if final_path.exists():
1952
+ # NEEDS TEST
1953
+ raise ValueError(f"Array {name} already exists")
1954
+ for partition in range(len(self.metadata.partitions)):
1955
+ # Move all the files in partition dir to dest dir
1956
+ src = self.partition_array_path(partition, name)
1957
+ if not src.exists():
1958
+ # Needs test
1959
+ raise ValueError(f"Partition {partition} of {name} does not exist")
1960
+ dest = self.arrays_path / name
1961
+ # This is Zarr v2 specific. Chunks in v3 will start with a "c" prefix.
1962
+ chunk_files = [
1963
+ path for path in src.iterdir() if not path.name.startswith(".")
1964
+ ]
1965
+ # TODO check for a count of the number of files. If we require a
1966
+ # dimension_separator of "/" then we could make stronger assertions
1967
+ # here, as we'd always have num_variant_chunks
1968
+ logger.debug(
1969
+ f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
1688
1970
  )
1689
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1690
- return {v: j for j, v in enumerate(self.schema.contig_id)}
1971
+ for chunk_file in chunk_files:
1972
+ os.rename(chunk_file, dest / chunk_file.name)
1973
+ # Finally, once all the chunks have moved into the arrays dir,
1974
+ # we move it out of wip
1975
+ os.rename(self.arrays_path / name, self.path / name)
1976
+ core.update_progress(1)
1691
1977
 
1692
- def encode_filter_id(self):
1693
- array = self.root.array(
1694
- "filter_id",
1695
- self.schema.filter_id,
1696
- dtype="str",
1697
- compressor=DEFAULT_ZARR_COMPRESSOR,
1978
+ def finalise(self, show_progress=False):
1979
+ self.load_metadata()
1980
+
1981
+ progress_config = core.ProgressConfig(
1982
+ total=len(self.schema.columns),
1983
+ title="Finalise",
1984
+ units="array",
1985
+ show=show_progress,
1698
1986
  )
1699
- array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
1700
- return {v: j for j, v in enumerate(self.schema.filter_id)}
1987
+ # NOTE: it's not clear that adding more workers will make this quicker,
1988
+ # as it's just going to be causing contention on the file system.
1989
+ # Something to check empirically in some deployments.
1990
+ # FIXME we're just using worker_processes=0 here to hook into the
1991
+ # SynchronousExecutor which is intended for testing purposes so
1992
+ # that we get test coverage. Should fix this either by allowing
1993
+ # for multiple workers, or making a standard wrapper for tqdm
1994
+ # that allows us to have a consistent look and feel.
1995
+ with core.ParallelWorkManager(0, progress_config) as pwm:
1996
+ for name in self.schema.columns:
1997
+ pwm.submit(self.finalise_array, name)
1998
+ zarr.consolidate_metadata(self.path)
1701
1999
 
1702
- def init(self):
1703
- self.root.attrs["vcf_zarr_version"] = "0.2"
1704
- self.root.attrs["vcf_header"] = self.icf.vcf_header
1705
- self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
1706
- for column in self.schema.columns.values():
1707
- self.init_array(column)
2000
+ ######################
2001
+ # encode_all_partitions
2002
+ ######################
1708
2003
 
1709
- def finalise(self):
1710
- zarr.consolidate_metadata(self.path)
2004
+ def get_max_encoding_memory(self):
2005
+ """
2006
+ Return the approximate maximum memory used to encode a variant chunk.
2007
+ """
2008
+ max_encoding_mem = max(
2009
+ col.variant_chunk_nbytes for col in self.schema.columns.values()
2010
+ )
2011
+ gt_mem = 0
2012
+ if "call_genotype" in self.schema.columns:
2013
+ encoded_together = [
2014
+ "call_genotype",
2015
+ "call_genotype_phased",
2016
+ "call_genotype_mask",
2017
+ ]
2018
+ gt_mem = sum(
2019
+ self.schema.columns[col].variant_chunk_nbytes
2020
+ for col in encoded_together
2021
+ )
2022
+ return max(max_encoding_mem, gt_mem)
1711
2023
 
1712
- def encode(
1713
- self,
1714
- worker_processes=1,
1715
- max_v_chunks=None,
1716
- show_progress=False,
1717
- max_memory=None,
2024
+ def encode_all_partitions(
2025
+ self, *, worker_processes=1, show_progress=False, max_memory=None
1718
2026
  ):
1719
2027
  max_memory = parse_max_memory(max_memory)
1720
-
1721
- # TODO this will move into the setup logic later when we're making it possible
1722
- # to split the work by slice
1723
- num_slices = max(1, worker_processes * 4)
1724
- # Using POS arbitrarily to get the array slices
1725
- slices = core.chunk_aligned_slices(
1726
- self.get_array("variant_position"), num_slices, max_chunks=max_v_chunks
2028
+ self.load_metadata()
2029
+ num_partitions = self.num_partitions
2030
+ per_worker_memory = self.get_max_encoding_memory()
2031
+ logger.info(
2032
+ f"Encoding Zarr over {num_partitions} partitions with "
2033
+ f"{worker_processes} workers and {display_size(per_worker_memory)} "
2034
+ "per worker"
1727
2035
  )
1728
- truncated = slices[-1][-1]
1729
- for array in self.root.values():
1730
- if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
1731
- shape = list(array.shape)
1732
- shape[0] = truncated
1733
- array.resize(shape)
1734
-
1735
- total_bytes = 0
1736
- encoding_memory_requirements = {}
1737
- for col in self.schema.columns.values():
1738
- array = self.get_array(col.name)
1739
- # NOTE!! this is bad, we're potentially creating quite a large
1740
- # numpy array for basically nothing. We can compute this.
1741
- variant_chunk_size = array.blocks[0].nbytes
1742
- encoding_memory_requirements[col.name] = variant_chunk_size
1743
- logger.debug(
1744
- f"{col.name} requires at least {display_size(variant_chunk_size)} "
1745
- f"per worker"
2036
+ # Each partition requires per_worker_memory bytes, so to prevent more than
2037
+ # max_memory being used, we clamp the number of workers
2038
+ max_num_workers = max_memory // per_worker_memory
2039
+ if max_num_workers < worker_processes:
2040
+ logger.warning(
2041
+ f"Limiting number of workers to {max_num_workers} to "
2042
+ f"keep within specified memory budget of {display_size(max_memory)}"
1746
2043
  )
1747
- total_bytes += array.nbytes
1748
-
1749
- filter_id_map = self.encode_filter_id()
1750
- contig_id_map = self.encode_contig_id()
1751
-
1752
- work = []
1753
- for start, stop in slices:
1754
- for col in self.schema.columns.values():
1755
- if col.vcf_field is not None:
1756
- f = functools.partial(self.encode_array_slice, col)
1757
- work.append(
1758
- EncodingWork(
1759
- f,
1760
- start,
1761
- stop,
1762
- [col.name],
1763
- encoding_memory_requirements[col.name],
1764
- )
1765
- )
1766
- work.append(
1767
- EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
1768
- )
1769
- work.append(
1770
- EncodingWork(
1771
- self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
1772
- )
1773
- )
1774
- work.append(
1775
- EncodingWork(
1776
- functools.partial(self.encode_filters_slice, filter_id_map),
1777
- start,
1778
- stop,
1779
- ["variant_filter"],
1780
- )
1781
- )
1782
- work.append(
1783
- EncodingWork(
1784
- functools.partial(self.encode_contig_slice, contig_id_map),
1785
- start,
1786
- stop,
1787
- ["variant_contig"],
1788
- )
2044
+ if max_num_workers <= 0:
2045
+ raise ValueError(
2046
+ f"Insufficient memory to encode a partition:"
2047
+ f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
1789
2048
  )
1790
- if "call_genotype" in self.schema.columns:
1791
- variables = [
1792
- "call_genotype",
1793
- "call_genotype_phased",
1794
- "call_genotype_mask",
1795
- ]
1796
- gt_memory = sum(
1797
- encoding_memory_requirements[name] for name in variables
1798
- )
1799
- work.append(
1800
- EncodingWork(
1801
- self.encode_genotypes_slice, start, stop, variables, gt_memory
1802
- )
1803
- )
2049
+ num_workers = min(max_num_workers, worker_processes)
1804
2050
 
1805
- # Fail early if we can't fit a particular column into memory
1806
- for wp in work:
1807
- if wp.memory > max_memory:
1808
- raise ValueError(
1809
- f"Insufficient memory for {wp.columns}: "
1810
- f"{display_size(wp.memory)} > {display_size(max_memory)}"
1811
- )
2051
+ total_bytes = 0
2052
+ for col in self.schema.columns.values():
2053
+ # Open the array definition to get the total size
2054
+ total_bytes += zarr.open(self.arrays_path / col.name).nbytes
1812
2055
 
1813
2056
  progress_config = core.ProgressConfig(
1814
2057
  total=total_bytes,
@@ -1816,54 +2059,9 @@ class VcfZarrWriter:
1816
2059
  units="B",
1817
2060
  show=show_progress,
1818
2061
  )
1819
-
1820
- used_memory = 0
1821
- # We need to keep some bounds on the queue size or the memory bounds algorithm
1822
- # below doesn't really work.
1823
- max_queued = 4 * max(1, worker_processes)
1824
- encoded_slices = collections.Counter()
1825
-
1826
- with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1827
- future = pwm.submit(self.encode_samples)
1828
- future_to_work = {future: EncodingWork(None, 0, 0, [])}
1829
-
1830
- def service_completed_futures():
1831
- nonlocal used_memory
1832
-
1833
- completed = pwm.wait_for_completed()
1834
- for future in completed:
1835
- wp_done = future_to_work.pop(future)
1836
- used_memory -= wp_done.memory
1837
- logger.debug(
1838
- f"Complete {wp_done}: used mem={display_size(used_memory)}"
1839
- )
1840
- for column in wp_done.columns:
1841
- encoded_slices[column] += 1
1842
- if encoded_slices[column] == len(slices):
1843
- # Do this syncronously for simplicity. Should be
1844
- # fine as the workers will probably be busy with
1845
- # large encode tasks most of the time.
1846
- self.finalise_array(column)
1847
-
1848
- for wp in work:
1849
- while (
1850
- used_memory + wp.memory > max_memory
1851
- or len(future_to_work) > max_queued
1852
- ):
1853
- logger.debug(
1854
- f"Wait: mem_required={used_memory + wp.memory} "
1855
- f"max_mem={max_memory} queued={len(future_to_work)} "
1856
- f"max_queued={max_queued}"
1857
- )
1858
- service_completed_futures()
1859
- future = pwm.submit(wp.func, wp.start, wp.stop)
1860
- used_memory += wp.memory
1861
- logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
1862
- future_to_work[future] = wp
1863
-
1864
- logger.debug("All work submitted")
1865
- while len(future_to_work) > 0:
1866
- service_completed_futures()
2062
+ with core.ParallelWorkManager(num_workers, progress_config) as pwm:
2063
+ for partition_index in range(num_partitions):
2064
+ pwm.submit(self.encode_partition, partition_index)
1867
2065
 
1868
2066
 
1869
2067
  def mkschema(if_path, out):
@@ -1878,13 +2076,48 @@ def encode(
1878
2076
  schema_path=None,
1879
2077
  variants_chunk_size=None,
1880
2078
  samples_chunk_size=None,
1881
- max_v_chunks=None,
2079
+ max_variant_chunks=None,
1882
2080
  dimension_separator=None,
1883
2081
  max_memory=None,
1884
2082
  worker_processes=1,
1885
2083
  show_progress=False,
1886
2084
  ):
1887
- icf = IntermediateColumnarFormat(if_path)
2085
+ # Rough heuristic to split work up enough to keep utilisation high
2086
+ target_num_partitions = max(1, worker_processes * 4)
2087
+ encode_init(
2088
+ if_path,
2089
+ zarr_path,
2090
+ target_num_partitions,
2091
+ schema_path=schema_path,
2092
+ variants_chunk_size=variants_chunk_size,
2093
+ samples_chunk_size=samples_chunk_size,
2094
+ max_variant_chunks=max_variant_chunks,
2095
+ dimension_separator=dimension_separator,
2096
+ )
2097
+ vzw = VcfZarrWriter(zarr_path)
2098
+ vzw.encode_all_partitions(
2099
+ worker_processes=worker_processes,
2100
+ show_progress=show_progress,
2101
+ max_memory=max_memory,
2102
+ )
2103
+ vzw.finalise(show_progress)
2104
+
2105
+
2106
+ def encode_init(
2107
+ icf_path,
2108
+ zarr_path,
2109
+ target_num_partitions,
2110
+ *,
2111
+ schema_path=None,
2112
+ variants_chunk_size=None,
2113
+ samples_chunk_size=None,
2114
+ max_variant_chunks=None,
2115
+ dimension_separator=None,
2116
+ max_memory=None,
2117
+ worker_processes=1,
2118
+ show_progress=False,
2119
+ ):
2120
+ icf = IntermediateColumnarFormat(icf_path)
1888
2121
  if schema_path is None:
1889
2122
  schema = VcfZarrSchema.generate(
1890
2123
  icf,
@@ -1900,18 +2133,25 @@ def encode(
1900
2133
  with open(schema_path) as f:
1901
2134
  schema = VcfZarrSchema.fromjson(f.read())
1902
2135
  zarr_path = pathlib.Path(zarr_path)
1903
- if zarr_path.exists():
1904
- logger.warning(f"Deleting existing {zarr_path}")
1905
- shutil.rmtree(zarr_path)
1906
- vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
1907
- vzw.init()
1908
- vzw.encode(
1909
- max_v_chunks=max_v_chunks,
1910
- worker_processes=worker_processes,
1911
- max_memory=max_memory,
1912
- show_progress=show_progress,
2136
+ vzw = VcfZarrWriter(zarr_path)
2137
+ vzw.init(
2138
+ icf,
2139
+ target_num_partitions=target_num_partitions,
2140
+ schema=schema,
2141
+ dimension_separator=dimension_separator,
2142
+ max_variant_chunks=max_variant_chunks,
1913
2143
  )
1914
- vzw.finalise()
2144
+ return vzw.num_partitions, vzw.get_max_encoding_memory()
2145
+
2146
+
2147
+ def encode_partition(zarr_path, partition):
2148
+ writer = VcfZarrWriter(zarr_path)
2149
+ writer.encode_partition(partition)
2150
+
2151
+
2152
+ def encode_finalise(zarr_path, show_progress=False):
2153
+ writer = VcfZarrWriter(zarr_path)
2154
+ writer.finalise(show_progress=show_progress)
1915
2155
 
1916
2156
 
1917
2157
  def convert(
@@ -2121,7 +2361,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
2121
2361
  assert pos[start_index] == first_pos
2122
2362
  vcf = cyvcf2.VCF(vcf_path)
2123
2363
  if show_progress:
2124
- iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2364
+ iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2125
2365
  else:
2126
2366
  iterator = vcf
2127
2367
  for j, row in enumerate(iterator, start_index):