bio2zarr 0.0.4-py3-none-any.whl → 0.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

bio2zarr/__init__.py CHANGED
@@ -1 +1 @@
- from .provenance import __version__
+ from .provenance import __version__ # noqa F401
bio2zarr/__main__.py CHANGED
@@ -2,11 +2,13 @@ import click
 
  from . import cli
 
+
  @cli.version
  @click.group()
  def bio2zarr():
  pass
 
+
  # Provide a single top-level interface to all of the functionality.
  # This probably isn't the recommended way of interacting, as we
  # install individual commands as console scripts. However, this
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.0.4'
- __version_tuple__ = version_tuple = (0, 0, 4)
+ __version__ = version = '0.0.5'
+ __version_tuple__ = version_tuple = (0, 0, 5)
bio2zarr/cli.py CHANGED
@@ -4,15 +4,11 @@ import pathlib
  import shutil
 
  import click
- import tabulate
  import coloredlogs
  import numcodecs
+ import tabulate
 
- from . import vcf
- from . import vcf_utils
- from . import plink
- from . import provenance
-
+ from . import plink, provenance, vcf, vcf_utils
 
  logger = logging.getLogger(__name__)
 
@@ -75,7 +71,7 @@ compressor = click.option(
  "--compressor",
  type=click.Choice(["lz4", "zstd"]),
  default=None,
- help="Codec to use for compressing column chunks (Default=zstd)."
+ help="Codec to use for compressing column chunks (Default=zstd).",
  )
 
  # Note: -l and -w were chosen when these were called "width" and "length".
@@ -207,7 +203,7 @@ def dexplode_partition(icf_path, partition, verbose):
  from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
  """
  setup_logging(verbose)
- vcf.explode_partition(icf_path, partition, show_progress=True)
+ vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
  @click.command
bio2zarr/core.py CHANGED
@@ -1,16 +1,15 @@
- import dataclasses
- import contextlib
  import concurrent.futures as cf
+ import contextlib
+ import dataclasses
+ import logging
  import multiprocessing
  import threading
- import logging
  import time
 
- import zarr
+ import numcodecs
  import numpy as np
  import tqdm
- import numcodecs
-
+ import zarr
 
  logger = logging.getLogger(__name__)
 
bio2zarr/plink.py CHANGED
@@ -1,14 +1,13 @@
  import logging
 
+ import bed_reader
  import humanfriendly
+ import numcodecs
  import numpy as np
  import zarr
- import bed_reader
- import numcodecs
 
  from . import core
 
-
  logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
  gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
  gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
  variants_chunk_size = gt.array.chunks[0]
- n = gt.array.shape[1]
  assert start % variants_chunk_size == 0
 
  logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
  chunks=(samples_chunk_size,),
  )
  a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
- logger.debug(f"Encoded samples")
+ logger.debug("Encoded samples")
 
  # TODO encode these in slices - but read them in one go to avoid
  # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
  chunks=(variants_chunk_size,),
  )
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
- logger.debug(f"encoded variant_position")
+ logger.debug("encoded variant_position")
 
  alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
  a = root.array(
@@ -119,7 +117,7 @@ def convert(
  chunks=(variants_chunk_size,),
  )
  a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
- logger.debug(f"encoded variant_allele")
+ logger.debug("encoded variant_allele")
 
  # TODO remove this?
  a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
  elif bed_call == 2:
  assert list(zarr_call) == [1, 1]
  else: # pragma no cover
- assert False
+ raise AssertionError(f"Unexpected bed call {bed_call}")
bio2zarr/typing.py CHANGED
@@ -1,4 +1,4 @@
  from pathlib import Path
  from typing import Union
 
- PathType = Union[str, Path]
+ PathType = Union[str, Path]
bio2zarr/vcf.py CHANGED
@@ -1,29 +1,27 @@
  import collections
+ import contextlib
  import dataclasses
  import functools
+ import json
  import logging
+ import math
  import os
  import pathlib
  import pickle
- import sys
  import shutil
- import json
- import math
+ import sys
  import tempfile
- import contextlib
  from typing import Any, List
 
- import humanfriendly
  import cyvcf2
+ import humanfriendly
  import numcodecs
  import numpy as np
  import numpy.testing as nt
  import tqdm
  import zarr
 
- from . import core
- from . import provenance
- from . import vcf_utils
+ from . import core, provenance, vcf_utils
 
  logger = logging.getLogger(__name__)
 
@@ -284,9 +282,25 @@ def scan_vcf(path, target_num_partitions):
  return metadata, vcf.raw_header
 
 
+ def check_overlap(partitions):
+ for i in range(1, len(partitions)):
+ prev_partition = partitions[i - 1]
+ current_partition = partitions[i]
+ if (
+ prev_partition.region.contig == current_partition.region.contig
+ and prev_partition.region.end > current_partition.region.start
+ ):
+ raise ValueError(
+ f"Multiple VCFs have the region "
+ f"{prev_partition.region.contig}:{prev_partition.region.start}-"
+ f"{current_partition.region.end}"
+ )
+
+
  def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
  logger.info(
- f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+ f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
+ f" partitions."
  )
  # An easy mistake to make is to pass the same file twice. Check this early on.
  for path, count in collections.Counter(paths).items():
@@ -331,6 +345,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
  all_partitions.sort(
  key=lambda x: (contig_index_map[x.region.contig], x.region.start)
  )
+ check_overlap(all_partitions)
  icf_metadata.partitions = all_partitions
  logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
  return icf_metadata, header
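The new `check_overlap` helper relies on `scan_vcfs` having sorted the partitions by contig and start position: if two adjacent partitions share a contig and the earlier one ends after the later one starts, two input VCFs cover the same region and the scan fails. A minimal sketch of that check, with simplified stand-in dataclasses rather than the package's internal region and partition types:

```python
import dataclasses


@dataclasses.dataclass
class Region:
    contig: str
    start: int
    end: int


@dataclasses.dataclass
class Partition:
    region: Region


def check_overlap(partitions):
    # Assumes partitions are already sorted by (contig, start).
    for prev, curr in zip(partitions, partitions[1:]):
        if (
            prev.region.contig == curr.region.contig
            and prev.region.end > curr.region.start
        ):
            raise ValueError(
                f"Multiple VCFs have the region "
                f"{prev.region.contig}:{prev.region.start}-{curr.region.end}"
            )


# Two partitions overlapping on chr1 are rejected.
try:
    check_overlap(
        [Partition(Region("chr1", 1, 2000)), Partition(Region("chr1", 1500, 3000))]
    )
except ValueError as e:
    print(e)  # Multiple VCFs have the region chr1:1-3000
```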
@@ -791,6 +806,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
  for vcf_field in icf_metadata.fields:
  field_path = get_vcf_field_path(out_path, vcf_field)
  field_partition_path = field_path / f"p{partition_index}"
+ # Should be robust to running explode_partition twice.
+ field_partition_path.mkdir(exist_ok=True)
  transformer = VcfValueTransformer.factory(vcf_field, num_samples)
  self.field_writers[vcf_field.full_name] = IcfFieldWriter(
  vcf_field,
@@ -832,7 +849,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  partition.num_records for partition in self.metadata.partitions
  ]
  # Allow us to find which partition a given record is in
- self.partition_record_index = np.cumsum([0] + partition_num_records)
+ self.partition_record_index = np.cumsum([0, *partition_num_records])
  for field in self.metadata.fields:
  self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
  logger.info(
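The `partition_record_index` built above is a cumulative sum of per-partition record counts, so entry `i` is the global index of partition `i`'s first record (the `[0, *counts]` spelling is behaviour-identical to the old `[0] + counts`). This diff does not show how the index is consumed, but a right-sided binary search is the natural lookup; the sketch below is illustrative only, with made-up counts:

```python
import numpy as np

partition_num_records = [100, 250, 175]  # hypothetical per-partition counts
partition_record_index = np.cumsum([0, *partition_num_records])
# -> array([  0, 100, 350, 525]); entry i is the first global record of partition i


def partition_containing(record_index):
    # Illustrative lookup, not taken from bio2zarr itself.
    return int(np.searchsorted(partition_record_index, record_index, side="right")) - 1


print([partition_containing(i) for i in (0, 99, 100, 349, 350, 524)])
# [0, 0, 1, 1, 2, 2]
```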
@@ -842,7 +859,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
  def __repr__(self):
  return (
- f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
+ f"IntermediateColumnarFormat(fields={len(self)}, "
+ f"partitions={self.num_partitions}, "
  f"records={self.num_records}, path={self.path})"
  )
 
@@ -890,15 +908,6 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  return len(self.columns)
 
 
-
- def mkdir_with_progress(path):
- logger.debug(f"mkdir f{path}")
- # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
- # parents=True will take care of it.
- path.mkdir(parents=True)
- core.update_progress(1)
-
-
  class IntermediateColumnarFormatWriter:
  def __init__(self, path):
  self.path = pathlib.Path(path)
@@ -941,45 +950,29 @@ class IntermediateColumnarFormatWriter:
  # dependencies as well.
  self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
- self.mkdirs(worker_processes, show_progress=show_progress)
+ self.mkdirs()
 
  # Note: this is needed for the current version of the vcfzarr spec, but it's
  # probably going to be dropped.
  # https://github.com/pystatgen/vcf-zarr-spec/issues/15
  # May be useful to keep lying around still though?
- logger.info(f"Writing VCF header")
+ logger.info("Writing VCF header")
  with open(self.path / "header.txt", "w") as f:
  f.write(header)
 
- logger.info(f"Writing WIP metadata")
+ logger.info("Writing WIP metadata")
  with open(self.wip_path / "metadata.json", "w") as f:
  json.dump(self.metadata.asdict(), f, indent=4)
  return self.num_partitions
 
- def mkdirs(self, worker_processes=1, show_progress=False):
- num_dirs = len(self.metadata.fields) * self.num_partitions
- logger.info(f"Creating {num_dirs} directories")
+ def mkdirs(self):
+ num_dirs = len(self.metadata.fields)
+ logger.info(f"Creating {num_dirs} field directories")
  self.path.mkdir()
  self.wip_path.mkdir()
- # Due to high latency batch system filesystems, we create all the directories in
- # parallel
- progress_config = core.ProgressConfig(
- total=num_dirs,
- units="dirs",
- title="Mkdirs",
- show=show_progress,
- )
- with core.ParallelWorkManager(
- worker_processes=worker_processes, progress_config=progress_config
- ) as manager:
- for field in self.metadata.fields:
- col_path = get_vcf_field_path(self.path, field)
- # Don't bother trying to count the intermediate directories towards
- # progress
- manager.submit(col_path.mkdir, parents=True)
- for j in range(self.num_partitions):
- part_path = col_path / f"p{j}"
- manager.submit(mkdir_with_progress, part_path)
+ for field in self.metadata.fields:
+ col_path = get_vcf_field_path(self.path, field)
+ col_path.mkdir(parents=True)
 
  def load_partition_summaries(self):
  summaries = []
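Taken together with the `IcfPartitionWriter` change earlier in this file, the directory-creation strategy moves from building every field-by-partition directory up front in parallel to creating only the per-field directories in `mkdirs()`, with each `p{index}` partition directory made on demand (and with `exist_ok=True`, so re-running `dexplode-partition` is harmless). A small sketch of the resulting layout logic, with illustrative paths rather than the exact ICF layout:

```python
import pathlib
import tempfile

icf_path = pathlib.Path(tempfile.mkdtemp())
fields = ["INFO/DP", "FORMAT/GT"]  # illustrative field names

# Done once at init time, one directory per field.
for field in fields:
    (icf_path / field).mkdir(parents=True)


def start_partition(field, partition_index):
    # Done lazily by the partition writer; safe to repeat.
    part_dir = icf_path / field / f"p{partition_index}"
    part_dir.mkdir(exist_ok=True)
    return part_dir


print(start_partition("INFO/DP", 0))
print(start_partition("INFO/DP", 0))  # second call is a no-op, not an error
```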
@@ -995,13 +988,14 @@ class IntermediateColumnarFormatWriter:
  not_found.append(j)
  if len(not_found) > 0:
  raise FileNotFoundError(
- f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+ f"Partition metadata not found for {len(not_found)}"
+ f" partitions: {not_found}"
  )
  return summaries
 
  def load_metadata(self):
  if self.metadata is None:
- with open(self.wip_path / f"metadata.json") as f:
+ with open(self.wip_path / "metadata.json") as f:
  self.metadata = IcfMetadata.fromdict(json.load(f))
 
  def process_partition(self, partition_index):
@@ -1050,12 +1044,14 @@ class IntermediateColumnarFormatWriter:
  for field in format_fields:
  val = variant.format(field.name)
  tcw.append(field.full_name, val)
- # Note: an issue with updating the progress per variant here like this
- # is that we get a significant pause at the end of the counter while
- # all the "small" fields get flushed. Possibly not much to be done about it.
+ # Note: an issue with updating the progress per variant here like
+ # this is that we get a significant pause at the end of the counter
+ # while all the "small" fields get flushed. Possibly not much to be
+ # done about it.
  core.update_progress(1)
  logger.info(
- f"Finished reading VCF for partition {partition_index}, flushing buffers"
+ f"Finished reading VCF for partition {partition_index}, "
+ f"flushing buffers"
  )
 
  partition_metadata = {
@@ -1137,11 +1133,11 @@ class IntermediateColumnarFormatWriter:
  for summary in partition_summaries:
  field.summary.update(summary["field_summaries"][field.full_name])
 
- logger.info(f"Finalising metadata")
+ logger.info("Finalising metadata")
  with open(self.path / "metadata.json", "w") as f:
  json.dump(self.metadata.asdict(), f, indent=4)
 
- logger.debug(f"Removing WIP directory")
+ logger.debug("Removing WIP directory")
  shutil.rmtree(self.wip_path)
 
 
@@ -1155,7 +1151,7 @@ def explode(
  compressor=None,
  ):
  writer = IntermediateColumnarFormatWriter(icf_path)
- num_partitions = writer.init(
+ writer.init(
  vcfs,
  # Heuristic to get reasonable worker utilisation with lumpy partition sizing
  target_num_partitions=max(1, worker_processes * 4),
@@ -1226,20 +1222,25 @@ class ZarrColumnSpec:
  dtype: str
  shape: tuple
  chunks: tuple
- dimensions: list
+ dimensions: tuple
  description: str
  vcf_field: str
- compressor: dict = None
- filters: list = None
- # TODO add filters
+ compressor: dict
+ filters: list
 
  def __post_init__(self):
+ # Ensure these are tuples for ease of comparison and consistency
  self.shape = tuple(self.shape)
  self.chunks = tuple(self.chunks)
  self.dimensions = tuple(self.dimensions)
- self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
- self.filters = []
- self._choose_compressor_settings()
+
+ @staticmethod
+ def new(**kwargs):
+ spec = ZarrColumnSpec(
+ **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
+ )
+ spec._choose_compressor_settings()
+ return spec
 
  def _choose_compressor_settings(self):
  """
@@ -1315,7 +1316,7 @@ class VcfZarrSchema:
  def fixed_field_spec(
  name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
  ):
- return ZarrColumnSpec(
+ return ZarrColumnSpec.new(
  vcf_field=vcf_field,
  name=name,
  dtype=dtype,
@@ -1383,14 +1384,23 @@ class VcfZarrSchema:
  if field.category == "FORMAT":
  prefix = "call_"
  shape.append(n)
- chunks.append(samples_chunk_size),
+ chunks.append(samples_chunk_size)
  dimensions.append("samples")
  # TODO make an option to add in the empty extra dimension
  if field.summary.max_number > 1:
  shape.append(field.summary.max_number)
- dimensions.append(field.name)
+ # TODO we should really be checking this to see if the named dimensions
+ # are actually correct.
+ if field.vcf_number == "R":
+ dimensions.append("alleles")
+ elif field.vcf_number == "A":
+ dimensions.append("alt_alleles")
+ elif field.vcf_number == "G":
+ dimensions.append("genotypes")
+ else:
+ dimensions.append(f"{field.category}_{field.name}_dim")
  variable_name = prefix + field.name
- colspec = ZarrColumnSpec(
+ colspec = ZarrColumnSpec.new(
  vcf_field=field.full_name,
  name=variable_name,
  dtype=field.smallest_dtype(),
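Multi-valued fields now get a trailing dimension named from their VCF `Number` declaration instead of reusing the bare field name, so `Number=R`, `A` and `G` fields share the `alleles`, `alt_alleles` and `genotypes` dimensions and everything else falls back to a field-specific name. The mapping, written out as a standalone helper for illustration:

```python
def trailing_dimension(category: str, name: str, vcf_number: str) -> str:
    # Mirrors the Number -> dimension mapping added above.
    if vcf_number == "R":
        return "alleles"        # one value per allele, REF included
    if vcf_number == "A":
        return "alt_alleles"    # one value per ALT allele
    if vcf_number == "G":
        return "genotypes"      # one value per possible genotype
    return f"{category}_{name}_dim"


print(trailing_dimension("FORMAT", "AD", "R"))   # alleles
print(trailing_dimension("INFO", "AC", "A"))     # alt_alleles
print(trailing_dimension("FORMAT", "PL", "G"))   # genotypes
print(trailing_dimension("INFO", "DP4", "4"))    # INFO_DP4_dim
```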
@@ -1408,7 +1418,7 @@ class VcfZarrSchema:
  dimensions = ["variants", "samples"]
 
  colspecs.append(
- ZarrColumnSpec(
+ ZarrColumnSpec.new(
  vcf_field=None,
  name="call_genotype_phased",
  dtype="bool",
@@ -1421,7 +1431,7 @@ class VcfZarrSchema:
  shape += [ploidy]
  dimensions += ["ploidy"]
  colspecs.append(
- ZarrColumnSpec(
+ ZarrColumnSpec.new(
  vcf_field=None,
  name="call_genotype",
  dtype=gt_field.smallest_dtype(),
@@ -1432,7 +1442,7 @@ class VcfZarrSchema:
  )
  )
  colspecs.append(
- ZarrColumnSpec(
+ ZarrColumnSpec.new(
  vcf_field=None,
  name="call_genotype_mask",
  dtype="bool",
@@ -1514,7 +1524,9 @@ class VcfZarrWriter:
  self.schema = schema
  # Default to using nested directories following the Zarr v3 default.
  # This seems to require version 2.17+ to work properly
- self.dimension_separator = "/" if dimension_separator is None else dimension_separator
+ self.dimension_separator = (
+ "/" if dimension_separator is None else dimension_separator
+ )
  store = zarr.DirectoryStore(self.path)
  self.root = zarr.group(store=store)
 
@@ -1624,7 +1636,9 @@ class VcfZarrWriter:
  try:
  var_filter.buff[j, lookup[f]] = True
  except KeyError:
- raise ValueError(f"Filter '{f}' was not defined in the header.")
+ raise ValueError(
+ f"Filter '{f}' was not defined " f"in the header."
+ ) from None
  var_filter.flush()
  logger.debug(f"Encoded FILTERS slice {start}:{stop}")
 
@@ -1727,7 +1741,8 @@ class VcfZarrWriter:
  variant_chunk_size = array.blocks[0].nbytes
  encoding_memory_requirements[col.name] = variant_chunk_size
  logger.debug(
- f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+ f"{col.name} requires at least {display_size(variant_chunk_size)} "
+ f"per worker"
  )
  total_bytes += array.nbytes
 
@@ -1836,8 +1851,9 @@ class VcfZarrWriter:
  or len(future_to_work) > max_queued
  ):
  logger.debug(
- f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
- f"queued={len(future_to_work)} max_queued={max_queued}"
+ f"Wait: mem_required={used_memory + wp.memory} "
+ f"max_mem={max_memory} queued={len(future_to_work)} "
+ f"max_queued={max_queued}"
  )
  service_completed_futures()
  future = pwm.submit(wp.func, wp.start, wp.stop)
@@ -1881,7 +1897,7 @@ def encode(
  raise ValueError(
  "Cannot specify schema along with chunk sizes"
  ) # NEEDS TEST
- with open(schema_path, "r") as f:
+ with open(schema_path) as f:
  schema = VcfZarrSchema.fromjson(f.read())
  zarr_path = pathlib.Path(zarr_path)
  if zarr_path.exists():
@@ -1962,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
  elif vcf_type == "Float":
  assert_all_fill_float(zarr_val)
  else: # pragma: no cover
- assert False
+ assert False # noqa PT015
 
 
  def assert_all_missing(zarr_val, vcf_type):
@@ -1975,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
  elif vcf_type == "Float":
  assert_all_missing_float(zarr_val)
  else: # pragma: no cover
- assert False
+ assert False # noqa PT015
 
 
  def assert_info_val_missing(zarr_val, vcf_type):
@@ -2114,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
  assert vid[j] == ("." if row.ID is None else row.ID)
  assert allele[j, 0] == row.REF
  k = len(row.ALT)
- nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT),
+ nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
  assert np.all(allele[j, k + 1 :] == "")
  # TODO FILTERS
 
bio2zarr/vcf_utils.py CHANGED
@@ -1,14 +1,14 @@
- from typing import IO, Any, Dict, Optional, Sequence, Union
  import contextlib
- import struct
- import pathlib
  import gzip
- from dataclasses import dataclass
  import os
+ import pathlib
+ import struct
+ from dataclasses import dataclass
+ from typing import IO, Any, Dict, Optional, Sequence, Union
 
- import numpy as np
  import cyvcf2
  import humanfriendly
+ import numpy as np
 
  from bio2zarr.typing import PathType
 
@@ -38,7 +38,8 @@ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> A
  fmt : str
  A Python `struct` format string.
  nodata : Optional[Any], optional
- The value to return in case there is no further data in the stream, by default None
+ The value to return in case there is no further data in the stream,
+ by default None
 
  Returns
  -------
@@ -277,7 +278,8 @@ class TabixIndex:
  # Create file offsets for each element in the linear index
  file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index])
 
- # Calculate corresponding contigs and positions or each element in the linear index
+ # Calculate corresponding contigs and positions or each element in
+ # the linear index
  contig_indexes = np.hstack(
  [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)]
  )
@@ -433,6 +435,22 @@ class IndexedVcf(contextlib.AbstractContextManager):
  if var.POS >= start:
  yield var
 
+ def _filter_empty(self, regions):
+ """
+ Return all regions in the specified list that have one or more records.
+
+ Sometimes with Tabix indexes these seem to crop up:
+
+ - https://github.com/sgkit-dev/bio2zarr/issues/45
+ - https://github.com/sgkit-dev/bio2zarr/issues/120
+ """
+ ret = []
+ for region in regions:
+ variants = self.variants(region)
+ if next(variants, None) is not None:
+ ret.append(region)
+ return ret
+
  def partition_into_regions(
  self,
  num_parts: Optional[int] = None,
@@ -509,4 +527,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
  if self.index.record_counts[ri] > 0:
  regions.append(Region(self.sequence_names[ri]))
 
- return regions
+ return self._filter_empty(regions)
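`partition_into_regions` now passes its result through `_filter_empty`, dropping candidate regions that turn out to contain no records, a situation that can arise with Tabix indexes (the issues linked in the docstring). The filter only needs to pull a single variant from each candidate region; a hedged sketch of the idea against a generic region-to-iterator lookup:

```python
def filter_empty(regions, variants_for_region):
    # Keep only regions whose variant iterator yields at least one record.
    # variants_for_region stands in for IndexedVcf.variants(region).
    kept = []
    for region in regions:
        if next(variants_for_region(region), None) is not None:
            kept.append(region)
    return kept


# Hypothetical index data: "chr2" appears in the index but has no records.
fake_records = {"chr1": ["chr1:100", "chr1:250"], "chr2": []}
print(filter_empty(["chr1", "chr2"], lambda r: iter(fake_records[r])))  # ['chr1']
```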
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bio2zarr
- Version: 0.0.4
+ Version: 0.0.5
  Summary: Convert bioinformatics data to Zarr
  Home-page: https://github.com/pystatgen/bio2zarr
  Author: sgkit Developers
@@ -0,0 +1,16 @@
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+ bio2zarr/__main__.py,sha256=hO4vV-kPFgsYq0NQwG2r-WkserPL27oqae_tUvNB7yE,527
+ bio2zarr/_version.py,sha256=EJB7__SNK9kQS_SWZB_U4DHJ3P8ftF6etZEihTYnuXE,411
+ bio2zarr/cli.py,sha256=k63xex-tQkogAlJ3N68Ikx8LqZrksXbZB2s6Z7h-zXc,11446
+ bio2zarr/core.py,sha256=reF9elN1dwmCoXXLgci-y5pXmAm3fTntmomHTRcG54g,8127
+ bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+ bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+ bio2zarr/vcf.py,sha256=GFnwR2YP-cHU4tfHloRjyiBK9-xXDgXcAM_tz-w2qck,74324
+ bio2zarr/vcf_utils.py,sha256=r3NQXxWK1SYU7CcwDzSWXdX5Q8Ixk7gdCTEiFPzfUAk,17307
+ bio2zarr-0.0.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ bio2zarr-0.0.5.dist-info/METADATA,sha256=SasGYcKSRb7NqnYR98ODFvPEMdBNdpxWx5gqOt038QU,1077
+ bio2zarr-0.0.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ bio2zarr-0.0.5.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+ bio2zarr-0.0.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+ bio2zarr-0.0.5.dist-info/RECORD,,
@@ -1,16 +0,0 @@
- bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
- bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
- bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
- bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
- bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
- bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
- bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
- bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
- bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
- bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
- bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
- bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
- bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
- bio2zarr-0.0.4.dist-info/RECORD,,