bio2zarr 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

bio2zarr/__init__.py CHANGED
@@ -1 +1 @@
-from . provenance import __version__
+from .provenance import __version__  # noqa F401
bio2zarr/__main__.py CHANGED
@@ -2,11 +2,13 @@ import click
 
 from . import cli
 
+
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 
+
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.0.3'
-__version_tuple__ = version_tuple = (0, 0, 3)
+__version__ = version = '0.0.5'
+__version_tuple__ = version_tuple = (0, 0, 5)
bio2zarr/cli.py CHANGED
@@ -4,14 +4,11 @@ import pathlib
 import shutil
 
 import click
-import tabulate
 import coloredlogs
+import numcodecs
+import tabulate
 
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
-
+from . import plink, provenance, vcf, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -66,6 +63,17 @@ column_chunk_size = click.option(
     help="Approximate uncompressed size of exploded column chunks in MiB",
 )
 
+# We could provide the full flexiblity of numcodecs/Blosc here, but there
+# doesn't seem much point. Can always add more arguments here to control
+# compression level, etc.
+compressor = click.option(
+    "-C",
+    "--compressor",
+    type=click.Choice(["lz4", "zstd"]),
+    default=None,
+    help="Codec to use for compressing column chunks (Default=zstd).",
+)
+
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
 variants_chunk_size = click.option(
@@ -113,24 +121,36 @@ def check_overwrite_dir(path, force):
         shutil.rmtree(tmp_delete_path)
 
 
+def get_compressor(cname):
+    if cname is None:
+        return None
+    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config["cname"] = cname
+    return numcodecs.get_codec(config)
+
+
 @click.command
 @vcfs
 @new_icf_path
 @force
 @verbose
-@worker_processes
 @column_chunk_size
-def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
+@compressor
+@worker_processes
+def explode(
+    vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+):
     """
     Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
     vcf.explode(
-        vcfs,
         icf_path,
+        vcfs,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
 
@@ -141,10 +161,18 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
 @click.argument("num_partitions", type=click.IntRange(min=1))
 @force
 @column_chunk_size
+@compressor
 @verbose
 @worker_processes
 def dexplode_init(
-    vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
+    vcfs,
+    icf_path,
+    num_partitions,
+    force,
+    column_chunk_size,
+    compressor,
+    verbose,
+    worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +186,7 @@ def dexplode_init(
         target_num_partitions=num_partitions,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
     click.echo(num_partitions)
@@ -174,7 +203,7 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=True)
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
 @click.command
@@ -232,9 +261,8 @@ def mkschema(icf_path):
 @click.option(
     "-M",
     "--max-memory",
-    type=int,
     default=None,
-    help="An approximate bound on overall memory usage in megabytes",
+    help="An approximate bound on overall memory usage (e.g. 10G),",
 )
 @worker_processes
 def encode(
@@ -250,7 +278,7 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate columnar format (see explode) to vcfzarr.
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
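
The new -C/--compressor option only selects a Blosc codec name; the remaining settings are taken from the ICF default configuration. A minimal sketch of the equivalent numcodecs calls, not part of the package itself (the "lz4" value is just an illustrative choice):

import numcodecs

# Mirror of get_compressor: start from the default Blosc config and swap the codec name.
default = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE)
config = default.get_config()
config["cname"] = "lz4"  # e.g. the value passed via -C/--compressor
codec = numcodecs.get_codec(config)
print(codec)  # Blosc(cname='lz4', clevel=7, shuffle=NOSHUFFLE, blocksize=0)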
bio2zarr/core.py CHANGED
@@ -1,16 +1,15 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
 
-import zarr
+import numcodecs
 import numpy as np
 import tqdm
-import numcodecs
-
+import zarr
 
 logger = logging.getLogger(__name__)
 
@@ -50,7 +49,8 @@ def wait_on_futures(futures):
         cancel_futures(futures)
         if isinstance(exception, cf.process.BrokenProcessPool):
             raise RuntimeError(
-                "Worker process died: you may have run out of memory") from exception
+                "Worker process died: you may have run out of memory"
+            ) from exception
     else:
         raise exception
 
bio2zarr/plink.py CHANGED
@@ -1,14 +1,13 @@
 import logging
 
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 
 from . import core
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(f"Encoded samples")
+    logger.debug("Encoded samples")
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(f"encoded variant_position")
+    logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(f"encoded variant_allele")
+    logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
             elif bed_call == 2:
                 assert list(zarr_call) == [1, 1]
             else:  # pragma no cover
-                assert False
+                raise AssertionError(f"Unexpected bed call {bed_call}")
bio2zarr/typing.py CHANGED
@@ -1,4 +1,4 @@
 from pathlib import Path
 from typing import Union
 
-PathType = Union[str, Path]
+PathType = Union[str, Path]
bio2zarr/vcf.py CHANGED
@@ -1,29 +1,27 @@
 import collections
+import contextlib
 import dataclasses
 import functools
+import json
 import logging
+import math
 import os
 import pathlib
 import pickle
-import sys
 import shutil
-import json
-import math
+import sys
 import tempfile
-import contextlib
 from typing import Any, List
 
-import humanfriendly
 import cyvcf2
+import humanfriendly
 import numcodecs
 import numpy as np
 import numpy.testing as nt
 import tqdm
 import zarr
 
-from . import core
-from . import provenance
-from . import vcf_utils
+from . import core, provenance, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -151,8 +149,8 @@ class VcfPartition:
 
 ICF_METADATA_FORMAT_VERSION = "0.2"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
-).get_config()
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+)
 
 
 @dataclasses.dataclass
@@ -284,11 +282,25 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header
 
 
-def scan_vcfs(
-    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
-):
+def check_overlap(partitions):
+    for i in range(1, len(partitions)):
+        prev_partition = partitions[i - 1]
+        current_partition = partitions[i]
+        if (
+            prev_partition.region.contig == current_partition.region.contig
+            and prev_partition.region.end > current_partition.region.start
+        ):
+            raise ValueError(
+                f"Multiple VCFs have the region "
+                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
+                f"{current_partition.region.end}"
+            )
+
+
+def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
-        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
+        f" partitions."
     )
     # An easy mistake to make is to pass the same file twice. Check this early on.
     for path, count in collections.Counter(paths).items():
@@ -333,13 +345,8 @@ def scan_vcfs(
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
+    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
-    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
-    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
-    icf_metadata.column_chunk_size = column_chunk_size
-    # Bare minimum here for provenance - would be nice to include versions of key
-    # dependencies as well.
-    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
 
@@ -799,6 +806,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         for vcf_field in icf_metadata.fields:
             field_path = get_vcf_field_path(out_path, vcf_field)
             field_partition_path = field_path / f"p{partition_index}"
+            # Should be robust to running explode_partition twice.
+            field_partition_path.mkdir(exist_ok=True)
             transformer = VcfValueTransformer.factory(vcf_field, num_samples)
             self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                 vcf_field,
@@ -824,13 +833,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-# TODO rename to IntermediateColumnarFormat and move to icf.py
-
-
 class IntermediateColumnarFormat(collections.abc.Mapping):
-    # TODO Check if other compressors would give reasonable compression
-    # with significantly faster times
-
     def __init__(self, path):
         self.path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
@@ -846,7 +849,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
            partition.num_records for partition in self.metadata.partitions
         ]
         # Allow us to find which partition a given record is in
-        self.partition_record_index = np.cumsum([0] + partition_num_records)
+        self.partition_record_index = np.cumsum([0, *partition_num_records])
         for field in self.metadata.fields:
             self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
         logger.info(
@@ -856,7 +859,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"partitions={self.num_partitions}, "
            f"records={self.num_records}, path={self.path})"
         )
 
@@ -922,9 +926,12 @@ class IntermediateColumnarFormatWriter:
         worker_processes=1,
         target_num_partitions=None,
         show_progress=False,
+        compressor=None,
     ):
         if self.path.exists():
-            shutil.rmtree(self.path)
+            raise ValueError("ICF path already exists")
+        if compressor is None:
+            compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
         target_num_partitions = max(target_num_partitions, len(vcfs))
 
@@ -934,39 +941,38 @@
             worker_processes=worker_processes,
             show_progress=show_progress,
             target_num_partitions=target_num_partitions,
-            column_chunk_size=column_chunk_size,
         )
         self.metadata = icf_metadata
+        self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
+        self.metadata.compressor = compressor.get_config()
+        self.metadata.column_chunk_size = column_chunk_size
+        # Bare minimum here for provenance - would be nice to include versions of key
+        # dependencies as well.
+        self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
         self.mkdirs()
 
         # Note: this is needed for the current version of the vcfzarr spec, but it's
-        # probably goint to be dropped.
+        # probably going to be dropped.
         # https://github.com/pystatgen/vcf-zarr-spec/issues/15
         # May be useful to keep lying around still though?
-        logger.info(f"Writing VCF header")
+        logger.info("Writing VCF header")
         with open(self.path / "header.txt", "w") as f:
             f.write(header)
 
-        logger.info(f"Writing WIP metadata")
+        logger.info("Writing WIP metadata")
        with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
 
     def mkdirs(self):
-        # TODO add worker_processes here and do this with the ParallelWorkManager
-        logger.info(
-            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-        )
+        num_dirs = len(self.metadata.fields)
+        logger.info(f"Creating {num_dirs} field directories")
         self.path.mkdir()
         self.wip_path.mkdir()
         for field in self.metadata.fields:
             col_path = get_vcf_field_path(self.path, field)
-            logger.debug(f"Make directories for {field.full_name} at {col_path}")
             col_path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col_path / f"p{j}"
-                part_path.mkdir()
 
     def load_partition_summaries(self):
         summaries = []
@@ -982,13 +988,14 @@ class IntermediateColumnarFormatWriter:
                 not_found.append(j)
         if len(not_found) > 0:
             raise FileNotFoundError(
-                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+                f"Partition metadata not found for {len(not_found)}"
+                f" partitions: {not_found}"
             )
         return summaries
 
     def load_metadata(self):
         if self.metadata is None:
-            with open(self.wip_path / f"metadata.json") as f:
+            with open(self.wip_path / "metadata.json") as f:
                 self.metadata = IcfMetadata.fromdict(json.load(f))
 
     def process_partition(self, partition_index):
@@ -1037,12 +1044,14 @@
                 for field in format_fields:
                     val = variant.format(field.name)
                     tcw.append(field.full_name, val)
-                # Note: an issue with updating the progress per variant here like this
-                # is that we get a significant pause at the end of the counter while
-                # all the "small" fields get flushed. Possibly not much to be done about it.
+                # Note: an issue with updating the progress per variant here like
+                # this is that we get a significant pause at the end of the counter
+                # while all the "small" fields get flushed. Possibly not much to be
+                # done about it.
                 core.update_progress(1)
             logger.info(
-                f"Finished reading VCF for partition {partition_index}, flushing buffers"
+                f"Finished reading VCF for partition {partition_index}, "
+                f"flushing buffers"
             )
 
         partition_metadata = {
@@ -1124,30 +1133,32 @@ class IntermediateColumnarFormatWriter:
         for summary in partition_summaries:
             field.summary.update(summary["field_summaries"][field.full_name])
 
-        logger.info(f"Finalising metadata")
+        logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
 
-        logger.debug(f"Removing WIP directory")
+        logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
 
 
 def explode(
-    vcfs,
     icf_path,
+    vcfs,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
-    num_partitions = writer.init(
+    writer.init(
         vcfs,
         # Heuristic to get reasonable worker utilisation with lumpy partition sizing
         target_num_partitions=max(1, worker_processes * 4),
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
     writer.explode(worker_processes=worker_processes, show_progress=show_progress)
     writer.finalise()
@@ -1162,6 +1173,7 @@ def explode_init(
     target_num_partitions=1,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
     return writer.init(
@@ -1170,6 +1182,7 @@ def explode_init(
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
 
 
@@ -1209,20 +1222,25 @@ class ZarrColumnSpec:
     dtype: str
     shape: tuple
     chunks: tuple
-    dimensions: list
+    dimensions: tuple
     description: str
     vcf_field: str
-    compressor: dict = None
-    filters: list = None
-    # TODO add filters
+    compressor: dict
+    filters: list
 
     def __post_init__(self):
+        # Ensure these are tuples for ease of comparison and consistency
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
-        self.filters = []
-        self._choose_compressor_settings()
+
+    @staticmethod
+    def new(**kwargs):
+        spec = ZarrColumnSpec(
+            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
+        )
+        spec._choose_compressor_settings()
+        return spec
 
     def _choose_compressor_settings(self):
         """
@@ -1298,7 +1316,7 @@ class VcfZarrSchema:
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
-            return ZarrColumnSpec(
+            return ZarrColumnSpec.new(
                 vcf_field=vcf_field,
                 name=name,
                 dtype=dtype,
@@ -1366,14 +1384,23 @@ class VcfZarrSchema:
            if field.category == "FORMAT":
                prefix = "call_"
                shape.append(n)
-                chunks.append(samples_chunk_size),
+                chunks.append(samples_chunk_size)
                dimensions.append("samples")
            # TODO make an option to add in the empty extra dimension
            if field.summary.max_number > 1:
                shape.append(field.summary.max_number)
-                dimensions.append(field.name)
+                # TODO we should really be checking this to see if the named dimensions
+                # are actually correct.
+                if field.vcf_number == "R":
+                    dimensions.append("alleles")
+                elif field.vcf_number == "A":
+                    dimensions.append("alt_alleles")
+                elif field.vcf_number == "G":
+                    dimensions.append("genotypes")
+                else:
+                    dimensions.append(f"{field.category}_{field.name}_dim")
            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec(
+            colspec = ZarrColumnSpec.new(
                vcf_field=field.full_name,
                name=variable_name,
                dtype=field.smallest_dtype(),
@@ -1391,7 +1418,7 @@ class VcfZarrSchema:
            dimensions = ["variants", "samples"]
 
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype_phased",
                    dtype="bool",
@@ -1404,7 +1431,7 @@ class VcfZarrSchema:
            shape += [ploidy]
            dimensions += ["ploidy"]
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype",
                    dtype=gt_field.smallest_dtype(),
@@ -1415,7 +1442,7 @@ class VcfZarrSchema:
                )
            )
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype_mask",
                    dtype="bool",
@@ -1480,16 +1507,30 @@ class EncodingWork:
     memory: int = 0
 
 
+def parse_max_memory(max_memory):
+    if max_memory is None:
+        # Effectively unbounded
+        return 2**63
+    if isinstance(max_memory, str):
+        max_memory = humanfriendly.parse_size(max_memory)
+    logger.info(f"Set memory budget to {display_size(max_memory)}")
+    return max_memory
+
+
 class VcfZarrWriter:
-    def __init__(self, path, icf, schema):
+    def __init__(self, path, icf, schema, dimension_separator=None):
         self.path = pathlib.Path(path)
         self.icf = icf
         self.schema = schema
+        # Default to using nested directories following the Zarr v3 default.
+        # This seems to require version 2.17+ to work properly
+        self.dimension_separator = (
+            "/" if dimension_separator is None else dimension_separator
+        )
         store = zarr.DirectoryStore(self.path)
         self.root = zarr.group(store=store)
 
     def init_array(self, variable):
-        # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
@@ -1501,7 +1542,9 @@ class VcfZarrWriter:
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
+            dimension_separator=self.dimension_separator,
         )
+        # Dimension names are part of the spec in Zarr v3
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
     def get_array(self, name):
@@ -1593,7 +1636,9 @@ class VcfZarrWriter:
                 try:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
-                    raise ValueError(f"Filter '{f}' was not defined in the header.")
+                    raise ValueError(
+                        f"Filter '{f}' was not defined " f"in the header."
+                    ) from None
            var_filter.flush()
        logger.debug(f"Encoded FILTERS slice {start}:{stop}")
 
@@ -1639,6 +1684,7 @@ class VcfZarrWriter:
            "contig_length",
            self.schema.contig_length,
            dtype=np.int64,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
        return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1661,8 +1707,6 @@ class VcfZarrWriter:
            self.init_array(column)
 
    def finalise(self):
-        # for column in self.schema.columns.values():
-        #     self.finalise_array(column)
        zarr.consolidate_metadata(self.path)
 
    def encode(
@@ -1672,12 +1716,7 @@ class VcfZarrWriter:
        show_progress=False,
        max_memory=None,
    ):
-        if max_memory is None:
-            # Unbounded
-            max_memory = 2**63
-        else:
-            # Value is specified in Mibibytes
-            max_memory *= 2**20  # NEEDS TEST
+        max_memory = parse_max_memory(max_memory)
 
        # TODO this will move into the setup logic later when we're making it possible
        # to split the work by slice
@@ -1702,7 +1741,8 @@ class VcfZarrWriter:
            variant_chunk_size = array.blocks[0].nbytes
            encoding_memory_requirements[col.name] = variant_chunk_size
            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+                f"{col.name} requires at least {display_size(variant_chunk_size)} "
+                f"per worker"
            )
            total_bytes += array.nbytes
 
@@ -1764,8 +1804,8 @@
 
        # Fail early if we can't fit a particular column into memory
        for wp in work:
-            if wp.memory >= max_memory:
-                raise ValueError(  # NEEDS TEST
+            if wp.memory > max_memory:
+                raise ValueError(
                    f"Insufficient memory for {wp.columns}: "
                    f"{display_size(wp.memory)} > {display_size(max_memory)}"
                )
@@ -1778,6 +1818,8 @@
        )
 
        used_memory = 0
+        # We need to keep some bounds on the queue size or the memory bounds algorithm
+        # below doesn't really work.
        max_queued = 4 * max(1, worker_processes)
        encoded_slices = collections.Counter()
 
@@ -1804,10 +1846,15 @@
                    self.finalise_array(column)
 
        for wp in work:
-            if (
+            while (
                used_memory + wp.memory > max_memory
                or len(future_to_work) > max_queued
            ):
+                logger.debug(
+                    f"Wait: mem_required={used_memory + wp.memory} "
+                    f"max_mem={max_memory} queued={len(future_to_work)} "
+                    f"max_queued={max_queued}"
+                )
                service_completed_futures()
            future = pwm.submit(wp.func, wp.start, wp.stop)
            used_memory += wp.memory
@@ -1832,6 +1879,7 @@ def encode(
    variants_chunk_size=None,
    samples_chunk_size=None,
    max_v_chunks=None,
+    dimension_separator=None,
    max_memory=None,
    worker_processes=1,
    show_progress=False,
@@ -1849,13 +1897,13 @@ def encode(
            raise ValueError(
                "Cannot specify schema along with chunk sizes"
            )  # NEEDS TEST
-        with open(schema_path, "r") as f:
+        with open(schema_path) as f:
            schema = VcfZarrSchema.fromjson(f.read())
    zarr_path = pathlib.Path(zarr_path)
    if zarr_path.exists():
        logger.warning(f"Deleting existing {zarr_path}")
        shutil.rmtree(zarr_path)
-    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
    vzw.init()
    vzw.encode(
        max_v_chunks=max_v_chunks,
@@ -1876,10 +1924,11 @@ def convert(
    show_progress=False,
    # TODO add arguments to control location of tmpdir
 ):
-    with tempfile.TemporaryDirectory(prefix="vcf2zarr_if_") as if_dir:
+    with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
+        if_dir = pathlib.Path(tmp) / "if"
        explode(
-            vcfs,
            if_dir,
+            vcfs,
            worker_processes=worker_processes,
            show_progress=show_progress,
        )
@@ -1929,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_all_missing(zarr_val, vcf_type):
@@ -1942,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_info_val_missing(zarr_val, vcf_type):
@@ -2081,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
-        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT),
+        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS
 
bio2zarr/vcf_utils.py CHANGED
@@ -1,14 +1,14 @@
-from typing import IO, Any, Dict, Optional, Sequence, Union
 import contextlib
-import struct
-import pathlib
 import gzip
-from dataclasses import dataclass
 import os
+import pathlib
+import struct
+from dataclasses import dataclass
+from typing import IO, Any, Dict, Optional, Sequence, Union
 
-import numpy as np
 import cyvcf2
 import humanfriendly
+import numpy as np
 
 from bio2zarr.typing import PathType
 
@@ -38,7 +38,8 @@ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> A
     fmt : str
         A Python `struct` format string.
     nodata : Optional[Any], optional
-        The value to return in case there is no further data in the stream, by default None
+        The value to return in case there is no further data in the stream,
+        by default None
 
     Returns
     -------
@@ -277,7 +278,8 @@ class TabixIndex:
         # Create file offsets for each element in the linear index
         file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index])
 
-        # Calculate corresponding contigs and positions or each element in the linear index
+        # Calculate corresponding contigs and positions or each element in
+        # the linear index
         contig_indexes = np.hstack(
             [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)]
         )
@@ -433,6 +435,22 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 if var.POS >= start:
                     yield var
 
+    def _filter_empty(self, regions):
+        """
+        Return all regions in the specified list that have one or more records.
+
+        Sometimes with Tabix indexes these seem to crop up:
+
+        - https://github.com/sgkit-dev/bio2zarr/issues/45
+        - https://github.com/sgkit-dev/bio2zarr/issues/120
+        """
+        ret = []
+        for region in regions:
+            variants = self.variants(region)
+            if next(variants, None) is not None:
+                ret.append(region)
+        return ret
+
     def partition_into_regions(
         self,
         num_parts: Optional[int] = None,
@@ -509,4 +527,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
             if self.index.record_counts[ri] > 0:
                 regions.append(Region(self.sequence_names[ri]))
 
-        return regions
+        return self._filter_empty(regions)
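
partition_into_regions now drops regions that contain no records, which can apparently crop up with some Tabix indexes (see the linked issues). The test is simply whether the region's variant iterator yields anything. A small standalone sketch of that idiom, with a hypothetical iterator standing in for self.variants(region):

def has_records(variants_iter):
    # next(..., None) pulls at most one item and returns None if the iterator is empty.
    return next(variants_iter, None) is not None

print(has_records(iter([])))          # False - an empty region would be dropped
print(has_records(iter(["record"])))  # True - the region is kept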
bio2zarr-0.0.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.3
+Version: 0.0.5
 Summary: Convert bioinformatics data to Zarr
 Home-page: https://github.com/pystatgen/bio2zarr
 Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: zarr !=2.11.0,!=2.11.1,!=2.11.2,>=2.10.0
+Requires-Dist: zarr >=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
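
The zarr requirement is bumped to >=2.17 because VcfZarrWriter now passes dimension_separator="/" so that chunk files are written as nested directories; the diff's own comment notes this seems to need 2.17+ to work properly. A minimal sketch of creating such an array outside the package (the "example.zarr" path and array shape are purely illustrative):

import numcodecs
import zarr

# With dimension_separator="/", chunk keys are nested (e.g. 0/0/0 rather than 0.0.0).
store = zarr.DirectoryStore("example.zarr")
root = zarr.group(store=store)
a = root.empty(
    "call_genotype",
    shape=(100, 10, 2),
    chunks=(50, 10, 2),
    dtype="i1",
    compressor=numcodecs.Blosc(cname="zstd"),
    dimension_separator="/",
)
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "samples", "ploidy"]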
bio2zarr-0.0.5.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=hO4vV-kPFgsYq0NQwG2r-WkserPL27oqae_tUvNB7yE,527
+bio2zarr/_version.py,sha256=EJB7__SNK9kQS_SWZB_U4DHJ3P8ftF6etZEihTYnuXE,411
+bio2zarr/cli.py,sha256=k63xex-tQkogAlJ3N68Ikx8LqZrksXbZB2s6Z7h-zXc,11446
+bio2zarr/core.py,sha256=reF9elN1dwmCoXXLgci-y5pXmAm3fTntmomHTRcG54g,8127
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf.py,sha256=GFnwR2YP-cHU4tfHloRjyiBK9-xXDgXcAM_tz-w2qck,74324
+bio2zarr/vcf_utils.py,sha256=r3NQXxWK1SYU7CcwDzSWXdX5Q8Ixk7gdCTEiFPzfUAk,17307
+bio2zarr-0.0.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.0.5.dist-info/METADATA,sha256=SasGYcKSRb7NqnYR98ODFvPEMdBNdpxWx5gqOt038QU,1077
+bio2zarr-0.0.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.0.5.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+bio2zarr-0.0.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.0.5.dist-info/RECORD,,
bio2zarr-0.0.3.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
-bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
-bio2zarr/_version.py,sha256=hB095avW4HuDZxn8qPHRG1UMzSSonb8ZDAsLxt9hmk8,411
-bio2zarr/cli.py,sha256=N_vEFj730p_TL7Dk9m9T3ceAhVV58BMYRDmBmoeKH7A,10766
-bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
-bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
-bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
-bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
-bio2zarr-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.3.dist-info/METADATA,sha256=dc2y5xrnkcvD1qmKGFL5GrsbM1_tiIlAYB2GrAlLunM,1106
-bio2zarr-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.3.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
-bio2zarr-0.0.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.3.dist-info/RECORD,,