bio2zarr 0.0.8__tar.gz → 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bio2zarr might be problematic.

Files changed (40)
  1. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/CHANGELOG.md +8 -1
  2. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/PKG-INFO +2 -2
  3. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/_version.py +2 -2
  4. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/cli.py +1 -1
  5. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf.py +192 -163
  6. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf_utils.py +21 -13
  7. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/PKG-INFO +2 -2
  8. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/requires.txt +1 -1
  9. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/pyproject.toml +1 -1
  10. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation-data/Makefile +6 -1
  11. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.github/workflows/ci.yml +0 -0
  12. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.github/workflows/docs.yml +0 -0
  13. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.gitignore +0 -0
  14. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.pre-commit-config.yaml +0 -0
  15. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/LICENSE +0 -0
  16. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/MANIFEST.in +0 -0
  17. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/README.md +0 -0
  18. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/__init__.py +0 -0
  19. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/__main__.py +0 -0
  20. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/core.py +0 -0
  21. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/plink.py +0 -0
  22. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/provenance.py +0 -0
  23. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/typing.py +0 -0
  24. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/SOURCES.txt +0 -0
  25. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/dependency_links.txt +0 -0
  26. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/entry_points.txt +0 -0
  27. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/top_level.txt +0 -0
  28. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/Makefile +0 -0
  29. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/_config.yml +0 -0
  30. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/_toc.yml +0 -0
  31. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/build.sh +0 -0
  32. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/cli.md +0 -0
  33. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/intro.md +0 -0
  34. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/logo.png +0 -0
  35. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/references.bib +0 -0
  36. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/requirements.txt +0 -0
  37. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/setup.cfg +0 -0
  38. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation-data/split.sh +0 -0
  39. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation.py +0 -0
  40. {bio2zarr-0.0.8 → bio2zarr-0.0.9}/vcf_generator.py +0 -0
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/CHANGELOG.md

@@ -1,4 +1,11 @@
- # 0.0.7 2024-04-30
+ # 0.0.9 2024-05-02
+
+ - Change on-disk format for explode and schema
+ - Support older tabix indexes
+ - Fix some bugs in explode
+
+ # 0.0.8 2024-04-30
+
  - Change on-disk format of distributed encode and simplify
  - Check for all partitions nominally completed encoding before doing
  anything destructive in dencode-finalise
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bio2zarr
- Version: 0.0.8
+ Version: 0.0.9
  Summary: Convert bioinformatics data to Zarr
  Author-email: sgkit Developers <project@sgkit.dev>
  License: Apache License
@@ -234,7 +234,7 @@ Requires-Dist: pysam; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
  Requires-Dist: pytest-coverage; extra == "dev"
  Requires-Dist: pytest-xdist; extra == "dev"
- Requires-Dist: sgkit; extra == "dev"
+ Requires-Dist: sgkit>=0.8.0; extra == "dev"
  Requires-Dist: tqdm; extra == "dev"

  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/_version.py

@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.0.8'
- __version_tuple__ = version_tuple = (0, 0, 8)
+ __version__ = version = '0.0.9'
+ __version_tuple__ = version_tuple = (0, 0, 9)
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/cli.py

@@ -233,7 +233,7 @@ def dexplode_partition(icf_path, partition, verbose):
  from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
  """
  setup_logging(verbose)
- vcf.explode_partition(icf_path, partition, show_progress=False)
+ vcf.explode_partition(icf_path, partition)


  @click.command
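As the cli.py hunk above shows, `explode_partition` is now called with just the ICF path and the partition index, with no progress flag. A minimal, hedged sketch of driving that step from Python, based only on the module-level signatures visible in the vcf.py hunks below (`explode_partition(icf_path, partition)` and `explode_finalise(icf_path)`); the path is hypothetical and assumes the init step has already created the intermediate columnar format directory:

```python
# Sketch based on signatures visible in this diff, not verified documentation.
from bio2zarr import vcf

icf_path = "sample.icf"  # hypothetical ICF directory created by the init step

# Process one partition; indexes run from 0 to num_partitions - 1.
vcf.explode_partition(icf_path, 0)

# After every partition has been processed, merge the per-partition metadata.
vcf.explode_finalise(icf_path)
```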
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf.py

@@ -1,7 +1,6 @@
  import collections
  import contextlib
  import dataclasses
- import functools
  import json
  import logging
  import math
@@ -145,29 +144,41 @@ class VcfPartition:
  num_records: int = -1


- ICF_METADATA_FORMAT_VERSION = "0.2"
+ ICF_METADATA_FORMAT_VERSION = "0.3"
  ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
  cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
  )

- # TODO refactor this to have embedded Contig dataclass, Filters
- # and Samples dataclasses to allow for more information to be
- # retained and forward compatibility.
+
+ @dataclasses.dataclass
+ class Contig:
+ id: str
+ length: int = None
+
+
+ @dataclasses.dataclass
+ class Sample:
+ id: str
+
+
+ @dataclasses.dataclass
+ class Filter:
+ id: str
+ description: str = ""


  @dataclasses.dataclass
  class IcfMetadata:
  samples: list
- contig_names: list
- contig_record_counts: dict
+ contigs: list
  filters: list
  fields: list
  partitions: list = None
- contig_lengths: list = None
  format_version: str = None
  compressor: dict = None
  column_chunk_size: int = None
  provenance: dict = None
+ num_records: int = -1

  @property
  def info_fields(self):
@@ -187,16 +198,12 @@ class IcfMetadata:

  @property
  def num_contigs(self):
- return len(self.contig_names)
+ return len(self.contigs)

  @property
  def num_filters(self):
  return len(self.filters)

- @property
- def num_records(self):
- return sum(self.contig_record_counts.values())
-
  @staticmethod
  def fromdict(d):
  if d["format_version"] != ICF_METADATA_FORMAT_VERSION:
@@ -204,18 +211,23 @@ class IcfMetadata:
  "Intermediate columnar metadata format version mismatch: "
  f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
  )
- fields = [VcfField.fromdict(fd) for fd in d["fields"]]
  partitions = [VcfPartition(**pd) for pd in d["partitions"]]
  for p in partitions:
  p.region = vcf_utils.Region(**p.region)
  d = d.copy()
- d["fields"] = fields
  d["partitions"] = partitions
+ d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
+ d["samples"] = [Sample(**sd) for sd in d["samples"]]
+ d["filters"] = [Filter(**fd) for fd in d["filters"]]
+ d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
  return IcfMetadata(**d)

  def asdict(self):
  return dataclasses.asdict(self)

+ def asjson(self):
+ return json.dumps(self.asdict(), indent=4)
+

  def fixed_vcf_field_definitions():
  def make_field_def(name, vcf_type, vcf_number):
@@ -243,15 +255,22 @@ def fixed_vcf_field_definitions():
  def scan_vcf(path, target_num_partitions):
  with vcf_utils.IndexedVcf(path) as indexed_vcf:
  vcf = indexed_vcf.vcf
- filters = [
- h["ID"]
- for h in vcf.header_iter()
- if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str)
- ]
+ filters = []
+ pass_index = -1
+ for h in vcf.header_iter():
+ if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str):
+ try:
+ description = h["Description"].strip('"')
+ except KeyError:
+ description = ""
+ if h["ID"] == "PASS":
+ pass_index = len(filters)
+ filters.append(Filter(h["ID"], description))
+
  # Ensure PASS is the first filter if present
- if "PASS" in filters:
- filters.remove("PASS")
- filters.insert(0, "PASS")
+ if pass_index > 0:
+ pass_filter = filters.pop(pass_index)
+ filters.insert(0, pass_filter)

  fields = fixed_vcf_field_definitions()
  for h in vcf.header_iter():
@@ -262,18 +281,22 @@ def scan_vcf(path, target_num_partitions):
  field.vcf_number = "."
  fields.append(field)

+ try:
+ contig_lengths = vcf.seqlens
+ except AttributeError:
+ contig_lengths = [None for _ in vcf.seqnames]
+
  metadata = IcfMetadata(
- samples=vcf.samples,
- contig_names=vcf.seqnames,
- contig_record_counts=indexed_vcf.contig_record_counts(),
+ samples=[Sample(sample_id) for sample_id in vcf.samples],
+ contigs=[
+ Contig(contig_id, length)
+ for contig_id, length in zip(vcf.seqnames, contig_lengths)
+ ],
  filters=filters,
  fields=fields,
  partitions=[],
+ num_records=sum(indexed_vcf.contig_record_counts().values()),
  )
- try:
- metadata.contig_lengths = vcf.seqlens
- except AttributeError:
- pass

  regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
  logger.info(
@@ -292,22 +315,6 @@ def scan_vcf(path, target_num_partitions):
  return metadata, vcf.raw_header


- def check_overlap(partitions):
- for i in range(1, len(partitions)):
- prev_region = partitions[i - 1].region
- current_region = partitions[i].region
- if prev_region.contig == current_region.contig:
- if prev_region.end is None:
- logger.warning("Cannot check overlaps; issue #146")
- continue
- if prev_region.end > current_region.start:
- raise ValueError(
- f"Multiple VCFs have the region "
- f"{prev_region.contig}:{prev_region.start}-"
- f"{current_region.end}"
- )
-
-
  def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
  logger.info(
  f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
@@ -336,27 +343,30 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
  # We just take the first header, assuming the others
  # are compatible.
  all_partitions = []
- contig_record_counts = collections.Counter()
+ total_records = 0
  for metadata, _ in results:
- all_partitions.extend(metadata.partitions)
- metadata.partitions.clear()
- contig_record_counts += metadata.contig_record_counts
- metadata.contig_record_counts.clear()
+ for partition in metadata.partitions:
+ logger.debug(f"Scanned partition {partition}")
+ all_partitions.append(partition)
+ total_records += metadata.num_records
+ metadata.num_records = 0
+ metadata.partitions = []

  icf_metadata, header = results[0]
  for metadata, _ in results[1:]:
  if metadata != icf_metadata:
  raise ValueError("Incompatible VCF chunks")

- icf_metadata.contig_record_counts = dict(contig_record_counts)
+ # Note: this will be infinity here if any of the chunks has an index
+ # that doesn't keep track of the number of records per-contig
+ icf_metadata.num_records = total_records

  # Sort by contig (in the order they appear in the header) first,
  # then by start coordinate
- contig_index_map = {contig: j for j, contig in enumerate(metadata.contig_names)}
+ contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
  all_partitions.sort(
  key=lambda x: (contig_index_map[x.region.contig], x.region.start)
  )
- check_overlap(all_partitions)
  icf_metadata.partitions = all_partitions
  logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
  return icf_metadata, header
@@ -853,19 +863,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  self.metadata = IcfMetadata.fromdict(json.load(f))
  with open(self.path / "header.txt") as f:
  self.vcf_header = f.read()
-
  self.compressor = numcodecs.get_codec(self.metadata.compressor)
- self.columns = {}
+ self.fields = {}
  partition_num_records = [
  partition.num_records for partition in self.metadata.partitions
  ]
  # Allow us to find which partition a given record is in
  self.partition_record_index = np.cumsum([0, *partition_num_records])
  for field in self.metadata.fields:
- self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
+ self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
  logger.info(
  f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
- f"records={self.num_records}, columns={self.num_columns})"
+ f"records={self.num_records}, fields={self.num_fields})"
  )

  def __repr__(self):
@@ -876,17 +885,17 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  )

  def __getitem__(self, key):
- return self.columns[key]
+ return self.fields[key]

  def __iter__(self):
- return iter(self.columns)
+ return iter(self.fields)

  def __len__(self):
- return len(self.columns)
+ return len(self.fields)

  def summary_table(self):
  data = []
- for name, col in self.columns.items():
+ for name, col in self.fields.items():
  summary = col.vcf_field.summary
  d = {
  "name": name,
@@ -902,9 +911,9 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  data.append(d)
  return data

- @functools.cached_property
+ @property
  def num_records(self):
- return sum(self.metadata.contig_record_counts.values())
+ return self.metadata.num_records

  @property
  def num_partitions(self):
@@ -915,8 +924,42 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
  return len(self.metadata.samples)

  @property
- def num_columns(self):
- return len(self.columns)
+ def num_fields(self):
+ return len(self.fields)
+
+
+ @dataclasses.dataclass
+ class IcfPartitionMetadata:
+ num_records: int
+ last_position: int
+ field_summaries: dict
+
+ def asdict(self):
+ return dataclasses.asdict(self)
+
+ def asjson(self):
+ return json.dumps(self.asdict(), indent=4)
+
+ @staticmethod
+ def fromdict(d):
+ md = IcfPartitionMetadata(**d)
+ for k, v in md.field_summaries.items():
+ md.field_summaries[k] = VcfFieldSummary.fromdict(v)
+ return md
+
+
+ def check_overlapping_partitions(partitions):
+ for i in range(1, len(partitions)):
+ prev_region = partitions[i - 1].region
+ current_region = partitions[i].region
+ if prev_region.contig == current_region.contig:
+ assert prev_region.end is not None
+ # Regions are *inclusive*
+ if prev_region.end >= current_region.start:
+ raise ValueError(
+ f"Overlapping VCF regions in partitions {i - 1} and {i}: "
+ f"{prev_region} and {current_region}"
+ )


  class IntermediateColumnarFormatWriter:
@@ -990,11 +1033,8 @@ class IntermediateColumnarFormatWriter:
  not_found = []
  for j in range(self.num_partitions):
  try:
- with open(self.wip_path / f"p{j}_summary.json") as f:
- summary = json.load(f)
- for k, v in summary["field_summaries"].items():
- summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
- summaries.append(summary)
+ with open(self.wip_path / f"p{j}.json") as f:
+ summaries.append(IcfPartitionMetadata.fromdict(json.load(f)))
  except FileNotFoundError:
  not_found.append(j)
  if len(not_found) > 0:
@@ -1011,7 +1051,7 @@

  def process_partition(self, partition_index):
  self.load_metadata()
- summary_path = self.wip_path / f"p{partition_index}_summary.json"
+ summary_path = self.wip_path / f"p{partition_index}.json"
  # If someone is rewriting a summary path (for whatever reason), make sure it
  # doesn't look like it's already been completed.
  # NOTE to do this properly we probably need to take a lock on this file - but
@@ -1032,6 +1072,7 @@
  else:
  format_fields.append(field)

+ last_position = None
  with IcfPartitionWriter(
  self.metadata,
  self.path,
@@ -1041,6 +1082,7 @@
  num_records = 0
  for variant in ivcf.variants(partition.region):
  num_records += 1
+ last_position = variant.POS
  tcw.append("CHROM", variant.CHROM)
  tcw.append("POS", variant.POS)
  tcw.append("QUAL", variant.QUAL)
@@ -1065,37 +1107,32 @@
  f"flushing buffers"
  )

- partition_metadata = {
- "num_records": num_records,
- "field_summaries": {k: v.asdict() for k, v in tcw.field_summaries.items()},
- }
+ partition_metadata = IcfPartitionMetadata(
+ num_records=num_records,
+ last_position=last_position,
+ field_summaries=tcw.field_summaries,
+ )
  with open(summary_path, "w") as f:
- json.dump(partition_metadata, f, indent=4)
+ f.write(partition_metadata.asjson())
  logger.info(
- f"Finish p{partition_index} {partition.vcf_path}__{partition.region}="
- f"{num_records} records"
+ f"Finish p{partition_index} {partition.vcf_path}__{partition.region} "
+ f"{num_records} records last_pos={last_position}"
  )

- def process_partition_slice(
- self,
- start,
- stop,
- *,
- worker_processes=1,
- show_progress=False,
- ):
+ def explode(self, *, worker_processes=1, show_progress=False):
  self.load_metadata()
- if start == 0 and stop == self.num_partitions:
- num_records = self.metadata.num_records
- else:
- # We only know the number of records if all partitions are done at once,
- # and we signal this to tqdm by passing None as the total.
+ num_records = self.metadata.num_records
+ if np.isinf(num_records):
+ logger.warning(
+ "Total records unknown, cannot show progress; "
+ "reindex VCFs with bcftools index to fix"
+ )
  num_records = None
- num_columns = len(self.metadata.fields)
+ num_fields = len(self.metadata.fields)
  num_samples = len(self.metadata.samples)
  logger.info(
- f"Exploding columns={num_columns} samples={num_samples}; "
- f"partitions={stop - start} "
+ f"Exploding fields={num_fields} samples={num_samples}; "
+ f"partitions={self.num_partitions} "
  f"variants={'unknown' if num_records is None else num_records}"
  )
  progress_config = core.ProgressConfig(
@@ -1105,48 +1142,43 @@
  show=show_progress,
  )
  with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
- for j in range(start, stop):
+ for j in range(self.num_partitions):
  pwm.submit(self.process_partition, j)

- def explode(self, *, worker_processes=1, show_progress=False):
- self.load_metadata()
- return self.process_partition_slice(
- 0,
- self.num_partitions,
- worker_processes=worker_processes,
- show_progress=show_progress,
- )
-
- def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+ def explode_partition(self, partition):
  self.load_metadata()
  if partition < 0 or partition >= self.num_partitions:
  raise ValueError(
  "Partition index must be in the range 0 <= index < num_partitions"
  )
- return self.process_partition_slice(
- partition,
- partition + 1,
- worker_processes=worker_processes,
- show_progress=show_progress,
- )
+ self.process_partition(partition)

  def finalise(self):
  self.load_metadata()
  partition_summaries = self.load_partition_summaries()
  total_records = 0
  for index, summary in enumerate(partition_summaries):
- partition_records = summary["num_records"]
+ partition_records = summary.num_records
  self.metadata.partitions[index].num_records = partition_records
+ self.metadata.partitions[index].region.end = summary.last_position
  total_records += partition_records
- assert total_records == self.metadata.num_records
+ if not np.isinf(self.metadata.num_records):
+ # Note: this is just telling us that there's a bug in the
+ # index based record counting code, but it doesn't actually
+ # matter much. We may want to just make this a warning if
+ # we hit regular problems.
+ assert total_records == self.metadata.num_records
+ self.metadata.num_records = total_records
+
+ check_overlapping_partitions(self.metadata.partitions)

  for field in self.metadata.fields:
  for summary in partition_summaries:
- field.summary.update(summary["field_summaries"][field.full_name])
+ field.summary.update(summary.field_summaries[field.full_name])

  logger.info("Finalising metadata")
  with open(self.path / "metadata.json", "w") as f:
- json.dump(self.metadata.asdict(), f, indent=4)
+ f.write(self.metadata.asjson())

  logger.debug("Removing WIP directory")
  shutil.rmtree(self.wip_path)
@@ -1197,14 +1229,9 @@ def explode_init(
  )


- # NOTE only including worker_processes here so we can use the 0 option to get the
- # work done syncronously and so we can get test coverage on it. Should find a
- # better way to do this.
- def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+ def explode_partition(icf_path, partition):
  writer = IntermediateColumnarFormatWriter(icf_path)
- writer.explode_partition(
- partition, show_progress=show_progress, worker_processes=worker_processes
- )
+ writer.explode_partition(partition)


  def explode_finalise(icf_path):
@@ -1332,7 +1359,7 @@ class ZarrColumnSpec:
  return chunk_items * dt.itemsize


- ZARR_SCHEMA_FORMAT_VERSION = "0.2"
+ ZARR_SCHEMA_FORMAT_VERSION = "0.3"


  @dataclasses.dataclass
@@ -1341,11 +1368,10 @@ class VcfZarrSchema:
  samples_chunk_size: int
  variants_chunk_size: int
  dimensions: list
- sample_id: list
- contig_id: list
- contig_length: list
- filter_id: list
- columns: dict
+ samples: list
+ contigs: list
+ filters: list
+ fields: dict

  def asdict(self):
  return dataclasses.asdict(self)
@@ -1361,8 +1387,11 @@
  f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
  )
  ret = VcfZarrSchema(**d)
- ret.columns = {
- key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
+ ret.samples = [Sample(**sd) for sd in d["samples"]]
+ ret.contigs = [Contig(**sd) for sd in d["contigs"]]
+ ret.filters = [Filter(**sd) for sd in d["filters"]]
+ ret.fields = {
+ key: ZarrColumnSpec(**value) for key, value in d["fields"].items()
  }
  return ret

@@ -1406,7 +1435,7 @@ class VcfZarrSchema:
  chunks=[variants_chunk_size],
  )

- alt_col = icf.columns["ALT"]
+ alt_col = icf.fields["ALT"]
  max_alleles = alt_col.vcf_field.summary.max_number + 1

  colspecs = [
@@ -1498,12 +1527,11 @@ class VcfZarrSchema:
  format_version=ZARR_SCHEMA_FORMAT_VERSION,
  samples_chunk_size=samples_chunk_size,
  variants_chunk_size=variants_chunk_size,
- columns={col.name: col for col in colspecs},
+ fields={col.name: col for col in colspecs},
  dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
- sample_id=icf.metadata.samples,
- contig_id=icf.metadata.contig_names,
- contig_length=icf.metadata.contig_lengths,
- filter_id=icf.metadata.filters,
+ samples=icf.metadata.samples,
+ contigs=icf.metadata.contigs,
+ filters=icf.metadata.filters,
  )


@@ -1671,7 +1699,7 @@ class VcfZarrWriter:
  store = zarr.DirectoryStore(self.arrays_path)
  root = zarr.group(store=store)

- for column in self.schema.columns.values():
+ for column in self.schema.fields.values():
  self.init_array(root, column, partitions[-1].stop)

  logger.info("Writing WIP metadata")
@@ -1680,13 +1708,13 @@
  return len(partitions)

  def encode_samples(self, root):
- if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
+ if self.schema.samples != self.icf.metadata.samples:
  raise ValueError(
  "Subsetting or reordering samples not supported currently"
  ) # NEEDS TEST
  array = root.array(
  "sample_id",
- self.schema.sample_id,
+ [sample.id for sample in self.schema.samples],
  dtype="str",
  compressor=DEFAULT_ZARR_COMPRESSOR,
  chunks=(self.schema.samples_chunk_size,),
@@ -1697,24 +1725,26 @@
  def encode_contig_id(self, root):
  array = root.array(
  "contig_id",
- self.schema.contig_id,
+ [contig.id for contig in self.schema.contigs],
  dtype="str",
  compressor=DEFAULT_ZARR_COMPRESSOR,
  )
  array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
- if self.schema.contig_length is not None:
+ if all(contig.length is not None for contig in self.schema.contigs):
  array = root.array(
  "contig_length",
- self.schema.contig_length,
+ [contig.length for contig in self.schema.contigs],
  dtype=np.int64,
  compressor=DEFAULT_ZARR_COMPRESSOR,
  )
  array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

  def encode_filter_id(self, root):
+ # TODO need a way to store description also
+ # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
  array = root.array(
  "filter_id",
- self.schema.filter_id,
+ [filt.id for filt in self.schema.filters],
  dtype="str",
  compressor=DEFAULT_ZARR_COMPRESSOR,
  )
@@ -1782,16 +1812,16 @@ class VcfZarrWriter:
  self.encode_filters_partition(partition_index)
  self.encode_contig_partition(partition_index)
  self.encode_alleles_partition(partition_index)
- for col in self.schema.columns.values():
+ for col in self.schema.fields.values():
  if col.vcf_field is not None:
  self.encode_array_partition(col, partition_index)
- if "call_genotype" in self.schema.columns:
+ if "call_genotype" in self.schema.fields:
  self.encode_genotypes_partition(partition_index)

  final_path = self.partition_path(partition_index)
  logger.info(f"Finalising {partition_index} at {final_path}")
  if final_path.exists():
- logger.warning("Removing existing partition at {final_path}")
+ logger.warning(f"Removing existing partition at {final_path}")
  shutil.rmtree(final_path)
  os.rename(partition_path, final_path)

@@ -1813,7 +1843,7 @@

  partition = self.metadata.partitions[partition_index]
  ba = core.BufferedArray(array, partition.start)
- source_col = self.icf.columns[column.vcf_field]
+ source_col = self.icf.fields[column.vcf_field]
  sanitiser = source_col.sanitiser_factory(ba.buff.shape)

  for value in source_col.iter_values(partition.start, partition.stop):
@@ -1836,7 +1866,7 @@ class VcfZarrWriter:
  gt_mask = core.BufferedArray(gt_mask_array, partition.start)
  gt_phased = core.BufferedArray(gt_phased_array, partition.start)

- source_col = self.icf.columns["FORMAT/GT"]
+ source_col = self.icf.fields["FORMAT/GT"]
  for value in source_col.iter_values(partition.start, partition.stop):
  j = gt.next_buffer_row()
  sanitise_value_int_2d(gt.buff, j, value[:, :-1])
@@ -1859,8 +1889,8 @@ class VcfZarrWriter:
  alleles_array = self.init_partition_array(partition_index, array_name)
  partition = self.metadata.partitions[partition_index]
  alleles = core.BufferedArray(alleles_array, partition.start)
- ref_col = self.icf.columns["REF"]
- alt_col = self.icf.columns["ALT"]
+ ref_col = self.icf.fields["REF"]
+ alt_col = self.icf.fields["ALT"]

  for ref, alt in zip(
  ref_col.iter_values(partition.start, partition.stop),
@@ -1880,7 +1910,7 @@ class VcfZarrWriter:
  partition = self.metadata.partitions[partition_index]
  vid = core.BufferedArray(vid_array, partition.start)
  vid_mask = core.BufferedArray(vid_mask_array, partition.start)
- col = self.icf.columns["ID"]
+ col = self.icf.fields["ID"]

  for value in col.iter_values(partition.start, partition.stop):
  j = vid.next_buffer_row()
@@ -1899,13 +1929,13 @@ class VcfZarrWriter:
  self.finalise_partition_array(partition_index, "variant_id_mask")

  def encode_filters_partition(self, partition_index):
- lookup = {filt: index for index, filt in enumerate(self.schema.filter_id)}
+ lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
  array_name = "variant_filter"
  array = self.init_partition_array(partition_index, array_name)
  partition = self.metadata.partitions[partition_index]
  var_filter = core.BufferedArray(array, partition.start)

- col = self.icf.columns["FILTERS"]
+ col = self.icf.fields["FILTERS"]
  for value in col.iter_values(partition.start, partition.stop):
  j = var_filter.next_buffer_row()
  var_filter.buff[j] = False
@@ -1921,12 +1951,12 @@ class VcfZarrWriter:
  self.finalise_partition_array(partition_index, array_name)

  def encode_contig_partition(self, partition_index):
- lookup = {contig: index for index, contig in enumerate(self.schema.contig_id)}
+ lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
  array_name = "variant_contig"
  array = self.init_partition_array(partition_index, array_name)
  partition = self.metadata.partitions[partition_index]
  contig = core.BufferedArray(array, partition.start)
- col = self.icf.columns["CHROM"]
+ col = self.icf.fields["CHROM"]

  for value in col.iter_values(partition.start, partition.stop):
  j = contig.next_buffer_row()
@@ -1986,7 +2016,7 @@ class VcfZarrWriter:
  raise FileNotFoundError(f"Partitions not encoded: {missing}")

  progress_config = core.ProgressConfig(
- total=len(self.schema.columns),
+ total=len(self.schema.fields),
  title="Finalise",
  units="array",
  show=show_progress,
@@ -2000,7 +2030,7 @@
  # for multiple workers, or making a standard wrapper for tqdm
  # that allows us to have a consistent look and feel.
  with core.ParallelWorkManager(0, progress_config) as pwm:
- for name in self.schema.columns:
+ for name in self.schema.fields:
  pwm.submit(self.finalise_array, name)
  logger.debug(f"Removing {self.wip_path}")
  shutil.rmtree(self.wip_path)
@@ -2016,18 +2046,17 @@
  Return the approximate maximum memory used to encode a variant chunk.
  """
  max_encoding_mem = max(
- col.variant_chunk_nbytes for col in self.schema.columns.values()
+ col.variant_chunk_nbytes for col in self.schema.fields.values()
  )
  gt_mem = 0
- if "call_genotype" in self.schema.columns:
+ if "call_genotype" in self.schema.fields:
  encoded_together = [
  "call_genotype",
  "call_genotype_phased",
  "call_genotype_mask",
  ]
  gt_mem = sum(
- self.schema.columns[col].variant_chunk_nbytes
- for col in encoded_together
+ self.schema.fields[col].variant_chunk_nbytes for col in encoded_together
  )
  return max(max_encoding_mem, gt_mem)

@@ -2059,7 +2088,7 @@ class VcfZarrWriter:
  num_workers = min(max_num_workers, worker_processes)

  total_bytes = 0
- for col in self.schema.columns.values():
+ for col in self.schema.fields.values():
  # Open the array definition to get the total size
  total_bytes += zarr.open(self.arrays_path / col.name).nbytes

{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf_utils.py

@@ -76,6 +76,10 @@ def read_bytes_as_tuple(f: IO[Any], fmt: str) -> Sequence[Any]:

  @dataclass
  class Region:
+ """
+ A htslib style region, where coordinates are 1-based and inclusive.
+ """
+
  contig: str
  start: Optional[int] = None
  end: Optional[int] = None
@@ -86,7 +90,7 @@ class Region:
  assert self.start > 0
  if self.end is not None:
  self.end = int(self.end)
- assert self.end > self.start
+ assert self.end >= self.start

  def __str__(self):
  s = f"{self.contig}"
@@ -113,6 +117,9 @@ class CSIBin:
  chunks: Sequence[Chunk]


+ RECORD_COUNT_UNKNOWN = np.inf
+
+
  @dataclass
  class CSIIndex:
  min_shift: int
@@ -221,7 +228,9 @@ def read_csi(
  for _ in range(n_ref):
  n_bin = read_bytes_as_value(f, "<i")
  seq_bins = []
- record_count = -1
+ # Distinguish between counts that are zero because the sequence
+ # isn't there, vs counts that aren't in the index.
+ record_count = 0 if n_bin == 0 else RECORD_COUNT_UNKNOWN
  for _ in range(n_bin):
  bin, loffset, n_chunk = read_bytes_as_tuple(f, "<IQi")
  chunks = []
@@ -337,7 +346,9 @@ def read_tabix(
  for _ in range(header.n_ref):
  n_bin = read_bytes_as_value(f, "<i")
  seq_bins = []
- record_count = -1
+ # Distinguish between counts that are zero because the sequence
+ # isn't there, vs counts that aren't in the index.
+ record_count = 0 if n_bin == 0 else RECORD_COUNT_UNKNOWN
  for _ in range(n_bin):
  bin, n_chunk = read_bytes_as_tuple(f, "<Ii")
  chunks = []
@@ -436,19 +447,16 @@ class IndexedVcf(contextlib.AbstractContextManager):
  if var.POS >= start:
  yield var

- def _filter_empty(self, regions):
+ def _filter_empty_and_refine(self, regions):
  """
- Return all regions in the specified list that have one or more records.
-
- Sometimes with Tabix indexes these seem to crop up:
-
- - https://github.com/sgkit-dev/bio2zarr/issues/45
- - https://github.com/sgkit-dev/bio2zarr/issues/120
+ Return all regions in the specified list that have one or more records,
+ and refine the start coordinate of the region to be the actual first coord
  """
  ret = []
  for region in regions:
- variants = self.variants(region)
- if next(variants, None) is not None:
+ var = next(self.variants(region), None)
+ if var is not None:
+ region.start = var.POS
  ret.append(region)
  return ret

@@ -528,4 +536,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
  if self.index.record_counts[ri] > 0:
  regions.append(Region(self.sequence_names[ri]))

- return self._filter_empty(regions)
+ return self._filter_empty_and_refine(regions)
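The Region hunks above make the htslib-style semantics explicit (1-based, inclusive coordinates) and relax the validation so that single-position regions, where end equals start, are accepted. A minimal standalone sketch of that validation logic, re-created here for illustration rather than taken verbatim from bio2zarr:

```python
# Standalone illustration of the 1-based, inclusive region semantics shown in
# the vcf_utils.py hunks above; field names mirror the Region dataclass there,
# but this is a re-creation, not the library class itself.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Region:
    contig: str
    start: Optional[int] = None
    end: Optional[int] = None

    def __post_init__(self):
        if self.start is not None:
            self.start = int(self.start)
            assert self.start > 0  # 1-based coordinates
        if self.end is not None:
            self.end = int(self.end)
            # 0.0.9 uses >=, so a single-base region (end == start) is valid
            assert self.end >= self.start


# A one-record region such as chr20:60000-60000 no longer trips the assertion.
print(Region("chr20", 60000, 60000))
```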
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bio2zarr
- Version: 0.0.8
+ Version: 0.0.9
  Summary: Convert bioinformatics data to Zarr
  Author-email: sgkit Developers <project@sgkit.dev>
  License: Apache License
@@ -234,7 +234,7 @@ Requires-Dist: pysam; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
  Requires-Dist: pytest-coverage; extra == "dev"
  Requires-Dist: pytest-xdist; extra == "dev"
- Requires-Dist: sgkit; extra == "dev"
+ Requires-Dist: sgkit>=0.8.0; extra == "dev"
  Requires-Dist: tqdm; extra == "dev"

  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/requires.txt

@@ -13,5 +13,5 @@ pysam
  pytest
  pytest-coverage
  pytest-xdist
- sgkit
+ sgkit>=0.8.0
  tqdm
{bio2zarr-0.0.8 → bio2zarr-0.0.9}/pyproject.toml

@@ -53,7 +53,7 @@ dev = [
  "pytest",
  "pytest-coverage",
  "pytest-xdist",
- "sgkit",
+ "sgkit>=0.8.0",
  "tqdm"
  ]

{bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation-data/Makefile

@@ -39,9 +39,14 @@ all: 1kg_2020_chr20.bcf.csi \
  # 1000 genomes phase 1
  1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz

+ old_tabix:
+ rm -fR tabix old_tabix
+ git clone https://github.com/samtools/tabix.git
+ cd tabix && make
+ cp tabix/tabix ./old_tabix

  %.vcf.gz.tbi: %.vcf.gz
- tabix $<
+ ./old_tabix $<

  %.2.split: %
  ./split.sh $< 2