bio2zarr 0.0.8__tar.gz → 0.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of bio2zarr might be problematic.
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/CHANGELOG.md +8 -1
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/PKG-INFO +2 -2
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/_version.py +2 -2
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/cli.py +1 -1
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf.py +192 -163
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/vcf_utils.py +21 -13
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/PKG-INFO +2 -2
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/requires.txt +1 -1
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/pyproject.toml +1 -1
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation-data/Makefile +6 -1
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.github/workflows/ci.yml +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.github/workflows/docs.yml +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.gitignore +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/.pre-commit-config.yaml +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/LICENSE +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/MANIFEST.in +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/README.md +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/__init__.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/__main__.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/core.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/plink.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/provenance.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr/typing.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/SOURCES.txt +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/dependency_links.txt +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/entry_points.txt +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/bio2zarr.egg-info/top_level.txt +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/Makefile +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/_config.yml +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/_toc.yml +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/build.sh +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/cli.md +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/intro.md +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/logo.png +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/references.bib +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/docs/requirements.txt +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/setup.cfg +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation-data/split.sh +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/validation.py +0 -0
- {bio2zarr-0.0.8 → bio2zarr-0.0.9}/vcf_generator.py +0 -0
```diff
--- bio2zarr-0.0.8/CHANGELOG.md
+++ bio2zarr-0.0.9/CHANGELOG.md
@@ -1,4 +1,11 @@
-# 0.0.8 2024-04-30
+# 0.0.9 2024-05-02
+
+- Change on-disk format for explode and schema
+- Support older tabix indexes
+- Fix some bugs in explode
+
+# 0.0.8 2024-04-30
+
 - Change on-disk format of distributed encode and simplify
 - Check for all partitions nominally completed encoding before doing
   anything destructive in dencode-finalise
```
```diff
--- bio2zarr-0.0.8/PKG-INFO
+++ bio2zarr-0.0.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.8
+Version: 0.0.9
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License: Apache License
@@ -234,7 +234,7 @@ Requires-Dist: pysam; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-coverage; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
-Requires-Dist: sgkit; extra == "dev"
+Requires-Dist: sgkit>=0.8.0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 
 [](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
```
```diff
--- bio2zarr-0.0.8/bio2zarr/cli.py
+++ bio2zarr-0.0.9/bio2zarr/cli.py
@@ -233,7 +233,7 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition
+    vcf.explode_partition(icf_path, partition)
 
 
 @click.command
```
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -1,7 +1,6 @@
 import collections
 import contextlib
 import dataclasses
-import functools
 import json
 import logging
 import math
@@ -145,29 +144,41 @@ class VcfPartition:
     num_records: int = -1
 
 
-ICF_METADATA_FORMAT_VERSION = "0.
+ICF_METADATA_FORMAT_VERSION = "0.3"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )
 
-
-
-
+
+@dataclasses.dataclass
+class Contig:
+    id: str
+    length: int = None
+
+
+@dataclasses.dataclass
+class Sample:
+    id: str
+
+
+@dataclasses.dataclass
+class Filter:
+    id: str
+    description: str = ""
 
 
 @dataclasses.dataclass
 class IcfMetadata:
     samples: list
-
-    contig_record_counts: dict
+    contigs: list
     filters: list
    fields: list
     partitions: list = None
-    contig_lengths: list = None
     format_version: str = None
     compressor: dict = None
     column_chunk_size: int = None
     provenance: dict = None
+    num_records: int = -1
 
     @property
     def info_fields(self):
```
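The new `Contig`, `Sample`, and `Filter` dataclasses replace the bare strings and parallel lists that 0.0.8 stored in the metadata. A minimal sketch of how they round-trip through plain dicts (the mechanism `IcfMetadata.fromdict` relies on); the example values are made up:

```python
import dataclasses


@dataclasses.dataclass
class Contig:
    id: str
    length: int = None  # old tabix indexes may not carry lengths


@dataclasses.dataclass
class Filter:
    id: str
    description: str = ""


contigs = [Contig("chr20", 64444167), Contig("scaffold_1")]
print(dataclasses.asdict(contigs[1]))
# {'id': 'scaffold_1', 'length': None}

# Round-trip back from plain dicts, as IcfMetadata.fromdict does.
restored = [Contig(**dataclasses.asdict(c)) for c in contigs]
assert restored == contigs
```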
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -187,16 +198,12 @@ class IcfMetadata:
 
     @property
     def num_contigs(self):
-        return len(self.
+        return len(self.contigs)
 
     @property
     def num_filters(self):
         return len(self.filters)
 
-    @property
-    def num_records(self):
-        return sum(self.contig_record_counts.values())
-
     @staticmethod
     def fromdict(d):
         if d["format_version"] != ICF_METADATA_FORMAT_VERSION:
@@ -204,18 +211,23 @@
                 "Intermediate columnar metadata format version mismatch: "
                 f"{d['format_version']} != {ICF_METADATA_FORMAT_VERSION}"
             )
-        fields = [VcfField.fromdict(fd) for fd in d["fields"]]
         partitions = [VcfPartition(**pd) for pd in d["partitions"]]
         for p in partitions:
             p.region = vcf_utils.Region(**p.region)
         d = d.copy()
-        d["fields"] = fields
         d["partitions"] = partitions
+        d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
+        d["samples"] = [Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def asdict(self):
         return dataclasses.asdict(self)
 
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
 
 def fixed_vcf_field_definitions():
     def make_field_def(name, vcf_type, vcf_number):
```
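`fromdict` now rebuilds every nested dataclass from the stored JSON, and the new `asjson` helper centralises serialisation. A sketch of the pattern using a cut-down stand-in for `IcfMetadata` (the real class has many more fields):

```python
import dataclasses
import json

FORMAT_VERSION = "0.3"  # mirrors ICF_METADATA_FORMAT_VERSION


@dataclasses.dataclass
class MiniMetadata:  # stand-in; same asdict/asjson/fromdict shape as IcfMetadata
    format_version: str
    samples: list

    def asdict(self):
        return dataclasses.asdict(self)

    def asjson(self):
        return json.dumps(self.asdict(), indent=4)

    @staticmethod
    def fromdict(d):
        if d["format_version"] != FORMAT_VERSION:
            raise ValueError("metadata format version mismatch")
        return MiniMetadata(**d)


m = MiniMetadata(format_version=FORMAT_VERSION, samples=["NA12878"])
assert MiniMetadata.fromdict(json.loads(m.asjson())) == m
```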
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -243,15 +255,22 @@ def fixed_vcf_field_definitions():
 def scan_vcf(path, target_num_partitions):
     with vcf_utils.IndexedVcf(path) as indexed_vcf:
         vcf = indexed_vcf.vcf
-        filters = [
-
-
-            if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str)
-
+        filters = []
+        pass_index = -1
+        for h in vcf.header_iter():
+            if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str):
+                try:
+                    description = h["Description"].strip('"')
+                except KeyError:
+                    description = ""
+                if h["ID"] == "PASS":
+                    pass_index = len(filters)
+                filters.append(Filter(h["ID"], description))
+
         # Ensure PASS is the first filter if present
-        if
-            filters.
-            filters.insert(0,
+        if pass_index > 0:
+            pass_filter = filters.pop(pass_index)
+            filters.insert(0, pass_filter)
 
         fields = fixed_vcf_field_definitions()
         for h in vcf.header_iter():
```
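The comprehension from 0.0.8 is unrolled into an explicit loop so the FILTER description can be captured and the position of PASS remembered. The reordering itself is a plain pop/insert; a sketch with made-up filter IDs:

```python
filters = ["q10", "PASS", "s50"]  # header order, PASS not first
pass_index = filters.index("PASS")
if pass_index > 0:
    filters.insert(0, filters.pop(pass_index))
print(filters)
# ['PASS', 'q10', 's50']
```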
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -262,18 +281,22 @@ def scan_vcf(path, target_num_partitions):
                 field.vcf_number = "."
             fields.append(field)
 
+        try:
+            contig_lengths = vcf.seqlens
+        except AttributeError:
+            contig_lengths = [None for _ in vcf.seqnames]
+
         metadata = IcfMetadata(
-            samples=vcf.samples,
-
-
+            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            contigs=[
+                Contig(contig_id, length)
+                for contig_id, length in zip(vcf.seqnames, contig_lengths)
+            ],
             filters=filters,
             fields=fields,
             partitions=[],
+            num_records=sum(indexed_vcf.contig_record_counts().values()),
         )
-        try:
-            metadata.contig_lengths = vcf.seqlens
-        except AttributeError:
-            pass
 
         regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
         logger.info(
```
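`vcf.seqlens` is only present when the header declares contig lengths, hence the `AttributeError` fallback to a list of `None`s; zipping then pairs every contig name with a length or `None`. A sketch of the fallback, with a stand-in object in place of a cyvcf2 `VCF`:

```python
class HeaderWithoutLengths:
    """Stand-in for a cyvcf2.VCF whose header has no contig lengths."""

    seqnames = ["chr20", "chr21"]
    # no seqlens attribute, so accessing it raises AttributeError


vcf = HeaderWithoutLengths()
try:
    contig_lengths = vcf.seqlens
except AttributeError:
    contig_lengths = [None for _ in vcf.seqnames]

print(list(zip(vcf.seqnames, contig_lengths)))
# [('chr20', None), ('chr21', None)]
```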
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -292,22 +315,6 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def check_overlap(partitions):
-    for i in range(1, len(partitions)):
-        prev_region = partitions[i - 1].region
-        current_region = partitions[i].region
-        if prev_region.contig == current_region.contig:
-            if prev_region.end is None:
-                logger.warning("Cannot check overlaps; issue #146")
-                continue
-            if prev_region.end > current_region.start:
-                raise ValueError(
-                    f"Multiple VCFs have the region "
-                    f"{prev_region.contig}:{prev_region.start}-"
-                    f"{current_region.end}"
-                )
-
-
 def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
@@ -336,27 +343,30 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # We just take the first header, assuming the others
     # are compatible.
     all_partitions = []
-
+    total_records = 0
     for metadata, _ in results:
-
-
-
-        metadata.
+        for partition in metadata.partitions:
+            logger.debug(f"Scanned partition {partition}")
+            all_partitions.append(partition)
+        total_records += metadata.num_records
+        metadata.num_records = 0
+        metadata.partitions = []
 
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
             raise ValueError("Incompatible VCF chunks")
 
-
+    # Note: this will be infinity here if any of the chunks has an index
+    # that doesn't keep track of the number of records per-contig
+    icf_metadata.num_records = total_records
 
     # Sort by contig (in the order they appear in the header) first,
     # then by start coordinate
-    contig_index_map = {contig: j for j, contig in enumerate(metadata.
+    contig_index_map = {contig.id: j for j, contig in enumerate(metadata.contigs)}
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
-    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
```
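Because a chunk whose index lacks per-contig counts reports `np.inf` records, summing the per-file totals propagates the unknown state for free: infinity plus anything finite is infinity. A one-liner illustration:

```python
import numpy as np

per_file = [10_000, np.inf, 25_000]  # middle file has an old-style index
total = sum(per_file)
print(total, np.isinf(total))
# inf True  -- downstream code treats inf as "record count unknown"
```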
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -853,19 +863,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             self.metadata = IcfMetadata.fromdict(json.load(f))
         with open(self.path / "header.txt") as f:
             self.vcf_header = f.read()
-
         self.compressor = numcodecs.get_codec(self.metadata.compressor)
-        self.
+        self.fields = {}
         partition_num_records = [
             partition.num_records for partition in self.metadata.partitions
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
         for field in self.metadata.fields:
-            self.
+            self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
-            f"records={self.num_records},
+            f"records={self.num_records}, fields={self.num_fields})"
         )
 
     def __repr__(self):
@@ -876,17 +885,17 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         )
 
     def __getitem__(self, key):
-        return self.
+        return self.fields[key]
 
     def __iter__(self):
-        return iter(self.
+        return iter(self.fields)
 
     def __len__(self):
-        return len(self.
+        return len(self.fields)
 
     def summary_table(self):
         data = []
-        for name, col in self.
+        for name, col in self.fields.items():
             summary = col.vcf_field.summary
             d = {
                 "name": name,
@@ -902,9 +911,9 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
-    @
+    @property
     def num_records(self):
-        return
+        return self.metadata.num_records
 
     @property
     def num_partitions(self):
```
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -915,8 +924,42 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         return len(self.metadata.samples)
 
     @property
-    def
-        return len(self.
+    def num_fields(self):
+        return len(self.fields)
+
+
+@dataclasses.dataclass
+class IcfPartitionMetadata:
+    num_records: int
+    last_position: int
+    field_summaries: dict
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
+
+    @staticmethod
+    def fromdict(d):
+        md = IcfPartitionMetadata(**d)
+        for k, v in md.field_summaries.items():
+            md.field_summaries[k] = VcfFieldSummary.fromdict(v)
+        return md
+
+
+def check_overlapping_partitions(partitions):
+    for i in range(1, len(partitions)):
+        prev_region = partitions[i - 1].region
+        current_region = partitions[i].region
+        if prev_region.contig == current_region.contig:
+            assert prev_region.end is not None
+            # Regions are *inclusive*
+            if prev_region.end >= current_region.start:
+                raise ValueError(
+                    f"Overlapping VCF regions in partitions {i - 1} and {i}: "
+                    f"{prev_region} and {current_region}"
+                )
 
 
 class IntermediateColumnarFormatWriter:
```
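With inclusive region coordinates, two partitions on the same contig overlap whenever the earlier one's `end` is greater than or equal to the later one's `start`. A sketch of the check with a stand-in `Region` (the real one lives in `bio2zarr.vcf_utils`):

```python
import dataclasses


@dataclasses.dataclass
class Region:  # stand-in: 1-based, inclusive coordinates
    contig: str
    start: int
    end: int


regions = [
    Region("chr20", 1, 500_000),
    Region("chr20", 500_000, 900_000),  # starts exactly at the previous end
]
prev, cur = regions
if prev.contig == cur.contig and prev.end >= cur.start:
    print(f"Overlapping VCF regions: {prev} and {cur}")
```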
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -990,11 +1033,8 @@
         not_found = []
         for j in range(self.num_partitions):
             try:
-                with open(self.wip_path / f"p{j}
-
-                    for k, v in summary["field_summaries"].items():
-                        summary["field_summaries"][k] = VcfFieldSummary.fromdict(v)
-                    summaries.append(summary)
+                with open(self.wip_path / f"p{j}.json") as f:
+                    summaries.append(IcfPartitionMetadata.fromdict(json.load(f)))
             except FileNotFoundError:
                 not_found.append(j)
         if len(not_found) > 0:
@@ -1011,7 +1051,7 @@
 
     def process_partition(self, partition_index):
         self.load_metadata()
-        summary_path = self.wip_path / f"p{partition_index}
+        summary_path = self.wip_path / f"p{partition_index}.json"
         # If someone is rewriting a summary path (for whatever reason), make sure it
         # doesn't look like it's already been completed.
         # NOTE to do this properly we probably need to take a lock on this file - but
@@ -1032,6 +1072,7 @@
             else:
                 format_fields.append(field)
 
+        last_position = None
         with IcfPartitionWriter(
             self.metadata,
             self.path,
@@ -1041,6 +1082,7 @@
             num_records = 0
             for variant in ivcf.variants(partition.region):
                 num_records += 1
+                last_position = variant.POS
                 tcw.append("CHROM", variant.CHROM)
                 tcw.append("POS", variant.POS)
                 tcw.append("QUAL", variant.QUAL)
@@ -1065,37 +1107,32 @@
                         f"flushing buffers"
                     )
 
-        partition_metadata =
-
-
-
+        partition_metadata = IcfPartitionMetadata(
+            num_records=num_records,
+            last_position=last_position,
+            field_summaries=tcw.field_summaries,
+        )
         with open(summary_path, "w") as f:
-
+            f.write(partition_metadata.asjson())
         logger.info(
-            f"Finish p{partition_index} {partition.vcf_path}__{partition.region}
-            f"{num_records} records"
+            f"Finish p{partition_index} {partition.vcf_path}__{partition.region} "
+            f"{num_records} records last_pos={last_position}"
         )
 
-    def
-        self,
-        start,
-        stop,
-        *,
-        worker_processes=1,
-        show_progress=False,
-    ):
+    def explode(self, *, worker_processes=1, show_progress=False):
         self.load_metadata()
-
-
-
-
-
+        num_records = self.metadata.num_records
+        if np.isinf(num_records):
+            logger.warning(
+                "Total records unknown, cannot show progress; "
+                "reindex VCFs with bcftools index to fix"
+            )
             num_records = None
-
+        num_fields = len(self.metadata.fields)
         num_samples = len(self.metadata.samples)
         logger.info(
-            f"Exploding
-            f"partitions={
+            f"Exploding fields={num_fields} samples={num_samples}; "
+            f"partitions={self.num_partitions} "
             f"variants={'unknown' if num_records is None else num_records}"
         )
         progress_config = core.ProgressConfig(
@@ -1105,48 +1142,43 @@
             show=show_progress,
         )
         with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            for j in range(
+            for j in range(self.num_partitions):
                 pwm.submit(self.process_partition, j)
 
-    def
-        self.load_metadata()
-        return self.process_partition_slice(
-            0,
-            self.num_partitions,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
-
-    def explode_partition(self, partition, *, show_progress=False, worker_processes=1):
+    def explode_partition(self, partition):
         self.load_metadata()
         if partition < 0 or partition >= self.num_partitions:
             raise ValueError(
                 "Partition index must be in the range 0 <= index < num_partitions"
             )
-
-            partition,
-            partition + 1,
-            worker_processes=worker_processes,
-            show_progress=show_progress,
-        )
+        self.process_partition(partition)
 
     def finalise(self):
         self.load_metadata()
         partition_summaries = self.load_partition_summaries()
         total_records = 0
         for index, summary in enumerate(partition_summaries):
-            partition_records = summary
+            partition_records = summary.num_records
             self.metadata.partitions[index].num_records = partition_records
+            self.metadata.partitions[index].region.end = summary.last_position
             total_records += partition_records
-
+        if not np.isinf(self.metadata.num_records):
+            # Note: this is just telling us that there's a bug in the
+            # index based record counting code, but it doesn't actually
+            # matter much. We may want to just make this a warning if
+            # we hit regular problems.
+            assert total_records == self.metadata.num_records
+        self.metadata.num_records = total_records
+
+        check_overlapping_partitions(self.metadata.partitions)
 
         for field in self.metadata.fields:
             for summary in partition_summaries:
-                field.summary.update(summary
+                field.summary.update(summary.field_summaries[field.full_name])
 
         logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
-
+            f.write(self.metadata.asjson())
 
         logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
@@ -1197,14 +1229,9 @@ def explode_init(
     )
 
 
-
-# work done syncronously and so we can get test coverage on it. Should find a
-# better way to do this.
-def explode_partition(icf_path, partition, *, show_progress=False, worker_processes=1):
+def explode_partition(icf_path, partition):
     writer = IntermediateColumnarFormatWriter(icf_path)
-    writer.explode_partition(
-        partition, show_progress=show_progress, worker_processes=worker_processes
-    )
+    writer.explode_partition(partition)
 
 
 def explode_finalise(icf_path):
@@ -1332,7 +1359,7 @@ class ZarrColumnSpec:
         return chunk_items * dt.itemsize
 
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.
+ZARR_SCHEMA_FORMAT_VERSION = "0.3"
 
 
 @dataclasses.dataclass
@@ -1341,11 +1368,10 @@ class VcfZarrSchema:
     samples_chunk_size: int
     variants_chunk_size: int
     dimensions: list
-
-
-
-
-    columns: dict
+    samples: list
+    contigs: list
+    filters: list
+    fields: dict
 
     def asdict(self):
         return dataclasses.asdict(self)
@@ -1361,8 +1387,11 @@
             f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
         )
         ret = VcfZarrSchema(**d)
-        ret.
-
+        ret.samples = [Sample(**sd) for sd in d["samples"]]
+        ret.contigs = [Contig(**sd) for sd in d["contigs"]]
+        ret.filters = [Filter(**sd) for sd in d["filters"]]
+        ret.fields = {
+            key: ZarrColumnSpec(**value) for key, value in d["fields"].items()
         }
         return ret
 
@@ -1406,7 +1435,7 @@
             chunks=[variants_chunk_size],
         )
 
-        alt_col = icf.
+        alt_col = icf.fields["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
 
         colspecs = [
@@ -1498,12 +1527,11 @@
             format_version=ZARR_SCHEMA_FORMAT_VERSION,
             samples_chunk_size=samples_chunk_size,
             variants_chunk_size=variants_chunk_size,
-
+            fields={col.name: col for col in colspecs},
             dimensions=["variants", "samples", "ploidy", "alleles", "filters"],
-
-
-
-            filter_id=icf.metadata.filters,
+            samples=icf.metadata.samples,
+            contigs=icf.metadata.contigs,
+            filters=icf.metadata.filters,
         )
 
 
@@ -1671,7 +1699,7 @@ class VcfZarrWriter:
         store = zarr.DirectoryStore(self.arrays_path)
         root = zarr.group(store=store)
 
-        for column in self.schema.
+        for column in self.schema.fields.values():
             self.init_array(root, column, partitions[-1].stop)
 
         logger.info("Writing WIP metadata")
@@ -1680,13 +1708,13 @@
         return len(partitions)
 
     def encode_samples(self, root):
-        if
+        if self.schema.samples != self.icf.metadata.samples:
             raise ValueError(
                 "Subsetting or reordering samples not supported currently"
             )  # NEEDS TEST
         array = root.array(
             "sample_id",
-            self.schema.
+            [sample.id for sample in self.schema.samples],
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
@@ -1697,24 +1725,26 @@
     def encode_contig_id(self, root):
         array = root.array(
             "contig_id",
-            self.schema.
+            [contig.id for contig in self.schema.contigs],
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if
+        if all(contig.length is not None for contig in self.schema.contigs):
             array = root.array(
                 "contig_length",
-                self.schema.
+                [contig.length for contig in self.schema.contigs],
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
 
     def encode_filter_id(self, root):
+        # TODO need a way to store description also
+        # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
         array = root.array(
             "filter_id",
-            self.schema.
+            [filt.id for filt in self.schema.filters],
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
```
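The `encode_*` methods now project IDs and lengths out of the dataclasses before handing plain lists to zarr. A sketch of the resulting arrays, writing to an in-memory group (assuming zarr-python v2 semantics, as used here, and made-up contig values):

```python
import numpy as np
import zarr

contigs = [("chr20", 64_444_167), ("chr21", 46_709_983)]  # assumed values

root = zarr.group()  # in-memory store
arr = root.array("contig_id", [cid for cid, _ in contigs], dtype="str")
arr.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

# contig_length is only written when every contig has a known length.
if all(length is not None for _, length in contigs):
    arr = root.array("contig_length", [length for _, length in contigs],
                     dtype=np.int64)
    arr.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]

print(root["contig_id"][:])      # ['chr20' 'chr21']
print(root["contig_length"][:])  # [64444167 46709983]
```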
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf.py
+++ bio2zarr-0.0.9/bio2zarr/vcf.py
@@ -1782,16 +1812,16 @@
         self.encode_filters_partition(partition_index)
         self.encode_contig_partition(partition_index)
         self.encode_alleles_partition(partition_index)
-        for col in self.schema.
+        for col in self.schema.fields.values():
             if col.vcf_field is not None:
                 self.encode_array_partition(col, partition_index)
-        if "call_genotype" in self.schema.
+        if "call_genotype" in self.schema.fields:
             self.encode_genotypes_partition(partition_index)
 
         final_path = self.partition_path(partition_index)
         logger.info(f"Finalising {partition_index} at {final_path}")
         if final_path.exists():
-            logger.warning("Removing existing partition at {final_path}")
+            logger.warning(f"Removing existing partition at {final_path}")
             shutil.rmtree(final_path)
         os.rename(partition_path, final_path)
 
@@ -1813,7 +1843,7 @@
 
         partition = self.metadata.partitions[partition_index]
         ba = core.BufferedArray(array, partition.start)
-        source_col = self.icf.
+        source_col = self.icf.fields[column.vcf_field]
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
 
         for value in source_col.iter_values(partition.start, partition.stop):
@@ -1836,7 +1866,7 @@
         gt_mask = core.BufferedArray(gt_mask_array, partition.start)
         gt_phased = core.BufferedArray(gt_phased_array, partition.start)
 
-        source_col = self.icf.
+        source_col = self.icf.fields["FORMAT/GT"]
         for value in source_col.iter_values(partition.start, partition.stop):
             j = gt.next_buffer_row()
             sanitise_value_int_2d(gt.buff, j, value[:, :-1])
@@ -1859,8 +1889,8 @@
         alleles_array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
         alleles = core.BufferedArray(alleles_array, partition.start)
-        ref_col = self.icf.
-        alt_col = self.icf.
+        ref_col = self.icf.fields["REF"]
+        alt_col = self.icf.fields["ALT"]
 
         for ref, alt in zip(
             ref_col.iter_values(partition.start, partition.stop),
@@ -1880,7 +1910,7 @@
         partition = self.metadata.partitions[partition_index]
         vid = core.BufferedArray(vid_array, partition.start)
         vid_mask = core.BufferedArray(vid_mask_array, partition.start)
-        col = self.icf.
+        col = self.icf.fields["ID"]
 
         for value in col.iter_values(partition.start, partition.stop):
             j = vid.next_buffer_row()
@@ -1899,13 +1929,13 @@
         self.finalise_partition_array(partition_index, "variant_id_mask")
 
     def encode_filters_partition(self, partition_index):
-        lookup = {filt: index for index, filt in enumerate(self.schema.
+        lookup = {filt.id: index for index, filt in enumerate(self.schema.filters)}
         array_name = "variant_filter"
         array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
         var_filter = core.BufferedArray(array, partition.start)
 
-        col = self.icf.
+        col = self.icf.fields["FILTERS"]
         for value in col.iter_values(partition.start, partition.stop):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
@@ -1921,12 +1951,12 @@
         self.finalise_partition_array(partition_index, array_name)
 
     def encode_contig_partition(self, partition_index):
-        lookup = {contig: index for index, contig in enumerate(self.schema.
+        lookup = {contig.id: index for index, contig in enumerate(self.schema.contigs)}
         array_name = "variant_contig"
         array = self.init_partition_array(partition_index, array_name)
         partition = self.metadata.partitions[partition_index]
         contig = core.BufferedArray(array, partition.start)
-        col = self.icf.
+        col = self.icf.fields["CHROM"]
 
         for value in col.iter_values(partition.start, partition.stop):
             j = contig.next_buffer_row()
@@ -1986,7 +2016,7 @@
             raise FileNotFoundError(f"Partitions not encoded: {missing}")
 
         progress_config = core.ProgressConfig(
-            total=len(self.schema.
+            total=len(self.schema.fields),
             title="Finalise",
             units="array",
             show=show_progress,
@@ -2000,7 +2030,7 @@
         # for multiple workers, or making a standard wrapper for tqdm
         # that allows us to have a consistent look and feel.
         with core.ParallelWorkManager(0, progress_config) as pwm:
-            for name in self.schema.
+            for name in self.schema.fields:
                 pwm.submit(self.finalise_array, name)
         logger.debug(f"Removing {self.wip_path}")
         shutil.rmtree(self.wip_path)
@@ -2016,18 +2046,17 @@
         Return the approximate maximum memory used to encode a variant chunk.
         """
         max_encoding_mem = max(
-            col.variant_chunk_nbytes for col in self.schema.
+            col.variant_chunk_nbytes for col in self.schema.fields.values()
        )
         gt_mem = 0
-        if "call_genotype" in self.schema.
+        if "call_genotype" in self.schema.fields:
             encoded_together = [
                 "call_genotype",
                 "call_genotype_phased",
                 "call_genotype_mask",
             ]
             gt_mem = sum(
-                self.schema.
-                for col in encoded_together
+                self.schema.fields[col].variant_chunk_nbytes for col in encoded_together
             )
         return max(max_encoding_mem, gt_mem)
 
@@ -2059,7 +2088,7 @@
         num_workers = min(max_num_workers, worker_processes)
 
         total_bytes = 0
-        for col in self.schema.
+        for col in self.schema.fields.values():
            # Open the array definition to get the total size
            total_bytes += zarr.open(self.arrays_path / col.name).nbytes
```
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf_utils.py
+++ bio2zarr-0.0.9/bio2zarr/vcf_utils.py
@@ -76,6 +76,10 @@ def read_bytes_as_tuple(f: IO[Any], fmt: str) -> Sequence[Any]:
 
 @dataclass
 class Region:
+    """
+    A htslib style region, where coordinates are 1-based and inclusive.
+    """
+
     contig: str
     start: Optional[int] = None
     end: Optional[int] = None
@@ -86,7 +90,7 @@ class Region:
         assert self.start > 0
         if self.end is not None:
             self.end = int(self.end)
-            assert self.end
+            assert self.end >= self.start
 
     def __str__(self):
         s = f"{self.contig}"
```
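Region coordinates here are htslib-style: 1-based and inclusive at both ends, so a single-base region has `end == start` and must pass the assertion. A small illustration of the invariants, using made-up coordinates:

```python
# htslib-style region string: chr20:100-100 selects exactly one base.
contig, start, end = "chr20", 100, 100
assert start > 0     # 1-based, so position 0 is invalid
assert end >= start  # inclusive end: end == start is a valid region
print(f"{contig}:{start}-{end}")
```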
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf_utils.py
+++ bio2zarr-0.0.9/bio2zarr/vcf_utils.py
@@ -113,6 +117,9 @@ class CSIBin:
     chunks: Sequence[Chunk]
 
 
+RECORD_COUNT_UNKNOWN = np.inf
+
+
 @dataclass
 class CSIIndex:
     min_shift: int
@@ -221,7 +228,9 @@ def read_csi(
         for _ in range(n_ref):
             n_bin = read_bytes_as_value(f, "<i")
             seq_bins = []
-
+            # Distinguish between counts that are zero because the sequence
+            # isn't there, vs counts that aren't in the index.
+            record_count = 0 if n_bin == 0 else RECORD_COUNT_UNKNOWN
             for _ in range(n_bin):
                 bin, loffset, n_chunk = read_bytes_as_tuple(f, "<IQi")
                 chunks = []
@@ -337,7 +346,9 @@ def read_tabix(
         for _ in range(header.n_ref):
             n_bin = read_bytes_as_value(f, "<i")
             seq_bins = []
-
+            # Distinguish between counts that are zero because the sequence
+            # isn't there, vs counts that aren't in the index.
+            record_count = 0 if n_bin == 0 else RECORD_COUNT_UNKNOWN
             for _ in range(n_bin):
                 bin, n_chunk = read_bytes_as_tuple(f, "<Ii")
                 chunks = []
```
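Both index readers now seed the count with the same sentinel: zero bins means the reference definitely has no records, while anything else starts as `RECORD_COUNT_UNKNOWN` (`np.inf`) until a pseudo-bin count is found, keeping "empty" and "unknown" distinguishable. A minimal illustration of the sentinel logic:

```python
import numpy as np

RECORD_COUNT_UNKNOWN = np.inf


def initial_record_count(n_bin):
    # No bins at all: the sequence genuinely has no records.
    # Otherwise the count is unknown until (and unless) the index
    # supplies it, which older tabix indexes do not.
    return 0 if n_bin == 0 else RECORD_COUNT_UNKNOWN


print(initial_record_count(0))   # 0
print(initial_record_count(37))  # inf
```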
```diff
--- bio2zarr-0.0.8/bio2zarr/vcf_utils.py
+++ bio2zarr-0.0.9/bio2zarr/vcf_utils.py
@@ -436,19 +447,16 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 if var.POS >= start:
                     yield var
 
-    def
+    def _filter_empty_and_refine(self, regions):
         """
-        Return all regions in the specified list that have one or more records
-
-        Sometimes with Tabix indexes these seem to crop up:
-
-        - https://github.com/sgkit-dev/bio2zarr/issues/45
-        - https://github.com/sgkit-dev/bio2zarr/issues/120
+        Return all regions in the specified list that have one or more records,
+        and refine the start coordinate of the region to be the actual first coord
         """
         ret = []
         for region in regions:
-
-            if
+            var = next(self.variants(region), None)
+            if var is not None:
+                region.start = var.POS
                 ret.append(region)
         return ret
 
@@ -528,4 +536,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
         if self.index.record_counts[ri] > 0:
             regions.append(Region(self.sequence_names[ri]))
 
-        return self.
+        return self._filter_empty_and_refine(regions)
```
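The renamed helper now does two jobs: drop regions with no records and snap each survivor's start to its first variant's POS, which is what lets `finalise` record exact inclusive bounds per partition. A sketch of the pattern over a hypothetical `variants(region)` iterator:

```python
def filter_empty_and_refine(regions, variants):
    """Keep regions with at least one record, refining each start.

    `variants(region)` is assumed to yield objects with a POS attribute,
    like the IndexedVcf.variants method.
    """
    ret = []
    for region in regions:
        var = next(variants(region), None)  # reads at most one record
        if var is not None:
            region.start = var.POS
            ret.append(region)
    return ret
```

Using `next(..., None)` means each candidate region costs a single seek, so the filtering stays cheap even when an index contains many empty scaffolds.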
```diff
--- bio2zarr-0.0.8/bio2zarr.egg-info/PKG-INFO
+++ bio2zarr-0.0.9/bio2zarr.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.8
+Version: 0.0.9
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License: Apache License
@@ -234,7 +234,7 @@ Requires-Dist: pysam; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-coverage; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
-Requires-Dist: sgkit; extra == "dev"
+Requires-Dist: sgkit>=0.8.0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 
 [](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
```
```diff
--- bio2zarr-0.0.8/validation-data/Makefile
+++ bio2zarr-0.0.9/validation-data/Makefile
@@ -39,9 +39,14 @@ all: 1kg_2020_chr20.bcf.csi \
 # 1000 genomes phase 1
 1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
 
+old_tabix:
+	rm -fR tabix old_tabix
+	git clone https://github.com/samtools/tabix.git
+	cd tabix && make
+	cp tabix/tabix ./old_tabix
 
 %.vcf.gz.tbi: %.vcf.gz
-
+	./old_tabix $<
 
 %.2.split: %
 	./split.sh $< 2
```
All other files listed above are unchanged between the two versions.