bio2zarr 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

bio2zarr/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.4'
21
- __version_tuple__ = version_tuple = (0, 1, 4)
20
+ __version__ = version = '0.1.5'
21
+ __version_tuple__ = version_tuple = (0, 1, 5)
bio2zarr/cli.py CHANGED
@@ -624,8 +624,8 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
624
624
  num_parts_per_path = max(1, num_partitions // len(vcfs))
625
625
 
626
626
  for vcf_path in vcfs:
627
- indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
628
- regions = indexed_vcf.partition_into_regions(
627
+ vcf_file = vcf_utils.VcfFile(vcf_path)
628
+ regions = vcf_file.partition_into_regions(
629
629
  num_parts=num_parts_per_path, target_part_size=partition_size
630
630
  )
631
631
  for region in regions:
bio2zarr/vcf2zarr/icf.py CHANGED
@@ -228,8 +228,8 @@ def fixed_vcf_field_definitions():
228
228
 
229
229
 
230
230
  def scan_vcf(path, target_num_partitions):
231
- with vcf_utils.IndexedVcf(path) as indexed_vcf:
232
- vcf = indexed_vcf.vcf
231
+ with vcf_utils.VcfFile(path) as vcf_file:
232
+ vcf = vcf_file.vcf
233
233
  filters = []
234
234
  pass_index = -1
235
235
  for h in vcf.header_iter():
@@ -270,10 +270,10 @@ def scan_vcf(path, target_num_partitions):
270
270
  filters=filters,
271
271
  fields=fields,
272
272
  partitions=[],
273
- num_records=sum(indexed_vcf.contig_record_counts().values()),
273
+ num_records=sum(vcf_file.contig_record_counts().values()),
274
274
  )
275
275
 
276
- regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
276
+ regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
277
277
  for region in regions:
278
278
  metadata.partitions.append(
279
279
  VcfPartition(
@@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
324
324
  # are compatible.
325
325
  all_partitions = []
326
326
  total_records = 0
327
+ contigs = {}
327
328
  for metadata, _ in results:
328
329
  for partition in metadata.partitions:
329
330
  logger.debug(f"Scanned partition {partition}")
330
331
  all_partitions.append(partition)
332
+ for contig in metadata.contigs:
333
+ if contig.id in contigs:
334
+ if contig != contigs[contig.id]:
335
+ raise ValueError(
336
+ "Incompatible contig definitions: "
337
+ f"{contig} != {contigs[contig.id]}"
338
+ )
339
+ else:
340
+ contigs[contig.id] = contig
331
341
  total_records += metadata.num_records
332
342
  metadata.num_records = 0
333
343
  metadata.partitions = []
334
344
 
345
+ contig_union = list(contigs.values())
346
+ for metadata, _ in results:
347
+ metadata.contigs = contig_union
348
+
335
349
  icf_metadata, header = results[0]
336
350
  for metadata, _ in results[1:]:
337
351
  if metadata != icf_metadata:
@@ -1079,9 +1093,9 @@ class IntermediateColumnarFormatWriter:
1079
1093
  self.path,
1080
1094
  partition_index,
1081
1095
  ) as tcw:
1082
- with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
1096
+ with vcf_utils.VcfFile(partition.vcf_path) as vcf:
1083
1097
  num_records = 0
1084
- for variant in ivcf.variants(partition.region):
1098
+ for variant in vcf.variants(partition.region):
1085
1099
  num_records += 1
1086
1100
  last_position = variant.POS
1087
1101
  tcw.append("CHROM", variant.CHROM)
bio2zarr/vcf_utils.py CHANGED
@@ -89,7 +89,10 @@ class Region:
89
89
  end: Optional[int] = None
90
90
 
91
91
  def __post_init__(self):
92
- if self.start is not None:
92
+ assert self.contig is not None
93
+ if self.start is None:
94
+ self.start = 1
95
+ else:
93
96
  self.start = int(self.start)
94
97
  assert self.start > 0
95
98
  if self.end is not None:
@@ -393,9 +396,12 @@ class VcfIndexType(Enum):
393
396
  TABIX = ".tbi"
394
397
 
395
398
 
396
- class IndexedVcf(contextlib.AbstractContextManager):
399
+ class VcfFile(contextlib.AbstractContextManager):
397
400
  def __init__(self, vcf_path, index_path=None):
398
401
  self.vcf = None
402
+ self.file_type = None
403
+ self.index_type = None
404
+
399
405
  vcf_path = pathlib.Path(vcf_path)
400
406
  if not vcf_path.exists():
401
407
  raise FileNotFoundError(vcf_path)
@@ -408,30 +414,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
408
414
  vcf_path.suffix + VcfIndexType.CSI.value
409
415
  )
410
416
  if not index_path.exists():
411
- raise FileNotFoundError(
412
- f"Cannot find .tbi or .csi file for {vcf_path}"
413
- )
417
+ # No supported index found
418
+ index_path = None
414
419
  else:
415
420
  index_path = pathlib.Path(index_path)
421
+ if not index_path.exists():
422
+ raise FileNotFoundError(
423
+ f"Specified index path {index_path} does not exist"
424
+ )
416
425
 
417
426
  self.vcf_path = vcf_path
418
427
  self.index_path = index_path
419
- self.file_type = None
420
- self.index_type = None
421
-
422
- if index_path.suffix == VcfIndexType.CSI.value:
423
- self.index_type = VcfIndexType.CSI
424
- elif index_path.suffix == VcfIndexType.TABIX.value:
425
- self.index_type = VcfIndexType.TABIX
426
- self.file_type = VcfFileType.VCF
427
- else:
428
- raise ValueError("Only .tbi or .csi indexes are supported.")
428
+ if index_path is not None:
429
+ if index_path.suffix == VcfIndexType.CSI.value:
430
+ self.index_type = VcfIndexType.CSI
431
+ elif index_path.suffix == VcfIndexType.TABIX.value:
432
+ self.index_type = VcfIndexType.TABIX
433
+ self.file_type = VcfFileType.VCF
434
+ else:
435
+ raise ValueError("Only .tbi or .csi indexes are supported.")
429
436
 
430
437
  self.vcf = cyvcf2.VCF(vcf_path)
431
- self.vcf.set_index(str(self.index_path))
438
+ if self.index_path is not None:
439
+ self.vcf.set_index(str(self.index_path))
440
+
432
441
  logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
433
442
  self.sequence_names = None
434
443
 
444
+ self.index = None
435
445
  if self.index_type == VcfIndexType.CSI:
436
446
  # Determine the file-type based on the "aux" field.
437
447
  self.index = read_csi(self.index_path)
@@ -441,9 +451,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
441
451
  self.sequence_names = self.index.parse_vcf_aux()
442
452
  else:
443
453
  self.sequence_names = self.vcf.seqnames
444
- else:
454
+ elif self.index_type == VcfIndexType.TABIX:
445
455
  self.index = read_tabix(self.index_path)
456
+ self.file_type = VcfFileType.VCF
446
457
  self.sequence_names = self.index.sequence_names
458
+ else:
459
+ assert self.index is None
460
+ var = next(self.vcf)
461
+ self.sequence_names = [var.CHROM]
462
+ self.vcf.close()
463
+ # There doesn't seem to be a way to reset the iterator
464
+ self.vcf = cyvcf2.VCF(vcf_path)
447
465
 
448
466
  def __exit__(self, exc_type, exc_val, exc_tb):
449
467
  if self.vcf is not None:
@@ -452,6 +470,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
452
470
  return False
453
471
 
454
472
  def contig_record_counts(self):
473
+ if self.index is None:
474
+ return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
455
475
  d = dict(zip(self.sequence_names, self.index.record_counts))
456
476
  if self.file_type == VcfFileType.BCF:
457
477
  d = {k: v for k, v in d.items() if v > 0}
@@ -460,12 +480,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
460
480
  def count_variants(self, region):
461
481
  return sum(1 for _ in self.variants(region))
462
482
 
463
- def variants(self, region):
464
- start = 1 if region.start is None else region.start
465
- for var in self.vcf(str(region)):
466
- # Need to filter because of indels overlapping the region
467
- if var.POS >= start:
483
+ def variants(self, region=None):
484
+ if self.index is None:
485
+ contig = self.sequence_names[0]
486
+ if region is not None:
487
+ assert region.contig == contig
488
+ for var in self.vcf:
489
+ if var.CHROM != contig:
490
+ raise ValueError("Multi-contig VCFs must be indexed")
468
491
  yield var
492
+ else:
493
+ start = 1 if region.start is None else region.start
494
+ for var in self.vcf(str(region)):
495
+ # Need to filter because of indels overlapping the region
496
+ if var.POS >= start:
497
+ yield var
469
498
 
470
499
  def _filter_empty_and_refine(self, regions):
471
500
  """
@@ -505,6 +534,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
505
534
  if target_part_size_bytes < 1:
506
535
  raise ValueError("target_part_size must be positive")
507
536
 
537
+ if self.index is None:
538
+ return [Region(self.sequence_names[0])]
539
+
508
540
  # Calculate the desired part file boundaries
509
541
  file_length = os.stat(self.vcf_path).st_size
510
542
  if num_parts is not None:
bio2zarr-0.1.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -241,6 +241,7 @@ Requires-Dist: pytest-coverage; extra == "dev"
241
241
  Requires-Dist: pytest-xdist; extra == "dev"
242
242
  Requires-Dist: sgkit>=0.8.0; extra == "dev"
243
243
  Requires-Dist: tqdm; extra == "dev"
244
+ Dynamic: license-file
244
245
 
245
246
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
246
247
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
bio2zarr-0.1.5.dist-info/RECORD CHANGED
@@ -1,21 +1,21 @@
1
1
  bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
2
  bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
- bio2zarr/_version.py,sha256=hcPkC9vIGgfrKK6ft7ysLT7iOCjpFmCBmyKLmXiaZ1g,511
4
- bio2zarr/cli.py,sha256=Iife89BfTR_AUarm-AIW0lAIYxd370OmP1KKePgFXzk,16008
3
+ bio2zarr/_version.py,sha256=Y4jy4bEMmwl_qNPCmiMFnlQ2ofMoqyG37hp8uwI3m10,511
4
+ bio2zarr/cli.py,sha256=eyOSqU7hlZuvXEVB2g3qWPK6ys0A1C1gMahVz51hRqs,15999
5
5
  bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
6
  bio2zarr/core.py,sha256=4xqNf3Txgyhcx23bzXZHq3GW0Jh24fPQwob7lKO7s0w,11668
7
7
  bio2zarr/plink.py,sha256=Yr1meT4AgS2qnwM64-Nmthh4HbjaPXsddYiJdtfYWBg,6999
8
8
  bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
9
  bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
- bio2zarr/vcf_utils.py,sha256=tuPzuMiwGYgMlQA49L6EuIplw9DOVaOw1DTa03OJS7k,18268
10
+ bio2zarr/vcf_utils.py,sha256=u1nkFRecY__IgkfV3N0Sr3AFIUSN8sYEF463K1HIgEE,19496
11
11
  bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
12
12
  bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
13
- bio2zarr/vcf2zarr/icf.py,sha256=_gYCn4PBTB6MUmXoc7DROz0VaMdloY-Eeo3GPbSq28Q,42415
13
+ bio2zarr/vcf2zarr/icf.py,sha256=G70eC6LgrJUvGBHKYrcV83BA7Mm3D170zIsoXRZgoUA,42895
14
14
  bio2zarr/vcf2zarr/vcz.py,sha256=cfUCBsQW5dbhDu7NzXkd1Dalsev7UkFDXVOyChAHw8Q,49409
15
15
  bio2zarr/vcf2zarr/verification.py,sha256=uM-mg0yvUTBs-MvWBd4jxTS0zKCUbxEQpm4ALJADdMI,8037
16
- bio2zarr-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
- bio2zarr-0.1.4.dist-info/METADATA,sha256=goC7scfCITs1Uw_AWy42Eq3iOaL8uEZvwss2CcuAvos,14978
18
- bio2zarr-0.1.4.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
19
- bio2zarr-0.1.4.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
20
- bio2zarr-0.1.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
- bio2zarr-0.1.4.dist-info/RECORD,,
16
+ bio2zarr-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
+ bio2zarr-0.1.5.dist-info/METADATA,sha256=rWYid_erOvB8gywz8N4TXBfR7ezSELuaF5Hyq3iV86w,15000
18
+ bio2zarr-0.1.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
19
+ bio2zarr-0.1.5.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
20
+ bio2zarr-0.1.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
+ bio2zarr-0.1.5.dist-info/RECORD,,
bio2zarr-0.1.5.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (76.0.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5