bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
- import cyvcf2
2
1
  import numpy as np
3
2
  import numpy.testing as nt
4
3
  import tqdm
5
4
  import zarr
6
5
 
6
+ from bio2zarr import core
7
7
  from bio2zarr.zarr_utils import first_dim_iter
8
8
 
9
- from .. import constants
9
+ from . import constants
10
10
 
11
11
 
12
12
  def assert_all_missing_float(a):
@@ -146,7 +146,10 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
146
146
  nt.assert_equal(vcf_val, zarr_val)
147
147
 
148
148
 
149
+ @core.requires_optional_dependency("cyvcf2", "vcf")
149
150
  def verify(vcf_path, zarr_path, show_progress=False):
151
+ import cyvcf2
152
+
150
153
  root = zarr.open(store=zarr_path, mode="r")
151
154
  pos = root["variant_position"][:]
152
155
  allele = root["variant_allele"][:]
bio2zarr/zarr_utils.py CHANGED
@@ -1,18 +1,185 @@
1
+ import logging
2
+ import os
3
+
1
4
  import zarr
2
5
 
6
logger = logging.getLogger(__name__)

# Use zarr format v2 by default even when running with zarr-python v3
# NOTE: this interface was introduced for experimentation with zarr
# format 3 and is not envisaged as a long-term interface.
_DEFAULT_ZARR_FORMAT = 2
_zarr_format_setting = os.environ.get(
    "BIO2ZARR_ZARR_FORMAT", str(_DEFAULT_ZARR_FORMAT)
)
try:
    ZARR_FORMAT = int(_zarr_format_setting)
except ValueError:
    # int() on a str can only fail with ValueError. Keep the best-effort
    # fallback to the default, but tell the user their setting was ignored
    # instead of silently discarding it.
    logger.warning(
        "Ignoring invalid BIO2ZARR_ZARR_FORMAT=%r; using zarr format %d",
        _zarr_format_setting,
        _DEFAULT_ZARR_FORMAT,
    )
    ZARR_FORMAT = _DEFAULT_ZARR_FORMAT
15
+
3
16
 
4
def zarr_v3() -> bool:
    """Return True when the installed zarr-python is major version 3 or later.

    The leading numeric component of ``zarr.__version__`` is compared as an
    integer. Comparing the raw version strings is lexicographic and would
    classify a version such as "10.0.0" as older than "3".
    """
    version = zarr.__version__
    # Leading run of digits, i.e. the major version component.
    major = version[: len(version) - len(version.lstrip("0123456789"))]
    if not major:
        # No numeric prefix to parse: keep the previous string comparison
        # so unusual version strings behave exactly as before.
        return version >= "3"
    return int(major) >= 3
6
19
 
7
20
 
8
# Select the keyword arguments and string storage constants that match the
# installed zarr-python major version.
if not zarr_v3():
    # No format keyword is passed under zarr-python v2.
    ZARR_FORMAT_KWARGS = {}
    # In zarr-python v2 strings are stored as object arrays (O) with itemsize 8
    STRING_DTYPE_NAME = "O"
    STRING_ITEMSIZE = 8
else:
    # Under zarr-python v3 the configured ZARR_FORMAT is requested explicitly.
    ZARR_FORMAT_KWARGS = {"zarr_format": ZARR_FORMAT}
    # In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
    STRING_DTYPE_NAME = "T"
    STRING_ITEMSIZE = 16
13
31
 
14
32
 
15
# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
def first_dim_iter(z):
    """Yield the items of ``z.blocks[i]`` for each block index along axis 0.

    The number of blocks is taken from ``z.cdata_shape[0]``; blocks are
    visited in increasing index order and each block's items are yielded
    in turn.
    """
    num_blocks = z.cdata_shape[0]
    for block_index in range(num_blocks):
        yield from z.blocks[block_index]
37
+
38
+
39
def zarr_exists(path):
    """Return True if ``path`` contains a ``.zmetadata`` or ``zarr.json`` entry.

    ``path`` must support the ``/`` operator and yield objects with an
    ``exists()`` method (for example a ``pathlib.Path``).
    """
    # NOTE: this is too strict, we should support more general Zarrs, see #276
    marker_names = (".zmetadata", "zarr.json")
    return any((path / marker).exists() for marker in marker_names)
42
+
43
+
44
def create_group_array(
    group,
    name,
    *,
    data,
    shape,
    dtype,
    compressor=None,
    dimension_names=None,
    **kwargs,
):
    """Create an array named ``name`` within ``group`` from ``data``.

    When ``ZARR_FORMAT`` is 2 the array is created with the given
    ``compressor`` and, if ``dimension_names`` is provided, they are stored
    in the ``_ARRAY_DIMENSIONS`` attribute. Otherwise the compressor (if
    any) is converted to a v3 codec and ``dimension_names`` is passed
    straight to ``group.array``. Extra ``kwargs`` are forwarded in both
    cases. Returns the created array.
    """
    if ZARR_FORMAT != 2:
        v3_kwargs = dict(kwargs)
        if compressor is not None:
            codec = _convert_v2_compressor_to_v3_codec(compressor, dtype)
            # TODO: seems odd that we need to set this
            v3_kwargs["compressor"] = "auto"
            v3_kwargs["compressors"] = [codec]
        return group.array(
            name,
            data=data,
            shape=shape,
            dtype=dtype,
            dimension_names=dimension_names,
            **v3_kwargs,
        )

    created = group.array(
        name,
        data=data,
        shape=shape,
        dtype=dtype,
        compressor=compressor,
        **kwargs,
    )
    if dimension_names is not None:
        created.attrs["_ARRAY_DIMENSIONS"] = dimension_names
    return created
83
+
84
+
85
def create_empty_group_array(
    group,
    name,
    *,
    shape,
    dtype,
    chunks,
    compressor=None,
    filters=None,
    dimension_names=None,
    **kwargs,
):
    """Create an empty array named ``name`` within ``group``.

    When ``ZARR_FORMAT`` is 2 the array is created with ``group.empty``
    using the given ``compressor`` and ``filters``; if ``dimension_names``
    is provided they are stored in the ``_ARRAY_DIMENSIONS`` attribute.

    Otherwise the array is created with ``group.array``: any
    ``zarr_format`` entry in ``kwargs`` is dropped, the compressor (if any)
    is converted to a v3 codec, and ``dimension_names`` is passed directly.
    Returns the created array.
    """
    if ZARR_FORMAT == 2:
        array = group.empty(
            name=name,
            shape=shape,
            dtype=dtype,
            chunks=chunks,
            compressor=compressor,
            filters=filters,
            **kwargs,
        )
        if dimension_names is not None:
            array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
        return array

    new_kwargs = {**kwargs}
    # Callers are not required to pass zarr_format (create_group_array does
    # not expect it either), so drop it only if present instead of raising
    # KeyError when it is absent.
    new_kwargs.pop("zarr_format", None)
    if compressor is not None:
        compressors = [_convert_v2_compressor_to_v3_codec(compressor, dtype)]
        # TODO: seems odd that we need to set this
        new_kwargs["compressor"] = "auto"
        new_kwargs["compressors"] = compressors
    # NOTE(review): ``filters`` is not forwarded on this path — confirm that
    # this is intentional for non-v2 formats.
    return group.array(
        name=name,
        shape=shape,
        dtype=dtype,
        chunks=chunks,
        dimension_names=dimension_names,
        **new_kwargs,
    )
127
+
128
+
129
def get_compressor(array):
    """Return the single compressor of ``array``, or None if it has none.

    ``array.compressor`` is tried first; if accessing it raises TypeError
    the plural ``array.compressors`` is consulted instead.

    Raises:
        ValueError: if ``array.compressors`` holds more than one entry.
    """
    try:
        # zarr format v2: compressor (singular)
        return array.compressor
    except TypeError as err:
        # zarr format v3: compressors (plural)
        found = array.compressors
        count = len(found)
        if count > 1:
            raise ValueError(
                f"Only one compressor is supported but found {found}"
            ) from err
        if count == 1:
            return found[0]
        return None
141
+
142
+
143
def get_compressor_config(array):
    """Return the configuration of ``array``'s compressor.

    Returns None when the array has no compressor. Compressors exposing
    ``get_config()`` are asked directly; otherwise a ``BloscCodec`` is
    unwrapped via its ``_blosc_codec`` attribute, and any other codec
    contributes the ``"configuration"`` entry of its ``as_dict()`` result.
    """
    compressor = get_compressor(array)
    if compressor is None:
        # get_compressor returns None for arrays without a compressor;
        # there is nothing to describe, and falling through would fail
        # with AttributeError on ``None.as_dict()``.
        return None
    if hasattr(compressor, "get_config"):
        return compressor.get_config()

    # Imported lazily: only reached for codecs without get_config().
    from zarr.codecs.blosc import BloscCodec

    if isinstance(compressor, BloscCodec):
        # NOTE(review): relies on the private ``_blosc_codec`` attribute.
        return compressor._blosc_codec.get_config()
    return compressor.as_dict()["configuration"]
154
+
155
+
156
def _convert_v2_compressor_to_v3_codec(compressor, dtype):
    """Convert ``compressor`` to a v3 codec for an array of ``dtype``.

    ``dtype`` is first parsed with ``zarr_format=3`` and the result is
    handed, together with ``compressor``, to zarr's ``_convert_compressor``.
    """
    # import here since this is zarr-python v3 only
    from zarr.core.dtype import parse_dtype
    from zarr.metadata.migrate_v3 import _convert_compressor

    v3_dtype = parse_dtype(dtype, zarr_format=3)
    return _convert_compressor(compressor, v3_dtype)
162
+
163
+
164
def move_chunks(src_path, dest_path, partition, name):
    """Move the chunk files of one partition into the destination array.

    When ``ZARR_FORMAT`` is 2, entries directly under ``src_path`` are moved
    into ``dest_path / name``. Otherwise entries under ``src_path / "c"``
    (if that directory exists) are moved into ``dest_path / name / "c"``,
    which is created if missing. Entries whose name starts with "." are
    left in place. ``partition`` is used only in the debug log message.
    """

    def visible_entries(directory):
        # Skip dot-prefixed entries.
        return [
            entry for entry in directory.iterdir() if not entry.name.startswith(".")
        ]

    if ZARR_FORMAT != 2:
        dest = dest_path / name / "c"
        dest.mkdir(exist_ok=True)
        src_chunks = src_path / "c"
        chunk_files = visible_entries(src_chunks) if src_chunks.exists() else []
    else:
        dest = dest_path / name
        chunk_files = visible_entries(src_path)
    # TODO check for a count of the number of files. If we require a
    # dimension_separator of "/" then we could make stronger assertions
    # here, as we'd always have num_variant_chunks
    logger.debug(f"Moving {len(chunk_files)} chunks for {name} partition {partition}")
    for chunk_file in chunk_files:
        os.rename(chunk_file, dest / chunk_file.name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.5
3
+ Version: 0.1.7
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -216,35 +216,50 @@ Classifier: Operating System :: MacOS :: MacOS X
216
216
  Classifier: Intended Audience :: Science/Research
217
217
  Classifier: Programming Language :: Python
218
218
  Classifier: Programming Language :: Python :: 3
219
- Classifier: Programming Language :: Python :: 3.9
220
219
  Classifier: Programming Language :: Python :: 3.10
221
220
  Classifier: Programming Language :: Python :: 3.11
222
221
  Classifier: Programming Language :: Python :: 3.12
222
+ Classifier: Programming Language :: Python :: 3.13
223
223
  Classifier: Topic :: Scientific/Engineering
224
- Requires-Python: >=3.9
224
+ Requires-Python: >=3.10
225
225
  Description-Content-Type: text/markdown
226
226
  License-File: LICENSE
227
- Requires-Dist: numpy>=1.26
227
+ Requires-Dist: numpy>=2
228
228
  Requires-Dist: zarr<3,>=2.17
229
- Requires-Dist: click
229
+ Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
230
230
  Requires-Dist: tabulate
231
231
  Requires-Dist: tqdm
232
232
  Requires-Dist: humanfriendly
233
- Requires-Dist: cyvcf2
234
- Requires-Dist: bed_reader
233
+ Requires-Dist: coloredlogs
234
+ Requires-Dist: click
235
+ Requires-Dist: pandas
235
236
  Provides-Extra: dev
237
+ Requires-Dist: click>=8.2.0; extra == "dev"
236
238
  Requires-Dist: hypothesis-vcf; extra == "dev"
237
239
  Requires-Dist: msprime; extra == "dev"
238
240
  Requires-Dist: pysam; extra == "dev"
239
241
  Requires-Dist: pytest; extra == "dev"
240
242
  Requires-Dist: pytest-coverage; extra == "dev"
241
243
  Requires-Dist: pytest-xdist; extra == "dev"
242
- Requires-Dist: sgkit>=0.8.0; extra == "dev"
243
244
  Requires-Dist: tqdm; extra == "dev"
245
+ Requires-Dist: tskit>=1; extra == "dev"
246
+ Requires-Dist: bed_reader; extra == "dev"
247
+ Requires-Dist: cyvcf2; extra == "dev"
248
+ Requires-Dist: xarray<2025.03.1; extra == "dev"
249
+ Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
250
+ Provides-Extra: tskit
251
+ Requires-Dist: tskit>=1; extra == "tskit"
252
+ Provides-Extra: vcf
253
+ Requires-Dist: cyvcf2; extra == "vcf"
254
+ Provides-Extra: all
255
+ Requires-Dist: tskit>=1; extra == "all"
256
+ Requires-Dist: cyvcf2; extra == "all"
244
257
  Dynamic: license-file
245
258
 
246
259
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
247
260
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
261
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
262
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
248
263
 
249
264
 
250
265
  # bio2zarr
@@ -0,0 +1,21 @@
1
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
+ bio2zarr/__main__.py,sha256=4pF1IBO4CcswA_Fe7NmK_pqGOUHCwsd_8YU7dP92n9c,578
3
+ bio2zarr/_version.py,sha256=szvPIs2C82UunpzuvVg3MbF4QhzbBYTsVJ8DmPfq6_E,704
4
+ bio2zarr/cli.py,sha256=iHfmc-qU2roQXm9Bt3TyR2bmgH-2p3DqYosQERePMZ8,17873
5
+ bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
+ bio2zarr/core.py,sha256=mYi2Vmh_YdNEd3weE0zZIPr7ToEUynq8nNCVvONVaqM,12140
7
+ bio2zarr/plink.py,sha256=ELGhsSdH1Xmxx6agCfTx1kYyntrU0XQ384wxTEn87BM,11717
8
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
+ bio2zarr/tskit.py,sha256=iLheNWtX7Pad1oNfijf6THMphzXwEtuQ6Zmi94pRZHg,10847
10
+ bio2zarr/typing.py,sha256=HdXNwIBEqYtGNwKyeUDQv6-H-pKSwNZO0qD2_VxTXEY,48
11
+ bio2zarr/vcf.py,sha256=3aXCdTAIuGoUmpbPIPVKhNj4oevkF0s_l7gRB0QmaPU,60738
12
+ bio2zarr/vcf_utils.py,sha256=xrsmxpu1xyXtl6FaYuU562WZP-iVUIaqzxD-11MHfAM,19541
13
+ bio2zarr/vcz.py,sha256=3IkcrAsQkWCiHiMBh0bbxzHtvX8qaUV3W84y1ojUWSs,42204
14
+ bio2zarr/vcz_verification.py,sha256=4YZZnAuMH-z9uPqAeBONdsZADz2MtY57D7RAbMa90yY,8119
15
+ bio2zarr/zarr_utils.py,sha256=4vE6CqnOLqZExc_7Z0jGGbA-kjqz9NPSqSBue10bzHk,5443
16
+ bio2zarr-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
+ bio2zarr-0.1.7.dist-info/METADATA,sha256=wXANeYEuZh41wH_nay96e4xobWhpBhL-BzkBcdGAR04,15736
18
+ bio2zarr-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
19
+ bio2zarr-0.1.7.dist-info/entry_points.txt,sha256=bbIbR8fWMGruyLaoCxO1O22nKidWKUzMgYbTYdsN6YQ,181
20
+ bio2zarr-0.1.7.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
+ bio2zarr-0.1.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,3 +1,5 @@
1
1
  [console_scripts]
2
+ plink2zarr = bio2zarr.cli:plink2zarr_main
3
+ tskit2zarr = bio2zarr.cli:tskit2zarr_main
2
4
  vcf2zarr = bio2zarr.cli:vcf2zarr_main
3
5
  vcfpartition = bio2zarr.cli:vcfpartition
@@ -1,38 +0,0 @@
1
- from .icf import (
2
- IntermediateColumnarFormat,
3
- explode,
4
- explode_finalise,
5
- explode_init,
6
- explode_partition,
7
- )
8
- from .vcz import (
9
- VcfZarrSchema,
10
- convert,
11
- encode,
12
- encode_finalise,
13
- encode_init,
14
- encode_partition,
15
- inspect,
16
- mkschema,
17
- )
18
- from .verification import verify
19
-
20
- # NOTE some of these aren't intended to be part of the external
21
- # interface (like IntermediateColumnarFormat), but putting
22
- # them into the list to keep the lint nagging under control
23
- __all__ = [
24
- "IntermediateColumnarFormat",
25
- "explode",
26
- "explode_finalise",
27
- "explode_init",
28
- "explode_partition",
29
- "VcfZarrSchema",
30
- "convert",
31
- "encode",
32
- "encode_finalise",
33
- "encode_init",
34
- "encode_partition",
35
- "inspect",
36
- "mkschema",
37
- "verify",
38
- ]
@@ -1,21 +0,0 @@
1
- bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
- bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
- bio2zarr/_version.py,sha256=Y4jy4bEMmwl_qNPCmiMFnlQ2ofMoqyG37hp8uwI3m10,511
4
- bio2zarr/cli.py,sha256=eyOSqU7hlZuvXEVB2g3qWPK6ys0A1C1gMahVz51hRqs,15999
5
- bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
- bio2zarr/core.py,sha256=4xqNf3Txgyhcx23bzXZHq3GW0Jh24fPQwob7lKO7s0w,11668
7
- bio2zarr/plink.py,sha256=Yr1meT4AgS2qnwM64-Nmthh4HbjaPXsddYiJdtfYWBg,6999
8
- bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
- bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
- bio2zarr/vcf_utils.py,sha256=u1nkFRecY__IgkfV3N0Sr3AFIUSN8sYEF463K1HIgEE,19496
11
- bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
12
- bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
13
- bio2zarr/vcf2zarr/icf.py,sha256=G70eC6LgrJUvGBHKYrcV83BA7Mm3D170zIsoXRZgoUA,42895
14
- bio2zarr/vcf2zarr/vcz.py,sha256=cfUCBsQW5dbhDu7NzXkd1Dalsev7UkFDXVOyChAHw8Q,49409
15
- bio2zarr/vcf2zarr/verification.py,sha256=uM-mg0yvUTBs-MvWBd4jxTS0zKCUbxEQpm4ALJADdMI,8037
16
- bio2zarr-0.1.5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
- bio2zarr-0.1.5.dist-info/METADATA,sha256=rWYid_erOvB8gywz8N4TXBfR7ezSELuaF5Hyq3iV86w,15000
18
- bio2zarr-0.1.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
19
- bio2zarr-0.1.5.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
20
- bio2zarr-0.1.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
- bio2zarr-0.1.5.dist-info/RECORD,,