bio2zarr 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bio2zarr/_version.py CHANGED
@@ -1,7 +1,14 @@
1
1
  # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
3
 
4
- __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
5
12
 
6
13
  TYPE_CHECKING = False
7
14
  if TYPE_CHECKING:
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
9
16
  from typing import Union
10
17
 
11
18
  VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
12
20
  else:
13
21
  VERSION_TUPLE = object
22
+ COMMIT_ID = object
14
23
 
15
24
  version: str
16
25
  __version__: str
17
26
  __version_tuple__: VERSION_TUPLE
18
27
  version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
19
30
 
20
- __version__ = version = '0.1.6'
21
- __version_tuple__ = version_tuple = (0, 1, 6)
31
+ __version__ = version = '0.1.7'
32
+ __version_tuple__ = version_tuple = (0, 1, 7)
33
+
34
+ __commit_id__ = commit_id = None
bio2zarr/cli.py CHANGED
@@ -652,7 +652,12 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
652
652
  @click.argument("zarr_path", type=click.Path())
653
653
  @click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
654
654
  @click.option(
655
- "--isolated-as-missing", is_flag=True, help="Treat isolated nodes as missing"
655
+ "--isolated-as-missing/--isolated-as-ancestral",
656
+ default=None,
657
+ help=(
658
+ "Treat isolated samples without mutations as missing or ancestral "
659
+ "(default: tskit default)"
660
+ ),
656
661
  )
657
662
  @variants_chunk_size
658
663
  @samples_chunk_size
@@ -660,6 +665,7 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
660
665
  @progress
661
666
  @worker_processes
662
667
  @force
668
+ @core.requires_optional_dependency("tskit", "tskit")
663
669
  def convert_tskit(
664
670
  ts_path,
665
671
  zarr_path,
@@ -675,11 +681,18 @@ def convert_tskit(
675
681
  setup_logging(verbose)
676
682
  check_overwrite_dir(zarr_path, force)
677
683
 
684
+ import tskit
685
+
686
+ ts = tskit.load(ts_path)
687
+ model_mapping = ts.map_to_vcf_model(
688
+ contig_id=contig_id,
689
+ isolated_as_missing=isolated_as_missing,
690
+ )
691
+
678
692
  tskit_mod.convert(
679
693
  ts_path,
680
694
  zarr_path,
681
- contig_id=contig_id,
682
- isolated_as_missing=isolated_as_missing,
695
+ model_mapping=model_mapping,
683
696
  variants_chunk_size=variants_chunk_size,
684
697
  samples_chunk_size=samples_chunk_size,
685
698
  worker_processes=worker_processes,
bio2zarr/plink.py CHANGED
@@ -6,6 +6,7 @@ import numpy as np
6
6
  import pandas as pd
7
7
 
8
8
  from bio2zarr import constants, core, vcz
9
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
@@ -198,7 +199,7 @@ class PlinkFormat(vcz.Source):
198
199
  ref_iter = self.bim.allele_2.values[start:stop]
199
200
  gt_iter = self.bed_reader.iter_decode(start, stop)
200
201
  for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
201
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
202
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
202
203
  alleles[0] = ref
203
204
  alleles[1 : 1 + len(alt)] = alt
204
205
  phased = np.zeros(gt.shape[0], dtype=bool)
@@ -234,8 +235,9 @@ class PlinkFormat(vcz.Source):
234
235
  )
235
236
  # If we don't have SVLEN or END annotations, the rlen field is defined
236
237
  # as the length of the REF
237
- max_len = self.bim.allele_2.values.itemsize
238
-
238
+ # Explicitly cast to a fixed-size array to support pandas 2.x and 3.x
239
+ allele_2_array = self.bim.allele_2.values.astype("S")
240
+ max_len = allele_2_array.itemsize
239
241
  array_specs = [
240
242
  vcz.ZarrArraySpec(
241
243
  source="position",
@@ -246,13 +248,13 @@ class PlinkFormat(vcz.Source):
246
248
  ),
247
249
  vcz.ZarrArraySpec(
248
250
  name="variant_allele",
249
- dtype="O",
251
+ dtype=STRING_DTYPE_NAME,
250
252
  dimensions=["variants", "alleles"],
251
253
  description=None,
252
254
  ),
253
255
  vcz.ZarrArraySpec(
254
256
  name="variant_id",
255
- dtype="O",
257
+ dtype=STRING_DTYPE_NAME,
256
258
  dimensions=["variants"],
257
259
  description=None,
258
260
  ),
bio2zarr/tskit.py CHANGED
@@ -4,6 +4,7 @@ import pathlib
4
4
  import numpy as np
5
5
 
6
6
  from bio2zarr import constants, core, vcz
7
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
@@ -15,8 +16,6 @@ class TskitFormat(vcz.Source):
15
16
  ts,
16
17
  *,
17
18
  model_mapping=None,
18
- contig_id=None,
19
- isolated_as_missing=False,
20
19
  ):
21
20
  import tskit
22
21
 
@@ -35,14 +34,14 @@ class TskitFormat(vcz.Source):
35
34
  f"{self.ts.num_sites} sites"
36
35
  )
37
36
 
38
- self.contig_id = contig_id if contig_id is not None else "1"
39
- self.isolated_as_missing = isolated_as_missing
40
-
41
- self.positions = self.ts.sites_position
42
-
43
37
  if model_mapping is None:
44
38
  model_mapping = self.ts.map_to_vcf_model()
45
39
 
40
+ self.contig_id = model_mapping.contig_id
41
+ self.contig_length = model_mapping.contig_length
42
+ self.isolated_as_missing = model_mapping.isolated_as_missing
43
+ self.raw_positions = self.ts.sites_position
44
+ self.vcf_positions = model_mapping.transformed_positions
46
45
  individuals_nodes = model_mapping.individuals_nodes
47
46
  sample_ids = model_mapping.individuals_name
48
47
 
@@ -91,14 +90,14 @@ class TskitFormat(vcz.Source):
91
90
 
92
91
  @property
93
92
  def contigs(self):
94
- return [vcz.Contig(id=self.contig_id)]
93
+ return [vcz.Contig(id=self.contig_id, length=self.contig_length)]
95
94
 
96
95
  def iter_contig(self, start, stop):
97
96
  yield from (0 for _ in range(start, stop))
98
97
 
99
98
  def iter_field(self, field_name, shape, start, stop):
100
99
  if field_name == "position":
101
- for pos in self.ts.sites_position[start:stop]:
100
+ for pos in self.vcf_positions[start:stop]:
102
101
  yield int(pos)
103
102
  else:
104
103
  raise ValueError(f"Unknown field {field_name}")
@@ -110,13 +109,13 @@ class TskitFormat(vcz.Source):
110
109
 
111
110
  for variant in self.ts.variants(
112
111
  isolated_as_missing=self.isolated_as_missing,
113
- left=self.positions[start],
114
- right=self.positions[stop] if stop < self.num_records else None,
112
+ left=self.raw_positions[start],
113
+ right=self.raw_positions[stop] if stop < self.num_records else None,
115
114
  samples=self.tskit_samples,
116
115
  copy=False,
117
116
  ):
118
117
  gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
119
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
118
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
120
119
  # length is the length of the REF allele unless other fields
121
120
  # are included.
122
121
  variant_length = len(variant.alleles[0])
@@ -176,8 +175,8 @@ class TskitFormat(vcz.Source):
176
175
  min_position = 0
177
176
  max_position = 0
178
177
  if self.ts.num_sites > 0:
179
- min_position = np.min(self.ts.sites_position)
180
- max_position = np.max(self.ts.sites_position)
178
+ min_position = np.min(self.vcf_positions)
179
+ max_position = np.max(self.vcf_positions)
181
180
 
182
181
  tables = self.ts.tables
183
182
  ancestral_state_offsets = tables.sites.ancestral_state_offset
@@ -200,7 +199,7 @@ class TskitFormat(vcz.Source):
200
199
  vcz.ZarrArraySpec(
201
200
  source=None,
202
201
  name="variant_allele",
203
- dtype="O",
202
+ dtype=STRING_DTYPE_NAME,
204
203
  dimensions=["variants", "alleles"],
205
204
  description="Alleles for each variant",
206
205
  ),
@@ -252,8 +251,6 @@ def convert(
252
251
  vcz_path,
253
252
  *,
254
253
  model_mapping=None,
255
- contig_id=None,
256
- isolated_as_missing=False,
257
254
  variants_chunk_size=None,
258
255
  samples_chunk_size=None,
259
256
  worker_processes=core.DEFAULT_WORKER_PROCESSES,
@@ -277,8 +274,6 @@ def convert(
277
274
  tskit_format = TskitFormat(
278
275
  ts_or_path,
279
276
  model_mapping=model_mapping,
280
- contig_id=contig_id,
281
- isolated_as_missing=isolated_as_missing,
282
277
  )
283
278
  schema_instance = tskit_format.generate_schema(
284
279
  variants_chunk_size=variants_chunk_size,
bio2zarr/vcf.py CHANGED
@@ -16,6 +16,8 @@ from typing import Any
16
16
  import numcodecs
17
17
  import numpy as np
18
18
 
19
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
20
+
19
21
  from . import constants, core, provenance, vcf_utils, vcz
20
22
 
21
23
  logger = logging.getLogger(__name__)
@@ -110,7 +112,7 @@ class VcfField:
110
112
  ret = "U1"
111
113
  else:
112
114
  assert self.vcf_type == "String"
113
- ret = "O"
115
+ ret = STRING_DTYPE_NAME
114
116
  return ret
115
117
 
116
118
 
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
397
399
 
398
400
  def sanitise_value_string_1d(shape, value):
399
401
  if value is None:
400
- return np.full(shape, ".", dtype="O")
402
+ return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
401
403
  else:
402
404
  value = drop_empty_second_dim(value)
403
405
  result = np.full(shape, "", dtype=value.dtype)
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
407
409
 
408
410
  def sanitise_value_string_2d(shape, value):
409
411
  if value is None:
410
- return np.full(shape, ".", dtype="O")
412
+ return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
411
413
  else:
412
- result = np.full(shape, "", dtype="O")
414
+ result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
413
415
  if value.ndim == 2:
414
416
  result[: value.shape[0], : value.shape[1]] = value
415
417
  else:
@@ -569,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
569
571
  value = np.array(list(vcf_value.split(",")))
570
572
  else:
571
573
  # TODO can we make this faster??
572
- value = np.array([v.split(",") for v in vcf_value], dtype="O")
574
+ var_len_values = [v.split(",") for v in vcf_value]
575
+ number = max(len(v) for v in var_len_values)
576
+ value = np.array(
577
+ [v + [""] * (number - len(v)) for v in var_len_values],
578
+ dtype=STRING_DTYPE_NAME,
579
+ )
573
580
  # print("HERE", vcf_value, value)
574
581
  # for v in vcf_value:
575
582
  # print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ class IntermediateColumnarFormat(vcz.Source):
1044
1051
  ref_field.iter_values(start, stop),
1045
1052
  alt_field.iter_values(start, stop),
1046
1053
  ):
1047
- alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
1054
+ alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
1048
1055
  alleles[0] = ref[0]
1049
1056
  alleles[1 : 1 + len(alt)] = alt
1050
1057
  yield alleles
@@ -1068,14 +1075,16 @@ class IntermediateColumnarFormat(vcz.Source):
1068
1075
  for variant_length, alleles in zip(
1069
1076
  variant_lengths, self.iter_alleles(start, stop, num_alleles)
1070
1077
  ):
1071
- yield vcz.VariantData(variant_length, alleles, None, None)
1078
+ # Stored ICF values are always at least 1D arrays; "rlen" is Number=1
1079
+ # so we must extract the scalar to avoid NumPy scalar-conversion issues.
1080
+ yield vcz.VariantData(variant_length[0], alleles, None, None)
1072
1081
  else:
1073
1082
  for variant_length, alleles, (gt, phased) in zip(
1074
1083
  variant_lengths,
1075
1084
  self.iter_alleles(start, stop, num_alleles),
1076
1085
  self.iter_genotypes(shape, start, stop),
1077
1086
  ):
1078
- yield vcz.VariantData(variant_length, alleles, gt, phased)
1087
+ yield vcz.VariantData(variant_length[0], alleles, gt, phased)
1079
1088
 
1080
1089
  def generate_schema(
1081
1090
  self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
@@ -1087,8 +1096,10 @@ class IntermediateColumnarFormat(vcz.Source):
1087
1096
 
1088
1097
  # Add ploidy and genotypes dimensions only when needed
1089
1098
  max_genotypes = 0
1099
+ has_g_field = False
1090
1100
  for field in self.metadata.format_fields:
1091
1101
  if field.vcf_number == "G":
1102
+ has_g_field = True
1092
1103
  max_genotypes = max(max_genotypes, field.summary.max_number)
1093
1104
 
1094
1105
  ploidy = None
@@ -1100,7 +1111,7 @@ class IntermediateColumnarFormat(vcz.Source):
1100
1111
  genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
1101
1112
  # assert max_genotypes == genotypes_size
1102
1113
  else:
1103
- if max_genotypes > 0:
1114
+ if max_genotypes > 0 or has_g_field:
1104
1115
  # there is no GT field, but there is at least one Number=G field,
1105
1116
  # so need to define genotypes dimension
1106
1117
  genotypes_size = max_genotypes
@@ -1163,7 +1174,7 @@ class IntermediateColumnarFormat(vcz.Source):
1163
1174
  ),
1164
1175
  fixed_field_spec(
1165
1176
  name="variant_allele",
1166
- dtype="O",
1177
+ dtype=STRING_DTYPE_NAME,
1167
1178
  dimensions=["variants", "alleles"],
1168
1179
  ),
1169
1180
  fixed_field_spec(
@@ -1173,7 +1184,7 @@ class IntermediateColumnarFormat(vcz.Source):
1173
1184
  ),
1174
1185
  fixed_field_spec(
1175
1186
  name="variant_id",
1176
- dtype="O",
1187
+ dtype=STRING_DTYPE_NAME,
1177
1188
  ),
1178
1189
  fixed_field_spec(
1179
1190
  name="variant_id_mask",
@@ -1581,8 +1592,7 @@ def inspect(path):
1581
1592
  raise ValueError(f"Path not found: {path}")
1582
1593
  if (path / "metadata.json").exists():
1583
1594
  obj = IntermediateColumnarFormat(path)
1584
- # NOTE: this is too strict, we should support more general Zarrs, see #276
1585
- elif (path / ".zmetadata").exists():
1595
+ elif zarr_exists(path):
1586
1596
  obj = vcz.VcfZarr(path)
1587
1597
  else:
1588
1598
  raise ValueError(f"{path} not in ICF or VCF Zarr format")
bio2zarr/vcz.py CHANGED
@@ -284,7 +284,7 @@ class ZarrArraySpec:
284
284
  for size in self.get_shape(schema)[1:]:
285
285
  chunk_items *= size
286
286
  dt = np.dtype(self.dtype)
287
- if dt.kind == "O" and "samples" in self.dimensions:
287
+ if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
288
288
  logger.warning(
289
289
  f"Field {self.name} is a string; max memory usage may "
290
290
  "be a significant underestimate"
@@ -643,55 +643,60 @@ class VcfZarrWriter:
643
643
 
644
644
  def encode_samples(self, root):
645
645
  samples = self.source.samples
646
- array = root.array(
646
+ zarr_utils.create_group_array(
647
+ root,
647
648
  "sample_id",
648
649
  data=[sample.id for sample in samples],
649
650
  shape=len(samples),
650
651
  dtype="str",
651
652
  compressor=DEFAULT_ZARR_COMPRESSOR,
652
653
  chunks=(self.schema.get_chunks(["samples"])[0],),
654
+ dimension_names=["samples"],
653
655
  )
654
- array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
655
656
  logger.debug("Samples done")
656
657
 
657
658
  def encode_contigs(self, root):
658
659
  contigs = self.source.contigs
659
- array = root.array(
660
+ zarr_utils.create_group_array(
661
+ root,
660
662
  "contig_id",
661
663
  data=[contig.id for contig in contigs],
662
664
  shape=len(contigs),
663
665
  dtype="str",
664
666
  compressor=DEFAULT_ZARR_COMPRESSOR,
667
+ dimension_names=["contigs"],
665
668
  )
666
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
667
669
  if all(contig.length is not None for contig in contigs):
668
- array = root.array(
670
+ zarr_utils.create_group_array(
671
+ root,
669
672
  "contig_length",
670
673
  data=[contig.length for contig in contigs],
671
674
  shape=len(contigs),
672
675
  dtype=np.int64,
673
676
  compressor=DEFAULT_ZARR_COMPRESSOR,
677
+ dimension_names=["contigs"],
674
678
  )
675
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
676
679
 
677
680
  def encode_filters(self, root):
678
681
  filters = self.source.filters
679
- array = root.array(
682
+ zarr_utils.create_group_array(
683
+ root,
680
684
  "filter_id",
681
685
  data=[filt.id for filt in filters],
682
686
  shape=len(filters),
683
687
  dtype="str",
684
688
  compressor=DEFAULT_ZARR_COMPRESSOR,
689
+ dimension_names=["filters"],
685
690
  )
686
- array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
687
- array = root.array(
691
+ zarr_utils.create_group_array(
692
+ root,
688
693
  "filter_description",
689
694
  data=[filt.description for filt in filters],
690
695
  shape=len(filters),
691
696
  dtype="str",
692
697
  compressor=DEFAULT_ZARR_COMPRESSOR,
698
+ dimension_names=["filters"],
693
699
  )
694
- array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
695
700
 
696
701
  def init_array(self, root, schema, array_spec, variants_dim_size):
697
702
  kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
@@ -707,34 +712,33 @@ class VcfZarrWriter:
707
712
  else schema.defaults["compressor"]
708
713
  )
709
714
  compressor = numcodecs.get_codec(compressor)
710
- if array_spec.dtype == "O":
715
+ if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
711
716
  if zarr_utils.zarr_v3():
712
717
  filters = [*list(filters), numcodecs.VLenUTF8()]
713
718
  else:
714
719
  kwargs["object_codec"] = numcodecs.VLenUTF8()
715
720
 
716
- if not zarr_utils.zarr_v3():
721
+ if zarr_utils.zarr_v3():
722
+ # see https://github.com/zarr-developers/zarr-python/issues/3197
723
+ kwargs["fill_value"] = None
724
+ else:
717
725
  kwargs["dimension_separator"] = self.metadata.dimension_separator
718
726
 
719
727
  shape = schema.get_shape(array_spec.dimensions)
720
728
  # Truncate the variants dimension if max_variant_chunks was specified
721
729
  shape[0] = variants_dim_size
722
- a = root.empty(
730
+ a = zarr_utils.create_empty_group_array(
731
+ root,
723
732
  name=array_spec.name,
724
733
  shape=shape,
725
734
  chunks=schema.get_chunks(array_spec.dimensions),
726
735
  dtype=array_spec.dtype,
727
736
  compressor=compressor,
728
737
  filters=filters,
738
+ dimension_names=array_spec.dimensions,
729
739
  **kwargs,
730
740
  )
731
- a.attrs.update(
732
- {
733
- "description": array_spec.description,
734
- # Dimension names are part of the spec in Zarr v3
735
- "_ARRAY_DIMENSIONS": array_spec.dimensions,
736
- }
737
- )
741
+ a.attrs.update({"description": array_spec.description})
738
742
  logger.debug(f"Initialised {a}")
739
743
  return a
740
744
 
@@ -977,19 +981,7 @@ class VcfZarrWriter:
977
981
  if not src.exists():
978
982
  # Needs test
979
983
  raise ValueError(f"Partition {partition} of {name} does not exist")
980
- dest = self.arrays_path / name
981
- # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
982
- chunk_files = [
983
- path for path in src.iterdir() if not path.name.startswith(".")
984
- ]
985
- # TODO check for a count of then number of files. If we require a
986
- # dimension_separator of "/" then we could make stronger assertions
987
- # here, as we'd always have num_variant_chunks
988
- logger.debug(
989
- f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
990
- )
991
- for chunk_file in chunk_files:
992
- os.rename(chunk_file, dest / chunk_file.name)
984
+ zarr_utils.move_chunks(src, self.arrays_path, partition, name)
993
985
  # Finally, once all the chunks have moved into the arrays dir,
994
986
  # we move it out of wip
995
987
  os.rename(self.arrays_path / name, self.path / name)
@@ -1108,7 +1100,7 @@ class VcfZarrWriter:
1108
1100
 
1109
1101
  class VcfZarr:
1110
1102
  def __init__(self, path):
1111
- if not (path / ".zmetadata").exists():
1103
+ if not zarr_utils.zarr_exists(path):
1112
1104
  raise ValueError("Not in VcfZarr format") # NEEDS TEST
1113
1105
  self.path = path
1114
1106
  self.root = zarr.open(path, mode="r")
@@ -1129,7 +1121,7 @@ class VcfZarr:
1129
1121
  "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
1130
1122
  "shape": str(array.shape),
1131
1123
  "chunk_shape": str(array.chunks),
1132
- "compressor": str(array.compressor),
1124
+ "compressor": str(zarr_utils.get_compressor(array)),
1133
1125
  "filters": str(array.filters),
1134
1126
  }
1135
1127
  data.append(d)
@@ -1192,7 +1184,8 @@ class VcfZarrIndexer:
1192
1184
  kwargs = {}
1193
1185
  if not zarr_utils.zarr_v3():
1194
1186
  kwargs["dimension_separator"] = "/"
1195
- array = root.array(
1187
+ zarr_utils.create_group_array(
1188
+ root,
1196
1189
  "region_index",
1197
1190
  data=index,
1198
1191
  shape=index.shape,
@@ -1200,12 +1193,12 @@ class VcfZarrIndexer:
1200
1193
  dtype=index.dtype,
1201
1194
  compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
1202
1195
  fill_value=None,
1196
+ dimension_names=[
1197
+ "region_index_values",
1198
+ "region_index_fields",
1199
+ ],
1203
1200
  **kwargs,
1204
1201
  )
1205
- array.attrs["_ARRAY_DIMENSIONS"] = [
1206
- "region_index_values",
1207
- "region_index_fields",
1208
- ]
1209
1202
 
1210
1203
  logger.info("Consolidating Zarr metadata")
1211
1204
  zarr.consolidate_metadata(self.path)
bio2zarr/zarr_utils.py CHANGED
@@ -1,18 +1,185 @@
1
+ import logging
2
+ import os
3
+
1
4
  import zarr
2
5
 
6
+ logger = logging.getLogger(__name__)
7
+
8
+ # Use zarr format v2 by default even when running with zarr-python v3
9
+ # NOTE: this interface was introduced for experimentation with zarr
10
+ # format 3 and is not envisaged as a long-term interface.
11
+ try:
12
+ ZARR_FORMAT = int(os.environ.get("BIO2ZARR_ZARR_FORMAT", "2"))
13
+ except Exception:
14
+ ZARR_FORMAT = 2
15
+
3
16
 
4
17
  def zarr_v3() -> bool:
5
18
  return zarr.__version__ >= "3"
6
19
 
7
20
 
8
21
  if zarr_v3():
9
- # Use zarr format v2 even when running with zarr-python v3
10
- ZARR_FORMAT_KWARGS = dict(zarr_format=2)
22
+ ZARR_FORMAT_KWARGS = dict(zarr_format=ZARR_FORMAT)
23
+ # In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
24
+ STRING_DTYPE_NAME = "T"
25
+ STRING_ITEMSIZE = 16
11
26
  else:
12
27
  ZARR_FORMAT_KWARGS = dict()
28
+ # In zarr-python v2 strings are stored as object arrays (O) with itemsize 8
29
+ STRING_DTYPE_NAME = "O"
30
+ STRING_ITEMSIZE = 8
13
31
 
14
32
 
15
33
  # See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
16
34
  def first_dim_iter(z):
17
35
  for chunk in range(z.cdata_shape[0]):
18
36
  yield from z.blocks[chunk]
37
+
38
+
39
+ def zarr_exists(path):
40
+ # NOTE: this is too strict, we should support more general Zarrs, see #276
41
+ return (path / ".zmetadata").exists() or (path / "zarr.json").exists()
42
+
43
+
44
+ def create_group_array(
45
+ group,
46
+ name,
47
+ *,
48
+ data,
49
+ shape,
50
+ dtype,
51
+ compressor=None,
52
+ dimension_names=None,
53
+ **kwargs,
54
+ ):
55
+ """Create an array within a group."""
56
+ if ZARR_FORMAT == 2:
57
+ array = group.array(
58
+ name,
59
+ data=data,
60
+ shape=shape,
61
+ dtype=dtype,
62
+ compressor=compressor,
63
+ **kwargs,
64
+ )
65
+ if dimension_names is not None:
66
+ array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
67
+ return array
68
+ else:
69
+ new_kwargs = {**kwargs}
70
+ if compressor is not None:
71
+ compressors = [_convert_v2_compressor_to_v3_codec(compressor, dtype)]
72
+ # TODO: seems odd that we need to set this
73
+ new_kwargs["compressor"] = "auto"
74
+ new_kwargs["compressors"] = compressors
75
+ return group.array(
76
+ name,
77
+ data=data,
78
+ shape=shape,
79
+ dtype=dtype,
80
+ dimension_names=dimension_names,
81
+ **new_kwargs,
82
+ )
83
+
84
+
85
+ def create_empty_group_array(
86
+ group,
87
+ name,
88
+ *,
89
+ shape,
90
+ dtype,
91
+ chunks,
92
+ compressor=None,
93
+ filters=None,
94
+ dimension_names=None,
95
+ **kwargs,
96
+ ):
97
+ """Create an empty array within a group."""
98
+ if ZARR_FORMAT == 2:
99
+ array = group.empty(
100
+ name=name,
101
+ shape=shape,
102
+ dtype=dtype,
103
+ chunks=chunks,
104
+ compressor=compressor,
105
+ filters=filters,
106
+ **kwargs,
107
+ )
108
+ if dimension_names is not None:
109
+ array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
110
+ return array
111
+ else:
112
+ new_kwargs = {**kwargs}
113
+ new_kwargs.pop("zarr_format")
114
+ if compressor is not None:
115
+ compressors = [_convert_v2_compressor_to_v3_codec(compressor, dtype)]
116
+ # TODO: seems odd that we need to set this
117
+ new_kwargs["compressor"] = "auto"
118
+ new_kwargs["compressors"] = compressors
119
+ return group.array(
120
+ name=name,
121
+ shape=shape,
122
+ dtype=dtype,
123
+ chunks=chunks,
124
+ dimension_names=dimension_names,
125
+ **new_kwargs,
126
+ )
127
+
128
+
129
+ def get_compressor(array):
130
+ try:
131
+ # zarr format v2: compressor (singular)
132
+ return array.compressor
133
+ except TypeError as e:
134
+ # zarr format v3: compressors (plural)
135
+ compressors = array.compressors
136
+ if len(compressors) > 1:
137
+ raise ValueError(
138
+ f"Only one compressor is supported but found {compressors}"
139
+ ) from e
140
+ return compressors[0] if len(compressors) == 1 else None
141
+
142
+
143
+ def get_compressor_config(array):
144
+ compressor = get_compressor(array)
145
+ if hasattr(compressor, "get_config"):
146
+ return compressor.get_config()
147
+ else:
148
+ from zarr.codecs.blosc import BloscCodec
149
+
150
+ if isinstance(compressor, BloscCodec):
151
+ return compressor._blosc_codec.get_config()
152
+ else:
153
+ return compressor.as_dict()["configuration"]
154
+
155
+
156
+ def _convert_v2_compressor_to_v3_codec(compressor, dtype):
157
+ # import here since this is zarr-python v3 only
158
+ from zarr.core.dtype import parse_dtype
159
+ from zarr.metadata.migrate_v3 import _convert_compressor
160
+
161
+ return _convert_compressor(compressor, parse_dtype(dtype, zarr_format=3))
162
+
163
+
164
+ def move_chunks(src_path, dest_path, partition, name):
165
+ if ZARR_FORMAT == 2:
166
+ dest = dest_path / name
167
+ chunk_files = [
168
+ path for path in src_path.iterdir() if not path.name.startswith(".")
169
+ ]
170
+ else:
171
+ dest = dest_path / name / "c"
172
+ dest.mkdir(exist_ok=True)
173
+ src_chunks = src_path / "c"
174
+ if not src_chunks.exists():
175
+ chunk_files = []
176
+ else:
177
+ chunk_files = [
178
+ path for path in src_chunks.iterdir() if not path.name.startswith(".")
179
+ ]
180
+ # TODO check for a count of the number of files. If we require a
181
+ # dimension_separator of "/" then we could make stronger assertions
182
+ # here, as we'd always have num_variant_chunks
183
+ logger.debug(f"Moving {len(chunk_files)} chunks for {name} partition {partition}")
184
+ for chunk_file in chunk_files:
185
+ os.rename(chunk_file, dest / chunk_file.name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bio2zarr
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
6
  License: Apache License
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
219
219
  Classifier: Programming Language :: Python :: 3.10
220
220
  Classifier: Programming Language :: Python :: 3.11
221
221
  Classifier: Programming Language :: Python :: 3.12
222
+ Classifier: Programming Language :: Python :: 3.13
222
223
  Classifier: Topic :: Scientific/Engineering
223
224
  Requires-Python: >=3.10
224
225
  Description-Content-Type: text/markdown
225
226
  License-File: LICENSE
226
- Requires-Dist: numpy>=1.26
227
+ Requires-Dist: numpy>=2
227
228
  Requires-Dist: zarr<3,>=2.17
228
229
  Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
229
230
  Requires-Dist: tabulate
@@ -240,22 +241,25 @@ Requires-Dist: pysam; extra == "dev"
240
241
  Requires-Dist: pytest; extra == "dev"
241
242
  Requires-Dist: pytest-coverage; extra == "dev"
242
243
  Requires-Dist: pytest-xdist; extra == "dev"
243
- Requires-Dist: sgkit>=0.8.0; extra == "dev"
244
244
  Requires-Dist: tqdm; extra == "dev"
245
- Requires-Dist: tskit>=0.6.4; extra == "dev"
245
+ Requires-Dist: tskit>=1; extra == "dev"
246
246
  Requires-Dist: bed_reader; extra == "dev"
247
247
  Requires-Dist: cyvcf2; extra == "dev"
248
+ Requires-Dist: xarray<2025.03.1; extra == "dev"
249
+ Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
248
250
  Provides-Extra: tskit
249
- Requires-Dist: tskit>=0.6.4; extra == "tskit"
251
+ Requires-Dist: tskit>=1; extra == "tskit"
250
252
  Provides-Extra: vcf
251
253
  Requires-Dist: cyvcf2; extra == "vcf"
252
254
  Provides-Extra: all
253
- Requires-Dist: tskit>=0.6.4; extra == "all"
255
+ Requires-Dist: tskit>=1; extra == "all"
254
256
  Requires-Dist: cyvcf2; extra == "all"
255
257
  Dynamic: license-file
256
258
 
257
259
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
258
260
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
261
+ [![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
262
+ [![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
259
263
 
260
264
 
261
265
  # bio2zarr
@@ -0,0 +1,21 @@
1
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
+ bio2zarr/__main__.py,sha256=4pF1IBO4CcswA_Fe7NmK_pqGOUHCwsd_8YU7dP92n9c,578
3
+ bio2zarr/_version.py,sha256=szvPIs2C82UunpzuvVg3MbF4QhzbBYTsVJ8DmPfq6_E,704
4
+ bio2zarr/cli.py,sha256=iHfmc-qU2roQXm9Bt3TyR2bmgH-2p3DqYosQERePMZ8,17873
5
+ bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
+ bio2zarr/core.py,sha256=mYi2Vmh_YdNEd3weE0zZIPr7ToEUynq8nNCVvONVaqM,12140
7
+ bio2zarr/plink.py,sha256=ELGhsSdH1Xmxx6agCfTx1kYyntrU0XQ384wxTEn87BM,11717
8
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
+ bio2zarr/tskit.py,sha256=iLheNWtX7Pad1oNfijf6THMphzXwEtuQ6Zmi94pRZHg,10847
10
+ bio2zarr/typing.py,sha256=HdXNwIBEqYtGNwKyeUDQv6-H-pKSwNZO0qD2_VxTXEY,48
11
+ bio2zarr/vcf.py,sha256=3aXCdTAIuGoUmpbPIPVKhNj4oevkF0s_l7gRB0QmaPU,60738
12
+ bio2zarr/vcf_utils.py,sha256=xrsmxpu1xyXtl6FaYuU562WZP-iVUIaqzxD-11MHfAM,19541
13
+ bio2zarr/vcz.py,sha256=3IkcrAsQkWCiHiMBh0bbxzHtvX8qaUV3W84y1ojUWSs,42204
14
+ bio2zarr/vcz_verification.py,sha256=4YZZnAuMH-z9uPqAeBONdsZADz2MtY57D7RAbMa90yY,8119
15
+ bio2zarr/zarr_utils.py,sha256=4vE6CqnOLqZExc_7Z0jGGbA-kjqz9NPSqSBue10bzHk,5443
16
+ bio2zarr-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
+ bio2zarr-0.1.7.dist-info/METADATA,sha256=wXANeYEuZh41wH_nay96e4xobWhpBhL-BzkBcdGAR04,15736
18
+ bio2zarr-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
19
+ bio2zarr-0.1.7.dist-info/entry_points.txt,sha256=bbIbR8fWMGruyLaoCxO1O22nKidWKUzMgYbTYdsN6YQ,181
20
+ bio2zarr-0.1.7.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
+ bio2zarr-0.1.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.8.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,21 +0,0 @@
1
- bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
- bio2zarr/__main__.py,sha256=4pF1IBO4CcswA_Fe7NmK_pqGOUHCwsd_8YU7dP92n9c,578
3
- bio2zarr/_version.py,sha256=ESbJO0YD7TYfOUv_WDIJJgWELGepEWsoyhqVifEcXPA,511
4
- bio2zarr/cli.py,sha256=WrLfUyV6VggqtDAcI3c1S5YN62ZVOent5f9JzSkX_vA,17570
5
- bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
- bio2zarr/core.py,sha256=mYi2Vmh_YdNEd3weE0zZIPr7ToEUynq8nNCVvONVaqM,12140
7
- bio2zarr/plink.py,sha256=hkrgXKkxfExgOpgNkj0SszEh9qA8R3T6kXCd-4jsXO8,11498
8
- bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
- bio2zarr/tskit.py,sha256=6YWbh8M3VJQtVpy2pD8x7Zf0jmc4HOIZwIlWcVaqjvU,10816
10
- bio2zarr/typing.py,sha256=HdXNwIBEqYtGNwKyeUDQv6-H-pKSwNZO0qD2_VxTXEY,48
11
- bio2zarr/vcf.py,sha256=_eQJm74YcKBfKDGM283ibhE40nUrkxO6Ee1giDfKjLg,60207
12
- bio2zarr/vcf_utils.py,sha256=xrsmxpu1xyXtl6FaYuU562WZP-iVUIaqzxD-11MHfAM,19541
13
- bio2zarr/vcz.py,sha256=yD2mvDZuzlAH73qPRVsUwqHSK-9HMdV4Vcif2JxfcCM,42610
14
- bio2zarr/vcz_verification.py,sha256=4YZZnAuMH-z9uPqAeBONdsZADz2MtY57D7RAbMa90yY,8119
15
- bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
16
- bio2zarr-0.1.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
- bio2zarr-0.1.6.dist-info/METADATA,sha256=Me_jLTDVz76lOtidDs1gVrXnwU_rm4ARBpEz_Ozmt6U,15405
18
- bio2zarr-0.1.6.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
19
- bio2zarr-0.1.6.dist-info/entry_points.txt,sha256=bbIbR8fWMGruyLaoCxO1O22nKidWKUzMgYbTYdsN6YQ,181
20
- bio2zarr-0.1.6.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
- bio2zarr-0.1.6.dist-info/RECORD,,