bio2zarr 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bio2zarr/tskit.py ADDED
@@ -0,0 +1,296 @@
1
+ import logging
2
+ import pathlib
3
+
4
+ import numpy as np
5
+
6
+ from bio2zarr import constants, core, vcz
7
+ from bio2zarr.zarr_utils import STRING_DTYPE_NAME
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class TskitFormat(vcz.Source):
    """
    Conversion source that exposes a single tskit tree sequence as
    variant records: one record per site and one sample per row of the
    model mapping's ``individuals_nodes`` array.
    """

    @core.requires_optional_dependency("tskit", "tskit")
    def __init__(
        self,
        ts,
        *,
        model_mapping=None,
    ):
        """
        :param ts: A ``tskit.TreeSequence``, or any other value, which is
            then treated as a path and handed to ``tskit.load``.
        :param model_mapping: Object providing ``contig_id``,
            ``contig_length``, ``isolated_as_missing``,
            ``transformed_positions``, ``individuals_nodes`` and
            ``individuals_name``. Defaults to ``ts.map_to_vcf_model()``.
        :raises ValueError: If ``individuals_nodes`` has no rows, if the
            number of sample names differs from the number of rows, or if
            no entry of ``individuals_nodes`` is non-negative.
        """
        import tskit

        self._path = None
        # Future versions here will need to deal with the complexities of
        # having lists of tree sequences for multiple chromosomes.
        if isinstance(ts, tskit.TreeSequence):
            self.ts = ts
        else:
            # input 'ts' is a path.
            self._path = ts
            logger.info(f"Loading from {ts}")
            self.ts = tskit.load(ts)
        logger.info(
            f"Input has {self.ts.num_individuals} individuals and "
            f"{self.ts.num_sites} sites"
        )

        if model_mapping is None:
            model_mapping = self.ts.map_to_vcf_model()

        self.contig_id = model_mapping.contig_id
        self.contig_length = model_mapping.contig_length
        self.isolated_as_missing = model_mapping.isolated_as_missing
        # raw_positions are the site positions of the tree sequence itself
        # (used to bound variant iteration); vcf_positions are the mapped
        # positions that get written out.
        self.raw_positions = self.ts.sites_position
        self.vcf_positions = model_mapping.transformed_positions
        individuals_nodes = model_mapping.individuals_nodes
        sample_ids = model_mapping.individuals_name

        # One row per output sample, one column per ploidy slot.
        self._num_samples = individuals_nodes.shape[0]
        logger.info(f"Converting for {self._num_samples} samples")
        if self._num_samples < 1:
            raise ValueError("individuals_nodes must have at least one sample")
        self.max_ploidy = individuals_nodes.shape[1]
        if len(sample_ids) != self._num_samples:
            raise ValueError(
                f"Length of sample_ids ({len(sample_ids)}) does not match "
                f"number of samples ({self._num_samples})"
            )

        self._samples = [vcz.Sample(id=sample_id) for sample_id in sample_ids]

        # Negative entries mark unused ploidy slots; only the remaining
        # node IDs are requested from tskit.
        valid_mask = individuals_nodes >= 0
        valid_nodes = individuals_nodes[valid_mask]
        self.tskit_samples = np.unique(valid_nodes)
        if len(self.tskit_samples) < 1:
            raise ValueError("individuals_nodes must have at least one valid sample")
        # (sample, ploidy) coordinates of every used slot, in the same
        # row-major order as valid_nodes.
        self.sample_indices, self.ploidy_indices = np.where(valid_mask)
        # tskit_samples is sorted, unique and contains every valid node ID,
        # so a binary search returns the exact index of each node in it.
        self.genotype_indices = np.searchsorted(self.tskit_samples, valid_nodes)

    @property
    def path(self):
        """The path the input was loaded from, or None if a tree sequence
        object was supplied directly."""
        return self._path

    @property
    def num_records(self):
        """Number of variant records, i.e. the number of sites."""
        return self.ts.num_sites

    @property
    def num_samples(self):
        """Number of rows in the model mapping's ``individuals_nodes``."""
        return self._num_samples

    @property
    def samples(self):
        """List of ``vcz.Sample`` objects, one per sample name."""
        return self._samples

    @property
    def root_attrs(self):
        """Root attributes contributed by this source (always empty)."""
        return {}

    @property
    def contigs(self):
        """Single-element list holding the contig of the model mapping."""
        return [vcz.Contig(id=self.contig_id, length=self.contig_length)]

    def iter_contig(self, start, stop):
        """Yield the contig index (always 0, as there is exactly one
        contig) for each record in ``range(start, stop)``."""
        yield from (0 for _ in range(start, stop))

    def iter_field(self, field_name, shape, start, stop):
        """
        Yield the values of ``field_name`` for records ``start`` to
        ``stop`` (exclusive). Only ``"position"`` is supported.

        :raises ValueError: If ``field_name`` is not ``"position"``.
        """
        if field_name == "position":
            for pos in self.vcf_positions[start:stop]:
                yield int(pos)
        else:
            raise ValueError(f"Unknown field {field_name}")

    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
        """
        Yield one ``vcz.VariantData(variant_length, alleles, gt, phased)``
        per variant returned by tskit for the interval that begins at the
        position of site ``start`` and ends at the position of site
        ``stop`` (unbounded on the right when ``stop`` is not a valid site
        index).

        :param shape: Shape of the genotype array built for each variant;
            its last axis is dropped for the ``phased`` array.
        :param num_alleles: Length of the allele array built per variant.
        :raises ValueError: If a variant has a non-missing allele at index
            ``num_alleles`` or beyond.
        """
        # All genotypes in tskit are considered phased
        phased = np.ones(shape[:-1], dtype=bool)
        logger.debug(f"Getting genotypes start={start} stop={stop}")

        # An empty range (including a tree sequence with no sites) has
        # nothing to yield and must not index raw_positions.
        if start >= stop or start >= self.num_records:
            return

        # copy=False lets tskit reuse one Variant object across iterations;
        # everything needed is copied into fresh arrays before yielding.
        for variant in self.ts.variants(
            isolated_as_missing=self.isolated_as_missing,
            left=self.raw_positions[start],
            right=self.raw_positions[stop] if stop < self.num_records else None,
            samples=self.tskit_samples,
            copy=False,
        ):
            gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
            # length is the length of the REF allele unless other fields
            # are included.
            variant_length = len(variant.alleles[0])
            for i, allele in enumerate(variant.alleles):
                # None is returned by tskit in the case of a missing allele
                if allele is None:
                    continue
                if i >= num_alleles:
                    raise ValueError(
                        f"Variant allele index {i} exceeds the expected "
                        f"maximum of {num_alleles} alleles"
                    )
                alleles[i] = allele
            # Scatter the per-node genotypes into their (sample, ploidy)
            # slots; unused slots keep the fill value.
            gt[self.sample_indices, self.ploidy_indices] = variant.genotypes[
                self.genotype_indices
            ]

            yield vcz.VariantData(variant_length, alleles, gt, phased)

    def generate_schema(
        self,
        variants_chunk_size=None,
        samples_chunk_size=None,
    ):
        """
        Build the ``vcz.VcfZarrSchema`` describing the arrays produced for
        this tree sequence.

        :param variants_chunk_size: Passed to ``vcz.standard_dimensions``
            as the chunk size of the variants dimension.
        :param samples_chunk_size: Passed to ``vcz.standard_dimensions``
            as the chunk size of the samples dimension.
        :return: The schema, with dimensions sized from the tree sequence
            and integer dtypes chosen by ``core.min_int_dtype``.
        """
        n = self.num_samples
        m = self.ts.num_sites

        # Determine max number of alleles: distinct states (ancestral plus
        # derived) seen at any one site.
        max_alleles = 0
        for site in self.ts.sites():
            states = {site.ancestral_state}
            for mut in site.mutations:
                states.add(mut.derived_state)
            max_alleles = max(len(states), max_alleles)

        logger.info(f"Scanned tskit with {n} samples and {m} variants")
        logger.info(
            f"Maximum ploidy: {self.max_ploidy}, maximum alleles: {max_alleles}"
        )
        dimensions = vcz.standard_dimensions(
            variants_size=m,
            variants_chunk_size=variants_chunk_size,
            samples_size=n,
            samples_chunk_size=samples_chunk_size,
            ploidy_size=self.max_ploidy,
            alleles_size=max_alleles,
        )
        schema_instance = vcz.VcfZarrSchema(
            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
            dimensions=dimensions,
            fields=[],
        )

        logger.info(
            "Generating schema with chunks="
            f"{schema_instance.dimensions['variants'].chunk_size}, "
            f"{schema_instance.dimensions['samples'].chunk_size}"
        )

        # Range of the output positions, used below to pick the smallest
        # integer dtype that can hold them.
        min_position = 0
        max_position = 0
        if self.ts.num_sites > 0:
            min_position = np.min(self.vcf_positions)
            max_position = np.max(self.vcf_positions)

        # State string lengths are the differences between consecutive
        # offsets in the site and mutation tables.
        tables = self.ts.tables
        ancestral_state_offsets = tables.sites.ancestral_state_offset
        derived_state_offsets = tables.mutations.derived_state_offset
        ancestral_lengths = ancestral_state_offsets[1:] - ancestral_state_offsets[:-1]
        derived_lengths = derived_state_offsets[1:] - derived_state_offsets[:-1]
        max_variant_length = max(
            np.max(ancestral_lengths) if len(ancestral_lengths) > 0 else 0,
            np.max(derived_lengths) if len(derived_lengths) > 0 else 0,
        )

        array_specs = [
            vcz.ZarrArraySpec(
                source="position",
                name="variant_position",
                dtype=core.min_int_dtype(min_position, max_position),
                dimensions=["variants"],
                description="Position of each variant",
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="variant_allele",
                dtype=STRING_DTYPE_NAME,
                dimensions=["variants", "alleles"],
                description="Alleles for each variant",
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="variant_length",
                dtype=core.min_int_dtype(0, max_variant_length),
                dimensions=["variants"],
                description="Length of each variant",
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="variant_contig",
                dtype=core.min_int_dtype(0, len(self.contigs)),
                dimensions=["variants"],
                description="Contig/chromosome index for each variant",
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="call_genotype_phased",
                dtype="bool",
                dimensions=["variants", "samples"],
                description="Whether the genotype is phased",
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="call_genotype",
                dtype=core.min_int_dtype(constants.INT_FILL, max_alleles - 1),
                dimensions=["variants", "samples", "ploidy"],
                description="Genotype for each variant and sample",
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
            ),
            vcz.ZarrArraySpec(
                source=None,
                name="call_genotype_mask",
                dtype="bool",
                dimensions=["variants", "samples", "ploidy"],
                description="Mask for each genotype call",
                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
            ),
        ]
        schema_instance.fields = array_specs
        return schema_instance
247
+
248
+
249
def convert(
    ts_or_path,
    vcz_path,
    *,
    model_mapping=None,
    variants_chunk_size=None,
    samples_chunk_size=None,
    worker_processes=core.DEFAULT_WORKER_PROCESSES,
    show_progress=False,
):
    """
    Convert a :class:`tskit.TreeSequence` (or path to a tree sequence
    file) to VCF Zarr format stored at the specified path.

    :param ts_or_path: Tree sequence object, or the path of a tree
        sequence file to load.
    :param vcz_path: Destination of the output; wrapped in
        :class:`pathlib.Path` before being given to the writer.
    :param model_mapping: Forwarded to :class:`TskitFormat`; when None
        the tree sequence's own ``map_to_vcf_model()`` result is used.
    :param variants_chunk_size: Forwarded to
        :meth:`TskitFormat.generate_schema`.
    :param samples_chunk_size: Forwarded to
        :meth:`TskitFormat.generate_schema`.
    :param worker_processes: Passed to the writer when encoding
        partitions; four partitions per worker are requested (at least
        one in total).
    :param show_progress: Passed to the writer's encode and finalise
        steps.
    """
    # FIXME there's some tricky details here in how we're handling
    # parallelism that we'll need to tackle properly, and maybe
    # review the current structures a bit. Basically, it looks like
    # we're pickling/unpickling the format object when we have
    # multiple workers, and this results in several copies of the
    # tree sequence object being passed around. This is fine most
    # of the time, but results in lots of memory being used when
    # we're dealing with really massive files.
    # See https://github.com/sgkit-dev/bio2zarr/issues/403
    source = TskitFormat(ts_or_path, model_mapping=model_mapping)
    schema = source.generate_schema(
        variants_chunk_size=variants_chunk_size,
        samples_chunk_size=samples_chunk_size,
    )
    writer = vcz.VcfZarrWriter(TskitFormat, pathlib.Path(vcz_path))
    writer.init(
        source,
        # Rough heuristic to split work up enough to keep utilisation high
        target_num_partitions=max(1, worker_processes * 4),
        schema=schema,
    )
    writer.encode_all_partitions(
        worker_processes=worker_processes,
        show_progress=show_progress,
    )
    writer.finalise(show_progress)
    writer.create_index()
bio2zarr/typing.py CHANGED
@@ -1,4 +1,3 @@
1
1
  from pathlib import Path
2
- from typing import Union
3
2
 
4
- PathType = Union[str, Path]
3
# Alias for values that may be given either as a string or as a
# pathlib.Path.
PathType = str | Path