tskit 1.0.1__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tskit/vcf.py ADDED
@@ -0,0 +1,219 @@
1
+ #
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2019-2024 Tskit Developers
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ # of this software and associated documentation files (the "Software"), to deal
8
+ # in the Software without restriction, including without limitation the rights
9
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ # copies of the Software, and to permit persons to whom the Software is
11
+ # furnished to do so, subject to the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be included in all
14
+ # copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ # SOFTWARE.
23
+ """
24
+ Convert tree sequences to VCF.
25
+ """
26
+ import numpy as np
27
+
28
+ from . import provenance
29
+
30
+
31
class VcfWriter:
    """
    Writes a VCF representation of the genotypes of a tree sequence to a
    file-like object.

    The constructor asks the tree sequence for its VCF model (via
    ``map_to_vcf_model``), adjusts that model for text output and validates
    the masks. :meth:`write` then emits the header followed by one record
    per unmasked site.
    """

    def __init__(
        self,
        tree_sequence,
        *,
        ploidy,
        contig_id,
        individuals,
        individual_names,
        position_transform,
        site_mask,
        sample_mask,
        isolated_as_missing,
        allow_position_zero,
        include_non_sample_nodes,
    ):
        """
        Prepare everything needed to write ``tree_sequence`` as VCF text.

        :param tree_sequence: Object providing ``map_to_vcf_model``,
            ``num_sites`` and ``variants``.
        :param ploidy: Forwarded unchanged to ``map_to_vcf_model``.
        :param contig_id: Forwarded unchanged to ``map_to_vcf_model``.
        :param individuals: Forwarded unchanged to ``map_to_vcf_model``.
        :param individual_names: Forwarded unchanged to ``map_to_vcf_model``.
        :param position_transform: Forwarded unchanged to
            ``map_to_vcf_model``.
        :param site_mask: ``None`` (no site is masked) or an array-like of
            length ``num_sites``; it is converted to a boolean array and
            sites whose entry is true are skipped by :meth:`write`.
        :param sample_mask: ``None``, a callable taking a variant and
            returning a mask, or a fixed array-like mask. True entries are
            written as missing (``.``) genotypes.
        :param isolated_as_missing: Forwarded unchanged to
            ``map_to_vcf_model``.
        :param allow_position_zero: If false, a transformed position of 0 at
            any unmasked site is an error.
        :param include_non_sample_nodes: Forwarded unchanged to
            ``map_to_vcf_model``.
        :raises ValueError: If no individual with at least one node remains,
            if the site mask does not have shape ``(num_sites,)``, or if an
            unmasked site has position 0 and ``allow_position_zero`` is
            false.
        """
        self.tree_sequence = tree_sequence

        vcf_model = tree_sequence.map_to_vcf_model(
            individuals=individuals,
            ploidy=ploidy,
            individual_names=individual_names,
            include_non_sample_nodes=include_non_sample_nodes,
            position_transform=position_transform,
            contig_id=contig_id,
            isolated_as_missing=isolated_as_missing,
        )

        # We now make some tweaks to the VCF model required for
        # writing the VCF in text format

        # Remove individuals with zero ploidy as these cannot be
        # represented in VCF. A row consisting only of the -1 padding value
        # is an individual with no nodes.
        to_keep = (vcf_model.individuals_nodes != -1).any(axis=1)
        vcf_model.individuals_nodes = vcf_model.individuals_nodes[to_keep]
        vcf_model.individual_names = vcf_model.individuals_name[to_keep]
        self.individual_ploidies = [
            len(nodes[nodes >= 0]) for nodes in vcf_model.individuals_nodes
        ]
        self.num_individuals = len(vcf_model.individual_names)

        if len(vcf_model.individuals_nodes) == 0:
            raise ValueError("No samples in resulting VCF model")

        if len(vcf_model.transformed_positions) > 0:
            # Arguably this should be last_pos + 1, but if we hit this
            # condition the coordinate systems are all muddled up anyway
            # so it's simpler to stay with this rule that was inherited
            # from the legacy VCF output code.
            vcf_model.contig_length = max(
                vcf_model.transformed_positions[-1], vcf_model.contig_length
            )

        # Flatten the array of node IDs, filtering out the -1 padding values
        self.samples = [
            node_id
            for row in vcf_model.individuals_nodes
            for node_id in row
            if node_id != -1
        ]

        if site_mask is None:
            site_mask = np.zeros(tree_sequence.num_sites, dtype=bool)
        self.site_mask = np.array(site_mask, dtype=bool)
        if self.site_mask.shape != (tree_sequence.num_sites,):
            raise ValueError("Site mask must be 1D a boolean array of length num_sites")

        # The VCF spec does not allow for positions to be 0, so we error if one of the
        # transformed positions is 0 and allow_position_zero is False.
        # The validated boolean array must be used for the inversion: the raw
        # argument may be a list (no unary ~) or an integer array (where ~
        # yields negative integer indices rather than a logical NOT).
        if not allow_position_zero and np.any(
            vcf_model.transformed_positions[~self.site_mask] == 0
        ):
            raise ValueError(
                "A variant position of 0 was found in the VCF output, this is not "
                "fully compliant with the VCF spec. If you still wish to write the VCF "
                'please use the "allow_position_zero" argument to write_vcf. '
                "Alternatively, you can increment all the positions by one using "
                '"position_transform = lambda x: 1 + x" or coerce the zero to one with '
                '"position_transform = lambda x: np.fmax(1, x)"'
            )

        # Normalise the sample mask so that write() can always call it with
        # the current variant: a fixed mask is wrapped in a function that
        # ignores its argument.
        self.sample_mask = sample_mask
        if sample_mask is not None:
            if not callable(sample_mask):
                sample_mask = np.array(sample_mask, dtype=bool)
                self.sample_mask = lambda _: sample_mask

        self.vcf_model = vcf_model

    def __write_header(self, output):
        """
        Write the VCF meta-information lines and the column header line,
        including one column per retained individual, to ``output``.
        """
        print("##fileformat=VCFv4.2", file=output)
        print(f"##source=tskit {provenance.__version__}", file=output)
        print('##FILTER=<ID=PASS,Description="All filters passed">', file=output)
        print(
            f"##contig=<ID={self.vcf_model.contig_id},length={self.vcf_model.contig_length}>",
            file=output,
        )
        print(
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">', file=output
        )
        vcf_samples = "\t".join(self.vcf_model.individual_names)
        print(
            "#CHROM",
            "POS",
            "ID",
            "REF",
            "ALT",
            "QUAL",
            "FILTER",
            "INFO",
            "FORMAT",
            vcf_samples,
            sep="\t",
            file=output,
        )

    def write(self, output):
        """
        Write the header and one VCF record per unmasked site to ``output``.

        :param output: File-like object accepted by ``print(..., file=...)``.
        :raises ValueError: If an unmasked site has more than 9 alleles, or
            if the sample mask for a variant does not have the same shape as
            that variant's genotypes.
        """
        self.__write_header(output)

        # Build the array to hold the text genotype VCF data and the indexes into
        # this array for when we're updating it.
        gt_array = []
        indexes = []
        for ploidy in self.individual_ploidies:
            for _ in range(ploidy):
                indexes.append(len(gt_array))
                # First element here is a placeholder that we'll write the actual
                # genotypes into for each variant.
                gt_array.extend([0, ord("|")])
            # The separator after an individual's last allele is a tab ...
            gt_array[-1] = ord("\t")
        # ... except after the final individual, where the record ends.
        gt_array[-1] = ord("\n")
        gt_array = np.array(gt_array, dtype=np.int8)
        # TODO Unclear here whether using int64 or int32 will be faster for this index
        # array. Test it out.
        indexes = np.array(indexes, dtype=int)

        for variant in self.tree_sequence.variants(
            samples=self.samples, isolated_as_missing=self.vcf_model.isolated_as_missing
        ):
            site_id = variant.site.id
            # We check the mask before we do any checks so we can use this as a
            # way of skipping problematic sites.
            if self.site_mask[site_id]:
                continue

            # Each genotype occupies exactly one character in gt_array, so
            # only the single-digit allele indexes 0-8 can be represented.
            if variant.num_alleles > 9:
                raise ValueError(
                    "More than 9 alleles not currently supported. Please open an issue "
                    "on GitHub if this limitation affects you."
                )
            pos = self.vcf_model.transformed_positions[variant.index]
            ref = variant.alleles[0]
            alt = "."
            if variant.num_alleles > 1:
                alt = ",".join(variant.alleles[1 : variant.num_alleles])
            print(
                self.vcf_model.contig_id,
                pos,
                site_id,
                ref,
                alt,
                ".",
                "PASS",
                ".",
                "GT",
                sep="\t",
                end="\t",
                file=output,
            )
            genotypes = variant.genotypes
            # Turn allele indexes into their ASCII digit codes in place.
            gt_array[indexes] = genotypes + ord("0")
            if self.sample_mask is not None:
                # Copy so that masking does not modify the variant's own array.
                genotypes = genotypes.copy()
                sample_mask = np.array(self.sample_mask(variant), dtype=bool)
                if sample_mask.shape != genotypes.shape:
                    raise ValueError(
                        "Sample mask must be a numpy array of size num_samples"
                    )
                genotypes[sample_mask] = -1
            if self.sample_mask is not None or variant.has_missing_data:
                # Missing genotypes (-1) are overwritten with ".".
                missing = genotypes == -1
                gt_array[indexes[missing]] = ord(".")
            g_bytes = memoryview(gt_array).tobytes()
            g_str = g_bytes.decode()
            print(g_str, end="", file=output)
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: tskit
3
+ Version: 1.0.1
4
+ Summary: The tree sequence toolkit.
5
+ Author-email: Tskit Developers <admin@tskit.dev>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://tskit.dev/tskit
8
+ Project-URL: Documentation, https://tskit.dev/tskit/docs/stable
9
+ Project-URL: Changelog, https://tskit.dev/tskit/docs/stable/changelogs.html
10
+ Project-URL: Bug Tracker, https://github.com/tskit-dev/tskit/issues
11
+ Project-URL: GitHub, https://github.com/tskit-dev/tskit/
12
+ Keywords: population genetics,tree sequence,ancestral recombination graph,evolutionary tree,statistical genetics,phylogenetics,tskit
13
+ Classifier: Programming Language :: C
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Programming Language :: Python :: 3 :: Only
22
+ Classifier: Development Status :: 5 - Production/Stable
23
+ Classifier: Environment :: Other Environment
24
+ Classifier: Intended Audience :: Science/Research
25
+ Classifier: Operating System :: POSIX
26
+ Classifier: Operating System :: MacOS :: MacOS X
27
+ Classifier: Operating System :: Microsoft :: Windows
28
+ Classifier: Topic :: Scientific/Engineering
29
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
30
+ Requires-Python: >=3.10
31
+ Description-Content-Type: text/x-rst
32
+ License-File: LICENSE
33
+ Requires-Dist: jsonschema>=3.0.0
34
+ Requires-Dist: numpy>=2
35
+ Provides-Extra: test
36
+ Requires-Dist: biopython==1.85; extra == "test"
37
+ Requires-Dist: coverage==7.7.0; extra == "test"
38
+ Requires-Dist: dendropy==5.0.1; extra == "test"
39
+ Requires-Dist: kastore==0.3.3; extra == "test"
40
+ Requires-Dist: lshmm==0.0.8; extra == "test"
41
+ Requires-Dist: msgpack==1.1.0; extra == "test"
42
+ Requires-Dist: msprime==1.4.0b2; extra == "test"
43
+ Requires-Dist: networkx==3.2.1; extra == "test"
44
+ Requires-Dist: numba==0.63.1; extra == "test"
45
+ Requires-Dist: portion==2.6.0; extra == "test"
46
+ Requires-Dist: pytest==8.3.5; extra == "test"
47
+ Requires-Dist: pytest-cov==6.0.0; extra == "test"
48
+ Requires-Dist: pytest-xdist==3.6.1; extra == "test"
49
+ Requires-Dist: tszip==0.2.5; extra == "test"
50
+ Requires-Dist: xmlunittest==1.0.1; extra == "test"
51
+ Requires-Dist: svgwrite==1.4.3; extra == "test"
52
+ Requires-Dist: newick==1.10.0; extra == "test"
53
+ Requires-Dist: zarr<3; extra == "test"
54
+ Provides-Extra: docs
55
+ Requires-Dist: jupyter-book==1.0.4.post1; extra == "docs"
56
+ Requires-Dist: breathe==4.35.0; extra == "docs"
57
+ Requires-Dist: sphinx-autodoc-typehints==2.3.0; extra == "docs"
58
+ Requires-Dist: sphinx-issues==5.0.0; extra == "docs"
59
+ Requires-Dist: sphinx-argparse==0.5.2; extra == "docs"
60
+ Requires-Dist: msprime==1.4.0b2; extra == "docs"
61
+ Requires-Dist: numba==0.63.1; extra == "docs"
62
+ Requires-Dist: sphinx-book-theme; extra == "docs"
63
+ Requires-Dist: pandas==2.2.3; extra == "docs"
64
+ Provides-Extra: dev
65
+ Requires-Dist: biopython>=1.70; extra == "dev"
66
+ Requires-Dist: coverage; extra == "dev"
67
+ Requires-Dist: dendropy; extra == "dev"
68
+ Requires-Dist: flake8; extra == "dev"
69
+ Requires-Dist: kastore; extra == "dev"
70
+ Requires-Dist: lshmm; extra == "dev"
71
+ Requires-Dist: msgpack; extra == "dev"
72
+ Requires-Dist: msprime; extra == "dev"
73
+ Requires-Dist: mypy; extra == "dev"
74
+ Requires-Dist: networkx; extra == "dev"
75
+ Requires-Dist: numba; extra == "dev"
76
+ Requires-Dist: portion; extra == "dev"
77
+ Requires-Dist: pre-commit; extra == "dev"
78
+ Requires-Dist: pytest; extra == "dev"
79
+ Requires-Dist: pytest-cov; extra == "dev"
80
+ Requires-Dist: pytest-xdist; extra == "dev"
81
+ Requires-Dist: setuptools_scm; extra == "dev"
82
+ Requires-Dist: svgwrite; extra == "dev"
83
+ Requires-Dist: tszip; extra == "dev"
84
+ Requires-Dist: xmlunittest; extra == "dev"
85
+ Requires-Dist: newick; extra == "dev"
86
+ Requires-Dist: zarr<3; extra == "dev"
87
+ Requires-Dist: jupyter-book<2; extra == "dev"
88
+ Requires-Dist: breathe; extra == "dev"
89
+ Requires-Dist: sphinx-autodoc-typehints; extra == "dev"
90
+ Requires-Dist: sphinx-issues; extra == "dev"
91
+ Requires-Dist: sphinx-argparse; extra == "dev"
92
+ Requires-Dist: sphinx-book-theme; extra == "dev"
93
+ Requires-Dist: pandas; extra == "dev"
94
+ Dynamic: license-file
95
+
96
+
97
+ The tree sequence toolkit.
98
+
99
+ Tskit is a cross-platform library for the storage and analysis of large-scale
100
+ genetic genealogy and variation data.
101
+ Please see the `documentation <https://tskit.dev/tskit/docs/stable/>`_
102
+ for further details.
103
+
104
+ Tskit is highly portable, and provides a number of
105
+ `installation options <https://tskit.dev/tskit/docs/stable/installation.html>`_.
@@ -0,0 +1,27 @@
1
+ _tskit.cpython-314-darwin.so,sha256=mlmsekrnU9N0F-gfudwc_zpYnWCmoi-Aj0jtybBsthU,1466160
2
+ tskit/__init__.py,sha256=bGaMlZ4MkZYgtZVkgbICh8D7Xi0Z2bk8NttnqS8bOKY,3220
3
+ tskit/__main__.py,sha256=T98e67vxj_8C_O83DgZpq9Ln5Dp12xCKwWgatn43FR4,67
4
+ tskit/_version.py,sha256=n8oAMlxCefvQC8EK_h0SQbULPyinNZeCexLSvo1GGVI,142
5
+ tskit/cli.py,sha256=J2jr6Q-Qe8CT4km4fcErwsnK9I3MCDLb_6H8xeahXFU,8756
6
+ tskit/combinatorics.py,sha256=HDIz34Myk-kA4HwNWD-WNwIS2kf5iHsTV2zIXl19FS0,55875
7
+ tskit/drawing.py,sha256=pWeL7bxfsTi08yHb-X1NNIHfVeDVJsVAF4hr-5NwOp4,111256
8
+ tskit/exceptions.py,sha256=ZC9Hvvi1Nv8EBK03erKowqPrXaxIkp14C2Su6V12K9s,2392
9
+ tskit/genotypes.py,sha256=36BUGbAN7Da5gtOlAwxTshT9ZHIWoTkX6MOmKfOhRWw,17859
10
+ tskit/intervals.py,sha256=YcgBP8tPhMYd1eQVjzZ_5fhIpF7SuPEGFLhS9NHJIho,23931
11
+ tskit/metadata.py,sha256=wR4UgmsXPRPcWQ1gwZgQBzzmb2WVgY_d1dkACLNSuuU,41974
12
+ tskit/provenance.py,sha256=jccUxhCozBPaCXX7zMcKIQON-zugGSbghuyH1vgChTM,4963
13
+ tskit/provenance.schema.json,sha256=JJE6_KkaEWYmxYZ0ylgJYtdwMv8E5F5C644_veRcMZk,2117
14
+ tskit/stats.py,sha256=Wy6k4XlthyDsZY9KBe9esmXGXkXmT1MvaTq6-EAao2w,7078
15
+ tskit/tables.py,sha256=FJc-kbFCFGNdL0ApTcz-CfoMwg7ldZc-TW-f3yf87GQ,195817
16
+ tskit/text_formats.py,sha256=DdnxGO-UBKKE0w_gtoxjWsmboqiwEySoyUxgBCpWh5E,14972
17
+ tskit/trees.py,sha256=S6eXCHJ0jUAadWvdf0WWhy6MARUjOrP07Bemavl-qZ8,501364
18
+ tskit/util.py,sha256=8K6l1EgDgLB-fHdsS6oztAGvkitynidA9kifcI9rCiE,35497
19
+ tskit/vcf.py,sha256=HvUmSeHlF9DCTicD87tmaUmImNy4GLAAvwAz7ORpsj8,8595
20
+ tskit/jit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ tskit/jit/numba.py,sha256=ElOZyipEgQW8GJ9On8xTBMwB9i6uKzevqvs0aJK5e_U,24779
22
+ tskit-1.0.1.dist-info/licenses/LICENSE,sha256=v9lcVGkQC5dVVV4SBBLsa9shqr22IXDbRedRoo9v1IA,1078
23
+ tskit-1.0.1.dist-info/METADATA,sha256=Y3WaZfZKkC_eNH_eXEZZwsPe3oWiqittekuJOEoF3PA,4555
24
+ tskit-1.0.1.dist-info/WHEEL,sha256=pBq69egh5x97PWc2-uzeinGu5GE3fGCM9h7i8BZE5wo,116
25
+ tskit-1.0.1.dist-info/entry_points.txt,sha256=3Zik1X8C9Io1WvmTRBao5yEG5Kwy_xhFdM-ABC9TkWQ,47
26
+ tskit-1.0.1.dist-info/top_level.txt,sha256=6GsXJYqSCR5Uhb4Js0BBzC0EFXE0FA5ywslsixSbwGM,13
27
+ tskit-1.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp314-cp314-macosx_10_15_universal2
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tskit = tskit.cli:tskit_main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018-2019 Tskit Developers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ _tskit
2
+ tskit