ugbio-mrd 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.1
2
+ Name: ugbio_mrd
3
+ Version: 1.2.0
4
+ Summary: Ultima Genomics MRD utils
5
+ Author-email: Itai Rusinek <itai.rusinek@ultimagen.com>, Gat Krieger <gat.krieger@ultimagen.com>, Avigail Moldovan <avigail.moldovan@ultimagen.com>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: ugbio_core
10
+ Requires-Dist: ugbio_ppmseq
11
+ Requires-Dist: ugbio_featuremap
12
+
13
+ # ugbio_mrd
14
+
15
+ This module includes MRD (Minimal Residual Disease) Python scripts and utilities for bioinformatics pipelines.
@@ -0,0 +1,3 @@
1
+ # ugbio_mrd
2
+
3
+ This module includes MRD (Minimal Residual Disease) Python scripts and utilities for bioinformatics pipelines.
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "ugbio_mrd"
3
+ version = "1.2.0"
4
+ requires-python = ">=3.10"
5
+ dependencies = [
6
+ "ugbio_core",
7
+ "ugbio_ppmseq",
8
+ "ugbio_featuremap",
9
+ ]
10
+ description = "Ultima Genomics MRD utils"
11
+ authors = [
12
+ { name = "Itai Rusinek", email = "itai.rusinek@ultimagen.com" },
13
+ { name = "Gat Krieger", email = "gat.krieger@ultimagen.com" },
14
+ { name = "Avigail Moldovan", email = "avigail.moldovan@ultimagen.com" },
15
+ ]
16
+ readme = "README.mrd.md"
17
+
18
+ [project.license]
19
+ text = "Apache-2.0"
20
+
21
+ [project.scripts]
22
+ run_tests = "pytest:main"
23
+ generate_synthetic_signatures = "ugbio_mrd.generate_synthetic_signatures:main"
24
+ intersect_featuremap_with_signature = "ugbio_mrd.intersect_featuremap_with_signature:main"
25
+ prepare_data_from_mrd_pipeline = "ugbio_mrd.prepare_data_from_mrd_pipeline:main"
26
+
27
+ [tool.uv.sources.ugbio_core]
28
+ workspace = true
29
+
30
+ [tool.uv.sources.ugbio_ppmseq]
31
+ workspace = true
32
+
33
+ [tool.uv.sources.ugbio_featuremap]
34
+ workspace = true
35
+
36
+ [build-system]
37
+ requires = [
38
+ "setuptools>=61.0",
39
+ ]
40
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
1
+ import filecmp
2
+ from os.path import join as pjoin
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+ from ugbio_mrd.mrd_utils import intersect_featuremap_with_signature
7
+
8
+
9
@pytest.fixture
def resources_dir():
    """Return the ``resources`` directory that sits next to this test module."""
    test_module_dir = Path(__file__).parent
    return test_module_dir / "resources"
12
+
13
+
14
def test_intersect_featuremap_with_signature(tmpdir, resources_dir):
    """Intersect the chr22 featuremap with its signature and check the result.

    The previous version called ``filecmp.cmp`` but discarded its return value,
    so the test passed no matter what the intersection produced. The comparison
    is now asserted.

    A byte-for-byte match is accepted first. If the files differ, the variant
    records (every line not starting with ``#``) are compared instead.
    NOTE(review): the fallback assumes both files are gzip/bgzip-compressed VCF
    text and that header lines may legitimately differ between runs - confirm.
    """
    import gzip  # local import: only needed by this test's fallback comparison

    def _variant_records(vcf_gz_path):
        # Read the compressed VCF as text and drop header/meta lines.
        with gzip.open(vcf_gz_path, "rt") as handle:
            return [line for line in handle if not line.startswith("#")]

    signature = pjoin(
        resources_dir,
        "150382-BC04.filtered_signature.chr22_12693463.vcf.gz",
    )
    featuremap = pjoin(
        resources_dir,
        "featuremap_150419-BC04.sorted.chr22_12693463.vcf.gz",
    )
    expected_intersection = pjoin(
        resources_dir,
        "featuremap_150419-BC04.sorted.chr22_12693463.intersection.vcf.gz",
    )
    output_intersection = pjoin(tmpdir, "intersection.vcf.gz")

    intersect_featuremap_with_signature(
        featuremap_file=featuremap,
        signature_file=signature,
        output_intersection_file=output_intersection,
    )

    # shallow=False forces a content comparison instead of trusting os.stat() signatures.
    identical_bytes = filecmp.cmp(output_intersection, expected_intersection, shallow=False)
    assert identical_bytes or (
        _variant_records(output_intersection) == _variant_records(expected_intersection)
    ), "intersection output differs from the expected intersection VCF"
@@ -0,0 +1,171 @@
1
+ import subprocess
2
+ from os.path import join as pjoin
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pytest
8
+ from pandas.testing import assert_frame_equal
9
+
10
+ from ugbio_mrd.mrd_utils import (
11
+ generate_synthetic_signatures,
12
+ intersect_featuremap_with_signature,
13
+ read_intersection_dataframes,
14
+ read_signature,
15
+ )
16
# Common stem of the intersection fixture files in ``resources``; the tests below
# append ".expected_output.parquet" / ".parsed.expected_output.parquet" to it.
intersection_file_basename = (
    "MRD_test_subsample."
    "MRD_test_subsample_annotated_AF_vcf_gz_mrd_quality_snvs."
    "intersection"
)
17
+
18
+
19
@pytest.fixture
def resources_dir():
    """Provide the path of the ``resources`` folder beside this test file."""
    here = Path(__file__).parent
    return here / "resources"
22
+
23
+
24
+
25
+
26
+
27
def _assert_read_signature(signature, expected_signature, expected_columns=None, possibly_null_columns=None):
    """Assert that a parsed signature dataframe matches the expected one.

    Parameters
    ----------
    signature
        Dataframe under test.
    expected_signature
        Reference dataframe to compare against, column by column.
    expected_columns
        Columns that must be present in ``signature``. A falsy value selects
        ``["ref", "alt", "id", "qual", "af"]``.
    possibly_null_columns
        Columns that only need to exist; their values are not checked. A falsy
        value selects ``["id", "qual"]``.

    Raises
    ------
    AssertionError
        If a column is missing, a checked column is entirely null, or a checked
        column differs from the expected values.

    Notes
    -----
    Values are compared exactly first. ``np.allclose`` is tried only when both
    columns are numeric, so a mismatch in a text column fails with an
    ``AssertionError`` rather than a ``TypeError`` raised inside numpy.
    """
    if not expected_columns:
        expected_columns = ["ref", "alt", "id", "qual", "af"]
    if not possibly_null_columns:
        possibly_null_columns = ["id", "qual"]

    for column in expected_columns:
        assert column in signature.columns, f"missing column {column!r}"
        if column in possibly_null_columns:
            # Presence is all that is required for nullable columns.
            continue

        actual = signature[column]
        expected = expected_signature[column]
        assert not actual.isnull().all(), f"column {column!r} is entirely null"

        if (actual == expected).all():
            continue
        # Exact comparison failed: tolerate floating-point noise, numeric data only.
        both_numeric = pd.api.types.is_numeric_dtype(actual) and pd.api.types.is_numeric_dtype(expected)
        assert both_numeric and np.allclose(actual, expected), (
            f"column {column!r} differs from the expected signature"
        )
44
+
45
+
46
def test_read_signature_ug_mutect(tmpdir, resources_dir):
    """Read the Mutect signature fixtures and compare them with the stored HDF5 output.

    Two inputs are checked against the same expected dataframe: the regular VCF
    and a variant whose sample name cannot be deduced from the header.
    """
    annotated_columns = [
        "ref",
        "alt",
        "id",
        "qual",
        "af",
        "depth_tumor_sample",
        "cycle_skip_status",
        "gc_content",
        "left_motif",
        "right_motif",
        "mutation_type",
    ]

    with_sample_name = read_signature(
        pjoin(resources_dir, "mutect_mrd_signature_test.vcf.gz"),
        return_dataframes=True,
    )
    # make sure we can read the dataframe even if the sample name could not be deduced from the header
    without_sample_name = read_signature(
        pjoin(resources_dir, "mutect_mrd_signature_test.no_sample_name.vcf.gz"),
        return_dataframes=True,
    )
    reference = pd.read_hdf(pjoin(resources_dir, "mutect_mrd_signature_test.expected_output.h5"))

    _assert_read_signature(with_sample_name, reference, expected_columns=list(annotated_columns))
    _assert_read_signature(
        without_sample_name,
        reference,
        expected_columns=list(annotated_columns),
        possibly_null_columns=["id", "qual", "depth_tumor_sample", "af"],
    )
88
+
89
+
90
def test_read_signature_ug_dv(tmpdir, resources_dir):
    """Read the ``dv_mrd_signature_test`` VCF fixture and compare it with its stored HDF5 output."""
    vcf_path = pjoin(resources_dir, "dv_mrd_signature_test.vcf.gz")
    reference_path = pjoin(resources_dir, "dv_mrd_signature_test.expected_output.h5")

    parsed = read_signature(vcf_path, return_dataframes=True)
    reference = pd.read_hdf(reference_path)

    columns_to_check = ["ref", "alt", "id", "qual", "af"]
    columns_to_check += [
        "depth_tumor_sample",
        "cycle_skip_status",
        "gc_content",
        "left_motif",
        "right_motif",
        "mutation_type",
    ]
    _assert_read_signature(parsed, reference, expected_columns=columns_to_check)
110
+
111
+
112
def test_read_signature_external(resources_dir):
    """Read the external somatic signature fixture and check the helper's default columns."""
    vcf_path = pjoin(resources_dir, "external_somatic_signature.vcf.gz")
    reference_path = pjoin(resources_dir, "external_somatic_signature.expected_output.h5")

    parsed = read_signature(vcf_path, return_dataframes=True)
    reference = pd.read_hdf(reference_path)

    # No column lists passed: the helper falls back to its defaults.
    _assert_read_signature(parsed, reference)
117
+
118
+
119
def test_intersect_featuremap_with_signature(tmpdir, resources_dir):
    """Intersect a featuremap with a signature and compare the records with the stored result.

    Both VCFs are dumped with ``bcftools view -H`` (records only, no header) and
    the raw outputs are compared.

    ``bcftools`` is invoked with an argument list instead of a ``shell=True``
    command string, so paths containing spaces or shell metacharacters (for
    example inside the pytest temporary directory) are passed through verbatim.
    """

    def _records_without_header(vcf_path):
        # -H suppresses the header so only the variant records are compared.
        return subprocess.check_output(["bcftools", "view", "-H", str(vcf_path)])

    signature_file = pjoin(resources_dir, "Pa_46.FreshFrozen.chr20.70039_70995.vcf.gz")
    featuremap_file = pjoin(resources_dir, "Pa_46.bsDNA.chr20_sample.vcf.gz")
    test_file = pjoin(resources_dir, "intersected_featuremap.vcf.gz")

    output_intersection_file = pjoin(tmpdir, "intersected.vcf.gz")
    intersect_featuremap_with_signature(
        featuremap_file,
        signature_file,
        output_intersection_file=output_intersection_file,
    )

    assert _records_without_header(output_intersection_file) == _records_without_header(test_file)
133
+
134
+
135
def test_read_intersection_dataframes(tmpdir, resources_dir):
    """Check ``read_intersection_dataframes`` with a single path and with a one-element list.

    Both call styles must yield, after ``reset_index()``, the dataframe stored in
    the ``.parsed.expected_output.parquet`` fixture.
    """
    input_parquet = pjoin(resources_dir, f"{intersection_file_basename}.expected_output.parquet")
    expected_parquet = pjoin(resources_dir, f"{intersection_file_basename}.parsed.expected_output.parquet")

    from_single_path = read_intersection_dataframes(input_parquet, return_dataframes=True)
    expected = pd.read_parquet(expected_parquet)
    from_path_list = read_intersection_dataframes([input_parquet], return_dataframes=True)

    for parsed in (from_single_path, from_path_list):
        assert_frame_equal(parsed.reset_index(), expected)
155
+
156
+
157
def test_generate_synthetic_signatures(tmpdir, resources_dir):
    """Generate one synthetic signature and compare its grouped counts with the stored fixture."""

    def _grouped_counts(frame):
        # Value counts within each (ref, alt) group.
        return frame.groupby(["ref", "alt"]).value_counts()

    signature_file = pjoin(resources_dir, "mutect_mrd_signature_test.vcf.gz")
    db_file = pjoin(
        resources_dir,
        "pancan_pcawg_2020.mutations_hg38_GNOMAD_dbsnp_beds.sorted.Annotated.HMER_LEN.edited.chr19.vcf.gz",
    )

    generated_paths = generate_synthetic_signatures(
        signature_vcf=signature_file,
        db_vcf=db_file,
        n_synthetic_signatures=1,
        output_dir=tmpdir,
    )
    generated = read_signature(generated_paths[0], return_dataframes=True)
    reference = read_signature(pjoin(resources_dir, "synthetic_signature_test.vcf.gz"), return_dataframes=True)

    # test that motif distribution is the same (0th order)
    counts_match = _grouped_counts(generated) == _grouped_counts(reference)
    assert counts_match.all()
File without changes
@@ -0,0 +1,84 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2022 Ultima Genomics Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # DESCRIPTION
16
+ # Generates multiple synthetic signatures from a database, with the same
17
+ # trinucleotide substitution context as the input signature.
18
+ # CHANGELOG in reverse chronological order
19
+ import argparse
20
+ import sys
21
+
22
+ from ugbio_mrd.mrd_utils import generate_synthetic_signatures
23
+
24
+
25
def __parse_args(argv: list[str]) -> argparse.Namespace:
    """Build the command-line parser and parse ``argv``.

    ``argv`` is expected in ``sys.argv`` form: element 0 is skipped and only
    ``argv[1:]`` is handed to argparse. The parser description is taken from
    ``run.__doc__``.
    """
    cli = argparse.ArgumentParser(prog="generate_synthetic_signature_from_db", description=run.__doc__)
    add_option = cli.add_argument

    # Mandatory inputs.
    add_option("-s", "--signature_vcf", type=str, required=True, help="""Signature vcf file""")
    add_option("-db", "--db_vcf", type=str, required=True, help="""Database vcf file (for example, PCAWG)""")
    add_option(
        "-n",
        "--n_synthetic_signatures",
        type=int,
        required=True,
        help="""Number of synthetic signatures to generate""",
    )

    # Optional inputs; both end up as None when omitted.
    add_option(
        "-r",
        "--ref_fasta",
        type=str,
        required=False,
        help="reference fasta file, default None. Required if input vcf is not annotated with left and right motifs "
        "X_LM and X_RM",
    )
    add_option(
        "-o",
        "--output_dir",
        type=str,
        default=None,
        required=False,
        help="""Output directory for synthetic signatures""",
    )

    user_args = argv[1:]
    return cli.parse_args(user_args)
65
+
66
+
67
def run(argv):
    """Generates multiple synthetic signatures from a database,
    with the same trinucleotide substitution context as the input signature"""
    # The docstring above doubles as the argparse description (see __parse_args),
    # so its text is user-facing and is kept as is.
    parsed = __parse_args(argv)
    # Every CLI destination maps onto the keyword argument of the same name.
    forwarded = (
        "signature_vcf",
        "db_vcf",
        "n_synthetic_signatures",
        "output_dir",
        "ref_fasta",
    )
    generate_synthetic_signatures(**{name: getattr(parsed, name) for name in forwarded})
78
+
79
def main():
    """Entry point: forward the process arguments (``sys.argv``) to ``run``."""
    process_argv = sys.argv
    run(process_argv)


if __name__ == "__main__":
    main()
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2022 Ultima Genomics Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # DESCRIPTION
16
+ # Intersects featuremap vcf-like with pre-defined signature VCF-like
17
+ # CHANGELOG in reverse chronological order
18
+ import argparse
19
+ import sys
20
+
21
+ from ugbio_mrd.mrd_utils import intersect_featuremap_with_signature
22
+
23
+
24
def __parse_args(argv: list[str]) -> argparse.Namespace:
    """Build the command-line parser and parse ``argv``.

    ``argv`` is expected in ``sys.argv`` form: element 0 is skipped and only
    ``argv[1:]`` is handed to argparse. The parser description is taken from
    ``run.__doc__``.
    """
    cli = argparse.ArgumentParser(prog="intersect_with_signature", description=run.__doc__)
    add_option = cli.add_argument

    # Mandatory inputs.
    add_option("-f", "--featuremap", type=str, required=True, help="""Featuremap vcf file""")
    add_option("-s", "--signature", type=str, required=True, help="""Signature vcf file""")

    # Optional settings, both defaulting to None.
    add_option(
        "-o",
        "--output",
        type=str,
        default=None,
        required=False,
        help="""Output intersection vcf file (lines from featuremap propagated)""",
    )
    add_option(
        "-stype",
        "--signature_type",
        type=str,
        default=None,
        required=False,
        help="""matched, control or db_control""",
    )

    user_args = argv[1:]
    return cli.parse_args(user_args)
57
+
58
+
59
def run(argv):
    """Intersect featuremap and signature vcf files on position and matching ref and alts"""
    # The docstring above doubles as the argparse description (see __parse_args),
    # so its text is user-facing and is kept as is.
    parsed = __parse_args(argv)
    # Map CLI destinations onto the keyword names of the library function.
    call_kwargs = {
        "featuremap_file": parsed.featuremap,
        "signature_file": parsed.signature,
        "output_intersection_file": parsed.output,
        "signature_type": parsed.signature_type,
    }
    intersect_featuremap_with_signature(**call_kwargs)
68
+
69
+
70
def main():
    """Entry point: forward the process arguments (``sys.argv``) to ``run``."""
    process_argv = sys.argv
    run(process_argv)


if __name__ == "__main__":
    main()