phc-ingestion 0.8.25__py3-none-any.whl → 0.8.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestion/nextgen/util/nextgen_specific_genes.py +28 -0
- ingestion/nextgen/util/pre_filter_somatic_vcf.py +29 -14
- ingestion/nextgen/util/process_structural.py +21 -9
- {phc_ingestion-0.8.25.dist-info → phc_ingestion-0.8.27.dist-info}/METADATA +1 -1
- {phc_ingestion-0.8.25.dist-info → phc_ingestion-0.8.27.dist-info}/RECORD +6 -5
- {phc_ingestion-0.8.25.dist-info → phc_ingestion-0.8.27.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import TypedDict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class GeneWithLocation(TypedDict):
    """A gene symbol paired with the genomic interval it is matched against."""

    # Gene symbol returned when a position falls inside the interval.
    gene: str
    # Chromosome name; the entries in this module use the "chr" prefix (e.g. "chr8").
    chr: str
    # Lower bound of the interval (compared inclusively by the lookup in this module).
    start: int
    # Upper bound of the interval (compared inclusively by the lookup in this module).
    end: int
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Genes of interest with explicit intervals. A position inside one of these
# intervals is reported as that gene by maybe_get_matching_gene_for_location,
# which scans the list in order and returns the first match.
# NOTE(review): coordinates are presumably GRCh38 (the structural-variant code
# falls back to a GRCh38 lookup) -- confirm against the source of these ranges.
nextgen_specific_genes: list[GeneWithLocation] = [
    GeneWithLocation(gene="IGK", chr="chr2", start=88852034, end=90258119),
    GeneWithLocation(gene="NSD2", chr="chr4", start=1792518, end=1940193),
    GeneWithLocation(gene="CCND3", chr="chr6", start=41920534, end=42562008),
    GeneWithLocation(gene="MYC", chr="chr8", start=125309416, end=129673293),
    GeneWithLocation(gene="CCND1", chr="chr11", start=69090733, end=69656860),
    GeneWithLocation(gene="IGH", chr="chr14", start=105578834, end=109902208),
    GeneWithLocation(gene="MAF", chr="chr16", start=78428398, end=79615096),
    GeneWithLocation(gene="MAFB", chr="chr20", start=39039005, end=40688948),
    GeneWithLocation(gene="IGL", chr="chr22", start=22012552, end=22965858),
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def maybe_get_matching_gene_for_location(chr: str, position: int) -> str | None:
    """
    Look up which gene of interest, if any, contains the given position.

    Entries of `nextgen_specific_genes` are checked in list order; an entry
    matches when its chromosome equals `chr` and `position` lies within its
    `start`..`end` range, both ends inclusive.

    Returns the gene symbol of the first matching entry, or None when no
    entry matches.
    """
    matching_symbols = (
        entry["gene"]
        for entry in nextgen_specific_genes
        if entry["chr"] == chr and entry["start"] <= position <= entry["end"]
    )
    # The generator is lazy, so evaluation stops at the first hit.
    return next(matching_symbols, None)
|
|
@@ -9,6 +9,17 @@ def build_variant_key_from_vcf_line(line: str) -> str:
|
|
|
9
9
|
return f"{chrom}:{pos}:{ref}:{alt}"
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
def extract_filter_from_vcf_line(line: str) -> str:
    """
    Return the FILTER column (the 7th tab-separated field) of a VCF data line.

    Only the trailing line terminator is removed before splitting. Stripping
    all surrounding whitespace would also remove leading/trailing tabs, which
    shifts the columns whenever an edge field is empty.

    Raises:
        IndexError: if the line has fewer than seven tab-separated fields.
    """
    # maxsplit=7 stops right after FILTER, so INFO/FORMAT/sample columns are
    # left as one unsplit remainder instead of being split for nothing.
    fields = line.rstrip("\r\n").split("\t", 7)
    return fields[6]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def replace_filter_in_vcf_line(line: str, new_filter: str) -> str:
    """
    Return the VCF data line with its FILTER column replaced by `new_filter`.

    Every other column is passed through unchanged, including empty trailing
    columns. The result always ends with a single "\\n", whatever line
    terminator (if any) the input carried.

    Raises:
        IndexError: if the line has fewer than seven tab-separated fields.
    """
    # Strip only the line terminator: a bare strip() would also drop trailing
    # tabs (empty columns) and leading whitespace, changing the record.
    # maxsplit=7 keeps everything after FILTER as one untouched remainder.
    fields = line.rstrip("\r\n").split("\t", 7)
    fields[6] = new_filter
    return "\t".join(fields) + "\n"
|
|
21
|
+
|
|
22
|
+
|
|
12
23
|
def pre_filter_somatic_vcf(
|
|
13
24
|
somatic_vcf_file: str,
|
|
14
25
|
somatic_vcf_snv_file: str,
|
|
@@ -19,22 +30,25 @@ def pre_filter_somatic_vcf(
|
|
|
19
30
|
"""
|
|
20
31
|
Removes all variants from the `somatic_vcf_file` that are not
|
|
21
32
|
also in the `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
|
|
33
|
+
|
|
34
|
+
Also updates the FILTER field in the `somatic_vcf_file` to match
|
|
35
|
+
the FILTER field of the corresponding variant in the
|
|
36
|
+
`somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
|
|
22
37
|
"""
|
|
23
38
|
log.info("Pre-filtering somatic VCF file")
|
|
24
39
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
valid_variant_keys.add(build_variant_key_from_vcf_line(line))
|
|
40
|
+
valid_variants_with_filters: dict[str, str] = {}
|
|
41
|
+
|
|
42
|
+
for file in [somatic_vcf_snv_file, somatic_vcf_indel_file]:
|
|
43
|
+
with open_maybe_gzipped(file, "rt") as f:
|
|
44
|
+
for line in f:
|
|
45
|
+
if line.startswith("#"):
|
|
46
|
+
continue
|
|
47
|
+
valid_variants_with_filters[build_variant_key_from_vcf_line(line)] = (
|
|
48
|
+
extract_filter_from_vcf_line(line)
|
|
49
|
+
)
|
|
36
50
|
|
|
37
|
-
log.info(f"Found {len(
|
|
51
|
+
log.info(f"Found {len(valid_variants_with_filters)} valid variants")
|
|
38
52
|
|
|
39
53
|
output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
|
|
40
54
|
with (
|
|
@@ -45,8 +59,9 @@ def pre_filter_somatic_vcf(
|
|
|
45
59
|
if line.startswith("#"):
|
|
46
60
|
w.write(line)
|
|
47
61
|
else:
|
|
48
|
-
|
|
49
|
-
|
|
62
|
+
key = build_variant_key_from_vcf_line(line)
|
|
63
|
+
if key in valid_variants_with_filters:
|
|
64
|
+
w.write(replace_filter_in_vcf_line(line, valid_variants_with_filters[key]))
|
|
50
65
|
|
|
51
66
|
log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
|
|
52
67
|
return output_vcf_path
|
|
@@ -5,6 +5,7 @@ from typing import TypedDict
|
|
|
5
5
|
from ingestion.shared_util.coords_to_genes import coords_to_genes
|
|
6
6
|
from ingestion.nextgen.util.alteration_table import extract_variant_table
|
|
7
7
|
from ingestion.nextgen.util.interpretation import map_interpretation
|
|
8
|
+
from ingestion.nextgen.util.nextgen_specific_genes import maybe_get_matching_gene_for_location
|
|
8
9
|
from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
|
|
9
10
|
|
|
10
11
|
|
|
@@ -42,6 +43,23 @@ def is_del_dup_or_ins(variant: list[str]) -> bool:
|
|
|
42
43
|
return any([x in variant[2] for x in ["MantaDEL", "MantaDUP", "MantaINS"]])
|
|
43
44
|
|
|
44
45
|
|
|
46
|
+
def get_gene_from_coords(
    chromosome: str, start_position: str, end_position: str, log: Logger
) -> str:
    """
    Resolve a gene name from the midpoint of a start/end coordinate pair.

    The midpoint is first checked against the provided genes of interest
    (via `maybe_get_matching_gene_for_location`). If it falls inside one of
    those ranges, that gene is returned. Otherwise the standard GRCh38
    `coords_to_genes` lookup is used.
    """
    first = int(start_position)
    last = int(end_position)
    midpoint = int((first + last) / 2)

    # `or` falls through to the standard lookup when no gene of interest matched.
    return maybe_get_matching_gene_for_location(chromosome, midpoint) or coords_to_genes(
        "GRCh38", chromosome, midpoint, log
    )
|
|
61
|
+
|
|
62
|
+
|
|
45
63
|
def process_structural(
|
|
46
64
|
sv_in_file: str, xml_in_file, root_path: str, prefix: str, log: Logger
|
|
47
65
|
) -> str | None:
|
|
@@ -74,9 +92,7 @@ def process_structural(
|
|
|
74
92
|
effect = "insertion"
|
|
75
93
|
|
|
76
94
|
# Get genes from coordinates using center point of start and end positions
|
|
77
|
-
gene1 =
|
|
78
|
-
"GRCh38", chromosome1, int((int(start_position1) + int(end_position1)) / 2), log
|
|
79
|
-
)
|
|
95
|
+
gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
|
|
80
96
|
gene2 = "N/A"
|
|
81
97
|
|
|
82
98
|
else:
|
|
@@ -89,12 +105,8 @@ def process_structural(
|
|
|
89
105
|
effect = "translocation"
|
|
90
106
|
|
|
91
107
|
# Get genes from coordinates using center point of start and end positions
|
|
92
|
-
gene1 =
|
|
93
|
-
|
|
94
|
-
)
|
|
95
|
-
gene2 = coords_to_genes(
|
|
96
|
-
"GRCh38", chromosome2, int((int(start_position2) + int(end_position2)) / 2), log
|
|
97
|
-
)
|
|
108
|
+
gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
|
|
109
|
+
gene2 = get_gene_from_coords(chromosome2, start_position2, end_position2, log)
|
|
98
110
|
|
|
99
111
|
# Scrape interpretation
|
|
100
112
|
interpretation = "unknown"
|
|
@@ -33,10 +33,11 @@ ingestion/nextgen/process.py,sha256=GKiQ2dCxrR7tBD8TSP6Wk-TyoX3xBGip5zfpeT2buiQ,
|
|
|
33
33
|
ingestion/nextgen/util/alteration_table.py,sha256=KwpJCQv_rVsL30jkzgZn0bKdd205fjVodYBNTcK3D1s,4220
|
|
34
34
|
ingestion/nextgen/util/interpretation.py,sha256=ozuzb0vozff34zfP6AdOiUmI8Q77hI02jve_nCPZHfE,297
|
|
35
35
|
ingestion/nextgen/util/manifest_helpers.py,sha256=PpSay-pe62jk735nom1tVD9nDE8-CxmzzCrgpBhgtjY,1571
|
|
36
|
-
ingestion/nextgen/util/
|
|
36
|
+
ingestion/nextgen/util/nextgen_specific_genes.py,sha256=KcZ9HzUUobsLFM7z7oaXdnpVnY5-niYR-VSMKY985Po,1084
|
|
37
|
+
ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=K_gH4EnUXrKB22u_f8FqQVGrOS5LxXNsNO3VBn381eY,2301
|
|
37
38
|
ingestion/nextgen/util/process_cnv.py,sha256=m-AhsXFlYw4LTzgJJaj5vXYbK5n3H7cImzBxD2To6M0,2598
|
|
38
39
|
ingestion/nextgen/util/process_manifest.py,sha256=TAbCHwN_2m08jySn1J4qEd9Nrxjw9CrsspwFWack0V4,8448
|
|
39
|
-
ingestion/nextgen/util/process_structural.py,sha256=
|
|
40
|
+
ingestion/nextgen/util/process_structural.py,sha256=9xdJnSYtIT86lbhEIsPop39NGMwLHuqQGqlcQttlt64,6520
|
|
40
41
|
ingestion/nextgen/util/process_vcf.py,sha256=TvyV5wyXXSpFy2lc6h3ljqVQ5VeDPGxihHl2_K6LalQ,8432
|
|
41
42
|
ingestion/nextgen/util/types.py,sha256=SSzt5gv-kss1PR45eQUelypWrGI-dAfQMO3GSD-T-Wg,22
|
|
42
43
|
ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7lXSXkCCD8,408290
|
|
@@ -53,6 +54,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
|
|
|
53
54
|
ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
|
|
54
55
|
ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
|
|
55
56
|
ingestion/vcf_standardization/util/read_write.py,sha256=IQotJ27To1MoQcRstc5AbHZtUuJz5cqkkZiHsDNaBvI,2471
|
|
56
|
-
phc_ingestion-0.8.
|
|
57
|
-
phc_ingestion-0.8.
|
|
58
|
-
phc_ingestion-0.8.
|
|
57
|
+
phc_ingestion-0.8.27.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
|
|
58
|
+
phc_ingestion-0.8.27.dist-info/METADATA,sha256=vNzI7puyTHjuXFU7CBTQ4tUOiXxHF3aYUWAPgnFnA0Y,552
|
|
59
|
+
phc_ingestion-0.8.27.dist-info/RECORD,,
|
|
File without changes
|