phc-ingestion 0.8.31__py3-none-any.whl → 0.8.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestion/nextgen/process.py +37 -29
- ingestion/nextgen/util/alteration_table.py +138 -92
- ingestion/nextgen/util/manifest_helpers.py +0 -11
- ingestion/nextgen/util/nextgen_specific_genes.py +2 -2
- ingestion/nextgen/util/pre_filter_somatic_vcf.py +37 -6
- ingestion/nextgen/util/process_cnv.py +18 -22
- ingestion/nextgen/util/process_manifest.py +11 -10
- ingestion/nextgen/util/process_structural.py +64 -66
- ingestion/nextgen/util/process_vcf.py +23 -33
- {phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/METADATA +2 -2
- {phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/RECORD +12 -12
- {phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/WHEEL +0 -0
ingestion/nextgen/process.py
CHANGED
@@ -2,6 +2,7 @@ from lifeomic_logging import scoped_logger
 from typing import Any, TypedDict
 from ruamel.yaml import YAML
 
+from ingestion.nextgen.util.alteration_table import extract_variant_table_rows_and_hyperdiploidy
 from ingestion.nextgen.util.pre_filter_somatic_vcf import pre_filter_somatic_vcf
 from ingestion.nextgen.util.process_cnv import process_cnv
 from ingestion.nextgen.util.process_manifest import process_manifest
@@ -36,54 +37,61 @@ def process(
         "projectId": project_id,
         "archiveFileId": source_file_id,
         "caseId": case_id,
-        "…
+        "ingestionId": ingestion_id,
     }
     with scoped_logger(__name__, log_context) as log:
+        (
+            short_variant_table_rows,
+            copy_number_variant_table_rows,
+            structural_variant_table_rows,
+            hyperdiploidy_chromosomes,
+        ) = extract_variant_table_rows_and_hyperdiploidy(vendor_files["xmlFile"], log)
         cnv_path_name = process_cnv(
-            …
-            log
+            vendor_files["somaticCnvTxtFile"],
+            copy_number_variant_table_rows,
+            local_output_dir,
+            case_id,
+            log,
         )
         structural_path_name, translocations = process_structural(
-            …
-            log
+            vendor_files["somaticSvVcfFile"],
+            structural_variant_table_rows,
+            local_output_dir,
+            case_id,
+            log,
         )
         manifest = process_manifest(
-            …
-            source_file_id
-            …
+            vendor_files["xmlFile"],
+            source_file_id,
+            case_id,
+            bool(cnv_path_name),
+            bool(structural_path_name),
+            translocations,
+            hyperdiploidy_chromosomes,
         )
         pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
             vendor_files["somaticVcfFile"],
             vendor_files["somaticVcfSnvFile"],
             vendor_files["somaticVcfIndelFile"],
+            short_variant_table_rows,
             local_output_dir,
             log,
         )
         somatic_vcf_meta_data = process_vcf(
-            …
-            case_id
-            …
+            pre_filtered_somatic_vcf_path,
+            local_output_dir,
+            case_id,
+            "somatic",
+            short_variant_table_rows,
             log=log,
         )
         germline_vcf_meta_data = process_vcf(
-            …
-            case_id
-            …
-            log
+            vendor_files["germlineVcfFile"],
+            local_output_dir,
+            case_id,
+            "germline",
+            short_variant_table_rows,
+            log,
         )
 
         manifest_path_name = f"{local_output_dir}/{case_id}.ga4gh.genomics.yml"
ingestion/nextgen/util/alteration_table.py
CHANGED
@@ -1,10 +1,41 @@
 from logging import Logger
-import pandas as pd
 import re
-from typing import …
+from typing import TypedDict, Generic, TypeVar
 
 
-…
+T = TypeVar("T")
+
+
+class AlterationTableRow(Generic[T], TypedDict):
+    gene: T
+    type: str
+    description: str
+    vaf: str
+    info: str
+
+
+class ShortVariantGene(TypedDict):
+    chr: str
+    pos: int
+
+
+class CopyNumberVariantGene(TypedDict):
+    gene: str
+    chr: str
+    start: int
+    end: int
+
+
+class StructuralVariantGene(TypedDict):
+    gene1: str
+    chr1: str
+    pos1: int
+    gene2: str
+    chr2: str
+    pos2: int
+
+
+base_short_variant_types: list[str] = [
     "Missense",
     "Frameshift",
     "Stop gained",
@@ -20,12 +51,23 @@ short_variant_types: list[str] = [
 ]
 
 
+def get_short_variant_types() -> list[str]:
+    # For multi-word short variant types, sometimes the spaces are not included
+    short_variant_types: list[str] = []
+    for short_variant_type in base_short_variant_types:
+        short_variant_types.append(short_variant_type)
+        if " " in short_variant_type:
+            short_variant_types.append(short_variant_type.replace(" ", ""))
+
+    return short_variant_types
+
+
 def extract_all_table_lines(xml_in_file: str) -> list[str]:
     with open(xml_in_file, "r") as f:
         xml_lines = f.readlines()
 
     in_range_trigger = False
-    table_lines = []
+    table_lines: list[str] = []
     for line in xml_lines:
         if "Gene (Chr. Position, hg38)" in line:
             in_range_trigger = True
@@ -37,7 +79,7 @@ def extract_all_table_lines(xml_in_file: str) -> list[str]:
     return table_lines
 
 
-def …
+def extract_alteration_table_rows(xml_in_file: str, log: Logger) -> list[AlterationTableRow[str]]:
     table_lines = extract_all_table_lines(xml_in_file)
     # Remove completely empty lines
     table_lines = [line for line in table_lines if line.strip() != ""]
@@ -49,90 +91,94 @@ def extract_alteration_table(xml_in_file: str, log: Logger) -> pd.DataFrame:
             if current_row:
                 table_row_lines.append(current_row)
             current_row = []
-        line = re.sub(r"…
-        line…
-…
+        line = re.sub(r"<\/?T.\/?>", "", line).strip()
+        if line and line != "p.":
+            current_row.append(line)
+
+    alteration_table_rows: list[AlterationTableRow[str]] = []
+
+    # Skip the first row which is the header
+    for row in table_row_lines[1:]:
+        # Sometimes the alteration table is "empty", in which case the `type` column will only contain "NA" values
+        if row[1] == "NA":
+            continue
+        alteration_table_rows.append(
+            {
+                "gene": row[0],
+                "type": row[1],
+                "description": row[2],
+                "vaf": row[3],
+                # Sometimes the info column is empty, so we need to check if it actually exists
+                # So far, it seems like rows with empty "info" columns are generally not useful for us
+                # and the data in them will not be used anywhere, so we just fill in an empty string
+                "info": row[4] if len(row) > 4 else "",
+            }
+        )
+
+    return alteration_table_rows
+
+
+def parse_short_variant_gene(gene: str) -> ShortVariantGene:
+    pattern = r"^.*\((?P<chr>chr\d+|chrX|chrY):(?P<pos>\d+).*\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for short variant")
+    return {"chr": match.group("chr"), "pos": int(match.group("pos"))}
+
+
+def parse_copy_number_variant_gene(gene: str) -> CopyNumberVariantGene:
+    pattern = r"^(?P<gene>[A-Z1-9]*).*?\((?P<chr>chr\d+|chrX|chrY):(?P<start>\d+)_(?P<end>\d+)\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for copy number variant")
+    return {
+        "gene": match.group("gene"),
+        "chr": match.group("chr"),
+        "start": int(match.group("start")),
+        "end": int(match.group("end")),
+    }
+
+
+def parse_structural_variant_gene(gene: str) -> StructuralVariantGene:
+    pattern = r"^(?P<gene1>[A-Z1-9]*)(-|\/)(?P<gene2>[A-Z1-9]*).*\(.*(?P<chr1>chr\d+|chrX|chrY):(?P<pos1>\d+).*;.*(?P<chr2>chr\d+|chrX|chrY):(?P<pos2>\d+).*\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for structural variant")
+    return {
+        "gene1": match.group("gene1"),
+        "chr1": match.group("chr1"),
+        "pos1": int(match.group("pos1")),
+        "gene2": match.group("gene2"),
+        "chr2": match.group("chr2"),
+        "pos2": int(match.group("pos2")),
+    }
+
+
+def extract_variant_table_rows_and_hyperdiploidy(xml_in_file: str, log: Logger) -> tuple[
+    list[AlterationTableRow[ShortVariantGene]],
+    list[AlterationTableRow[CopyNumberVariantGene]],
+    list[AlterationTableRow[StructuralVariantGene]],
+    list[str] | None,
+]:
+    alteration_table_rows = extract_alteration_table_rows(xml_in_file, log)
+
+    short_variant_rows: list[AlterationTableRow[ShortVariantGene]] = []
+    copy_number_rows: list[AlterationTableRow[CopyNumberVariantGene]] = []
+    structural_variant_rows: list[AlterationTableRow[StructuralVariantGene]] = []
+    hyperdiploidy_chromosomes: list[str] | None = None
+
+    short_variant_types = get_short_variant_types()
+
+    for row in alteration_table_rows:
+        if row["type"] in short_variant_types:
+            short_variant_rows.append({**row, "gene": parse_short_variant_gene(row["gene"])})
+        elif row["type"] == "CNV":
+            copy_number_rows.append({**row, "gene": parse_copy_number_variant_gene(row["gene"])})
+        elif row["type"] == "Translocation":
+            structural_variant_rows.append(
+                {**row, "gene": parse_structural_variant_gene(row["gene"])}
+            )
+        elif row["type"] == "Hyperdiploidy":
+            hyperdiploidy_chromosomes = re.findall(r"\d+", row["gene"])
+
+    return short_variant_rows, copy_number_rows, structural_variant_rows, hyperdiploidy_chromosomes
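Note: the gene strings parsed by the new helpers come from the vendor alteration table, and their exact format is not shown in this diff. The following is a minimal sketch of inputs that the added regular expressions would accept; the gene symbols and coordinates are invented for illustration only.

    from ingestion.nextgen.util.alteration_table import (
        parse_short_variant_gene,
        parse_copy_number_variant_gene,
        parse_structural_variant_gene,
    )

    # Short variant: any text containing "(chrN:pos ...)" (hypothetical values)
    parse_short_variant_gene("KRAS (chr12:25245350 c.35G>A)")
    # -> {"chr": "chr12", "pos": 25245350}

    # Copy number variant: gene symbol followed by "(chrN:start_end)" (hypothetical values)
    parse_copy_number_variant_gene("CDKN2A (chr9:21967752_21995301)")
    # -> {"gene": "CDKN2A", "chr": "chr9", "start": 21967752, "end": 21995301}

    # Translocation: "GENE1-GENE2 (... chrA:pos1 ...; ... chrB:pos2 ...)" (hypothetical values)
    parse_structural_variant_gene("IGH-MYC (chr14:105600000; chr8:127700000)")
    # -> {"gene1": "IGH", "chr1": "chr14", "pos1": 105600000,
    #     "gene2": "MYC", "chr2": "chr8", "pos2": 127700000}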
ingestion/nextgen/util/manifest_helpers.py
CHANGED
@@ -1,5 +1,3 @@
-from ingestion.nextgen.util.alteration_table import extract_hyperdiploidy_row
-
 from logging import Logger
 import re
 
@@ -42,12 +40,3 @@ def parse_report_date(line: str) -> str:
     return parse_pattern(
         r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
     )
-
-
-def extract_hyperdiploidy_chromosomes(xml_in_file: str, log: Logger) -> list[str] | None:
-    hyperdiploidy_row_dict = extract_hyperdiploidy_row(xml_in_file, log)
-
-    if not hyperdiploidy_row_dict:
-        return None
-
-    return re.findall(r"\d+", hyperdiploidy_row_dict["gene"])
ingestion/nextgen/util/nextgen_specific_genes.py
CHANGED
@@ -14,7 +14,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
     {"gene": "CCND3", "chr": "chr6", "start": 41920534, "end": 42562008},
     {"gene": "MYC", "chr": "chr8", "start": 125309416, "end": 129673293},
     {"gene": "CCND1", "chr": "chr11", "start": 69090733, "end": 69656860},
-    {"gene": "IGH", "chr": "chr14", "start": …
+    {"gene": "IGH", "chr": "chr14", "start": 105516968, "end": 109902208},
     {"gene": "MAF", "chr": "chr16", "start": 78428398, "end": 79615096},
     {"gene": "MAFB", "chr": "chr20", "start": 39039005, "end": 40688948},
     {"gene": "IGL", "chr": "chr22", "start": 22012552, "end": 22965858},
@@ -22,7 +22,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
 nextgen_specific_genes: set[str] = {gene["gene"] for gene in nextgen_specific_genes_with_location}
 
 
-def …
+def maybe_get_nextgen_specific_gene(chr: str, position: int) -> str | None:
     for gene in nextgen_specific_genes_with_location:
         if gene["chr"] == chr and gene["start"] <= position <= gene["end"]:
             return gene["gene"]
ingestion/nextgen/util/pre_filter_somatic_vcf.py
CHANGED
@@ -1,5 +1,6 @@
 from logging import Logger
 
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
 from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
 
 
@@ -14,26 +15,54 @@ def extract_filter_from_vcf_line(line: str) -> str:
     return split_line[6]
 
 
-def …
+def replace_filter_in_line(line: str, new_filter: str) -> str:
     split_line = line.strip().split("\t")
     split_line[6] = new_filter
     return "\t".join(split_line) + "\n"
 
 
+def is_line_in_alteration_table(
+    line: str, short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]]
+) -> bool:
+    """
+    Returns True if the line in the VCF appears in
+    the alteration table, False otherwise.
+
+    Matching in the alteration table is less strict than in the
+    VCF files; we only need to match chromosome and position.
+
+    Also position may differ by +1 or -1, as deletion and insertion positions
+    are represented differently in the VCF and the alteration table.
+    """
+    split_line = line.strip().split("\t")
+    chrom, pos = split_line[0], int(split_line[1])
+
+    for row in short_variant_table_rows:
+        ref_chrom, ref_pos = row["gene"]["chr"], row["gene"]["pos"]
+
+        if ref_chrom == chrom and (abs(ref_pos - pos) <= 1):
+            return True
+
+    return False
+
+
 def pre_filter_somatic_vcf(
     somatic_vcf_file: str,
     somatic_vcf_snv_file: str,
     somatic_vcf_indel_file: str,
+    short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
     working_dir: str,
     log: Logger,
 ) -> str:
     """
     Removes all variants from the `somatic_vcf_file` that are not
-    also in the `somatic_vcf_snv_file…
+    also in the `somatic_vcf_snv_file`, the `somatic_vcf_indel_file`,
+    or the alteration table.
 
     Also updates the FILTER field in the `somatic_vcf_file` to match
     the FILTER field of the corresponding variant in the
     `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
+    For variants in the alteration table, the original FILTER field is kept.
     """
     log.info("Pre-filtering somatic VCF file")
 
@@ -48,20 +77,22 @@ def pre_filter_somatic_vcf(
             extract_filter_from_vcf_line(line)
         )
 
-    log.info(f"Found {len(valid_variants_with_filters)} valid variants")
+    log.info(f"Found {len(valid_variants_with_filters)} valid variants in the SNV and INDEL files")
 
     output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
     with (
-        open_maybe_gzipped(somatic_vcf_file, "rt") as …
+        open_maybe_gzipped(somatic_vcf_file, "rt") as r,
         open_maybe_gzipped(output_vcf_path, "wt") as w,
     ):
-        for line in …
+        for line in r:
             if line.startswith("#"):
                 w.write(line)
             else:
                 key = build_variant_key_from_vcf_line(line)
                 if key in valid_variants_with_filters:
-                    w.write(…
+                    w.write(replace_filter_in_line(line, valid_variants_with_filters[key]))
+                elif is_line_in_alteration_table(line, short_variant_table_rows):
+                    w.write(line)
 
     log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
     return output_vcf_path
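Note: a minimal sketch of the new alteration-table fallback in isolation, using an invented table row and VCF line; it illustrates the ±1 position tolerance documented in is_line_in_alteration_table.

    from ingestion.nextgen.util.pre_filter_somatic_vcf import is_line_in_alteration_table

    # Hypothetical alteration-table row and VCF data line (values are illustrative only)
    row = {
        "gene": {"chr": "chr17", "pos": 7675088},
        "type": "Missense",
        "description": "",
        "vaf": "",
        "info": "",
    }
    vcf_line = "chr17\t7675089\t.\tC\tT\t.\tPASS\t.\n"  # position off by one from the table row

    assert is_line_in_alteration_table(vcf_line, [row])  # chr matches and |7675089 - 7675088| <= 1
    assert not is_line_in_alteration_table("chr1\t100\t.\tA\tG\t.\tPASS\t.\n", [row])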
ingestion/nextgen/util/process_cnv.py
CHANGED
@@ -1,20 +1,21 @@
 import pandas as pd
 from logging import Logger
 
-from ingestion.nextgen.util.alteration_table import …
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, CopyNumberVariantGene
 from ingestion.nextgen.util.interpretation import map_interpretation
 
 
 def process_cnv(
-    …
+    cnv_in_file: str,
+    copy_number_variant_table_rows: list[AlterationTableRow[CopyNumberVariantGene]],
+    output_dir: str,
+    case_id: str,
+    log: Logger,
 ) -> str | None:
-    copy_number_path_name = f"{…
-    sample_id = …
+    copy_number_path_name = f"{output_dir}/{case_id}.copynumber.csv"
+    sample_id = case_id
 
-    copy_number_variant_rows = []
-    copy_number_variant_table = extract_variant_table(
-        xml_in_file=xml_in_file, variant_type="copy number", log=log
-    )
+    copy_number_variant_rows: list[str] = []
 
     with open(cnv_in_file, "r") as f:
         cnv_rows = f.readlines()
@@ -45,20 +46,15 @@ def process_cnv(
         attributes = {}
 
         # Scrape interpretation
-        interpretation = …
-…
-        ):
-            interpretation = map_interpretation(row["info"], log)
-…
-        if not interpretation:
-            interpretation = "unknown"
+        interpretation = "unknown"
+        for row in copy_number_variant_table_rows:
+            if (
+                row["gene"]["gene"] == gene_id_only
+                and row["gene"]["chr"] == chromosome
+                and row["gene"]["start"] <= int(start_position)
+                and row["gene"]["end"] >= int(end_position)
+            ):
+                interpretation = map_interpretation(row["info"], log)
 
         copy_number_variant_rows.append(
             f"{sample_id},{gene_id_only},{copy_number},{status},{attributes},{chromosome},{start_position},{end_position},{interpretation}\n"
ingestion/nextgen/util/process_manifest.py
CHANGED
@@ -173,45 +173,46 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
 def process_manifest(
     xml_in_file: str,
     source_file_id: str,
-    …
+    case_id: str,
     include_copy_number: bool,
     include_structural: bool,
     somatic_translocations: list[str],
-    …
+    hyperdiploidy_chromosomes: list[str] | None,
 ):
     test_text = extract_xml_text(xml_in_file)
     interpretation_text = extract_interpretation_text(xml_in_file)
     manifest = extract_test_data(test_text, interpretation_text)
     manifest.update(extract_patient_data(test_text))
 
-    …
+    file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"
+
     if hyperdiploidy_chromosomes:
         manifest["hyperdiploidyTrisomies"] = hyperdiploidy_chromosomes
     if somatic_translocations:
         manifest["somaticTranslocations"] = somatic_translocations
 
-    manifest["reportFile"] = f"…
+    manifest["reportFile"] = f"{file_prefix}.pdf"
     manifest["sourceFileId"] = source_file_id
     manifest["resources"] = []
 
     manifest["files"] = [
         {
-            "fileName": f"…
+            "fileName": f"{file_prefix}.modified.somatic.nrm.filtered.vcf.gz",
             "sequenceType": "somatic",
             "type": "shortVariant",
         },
         {
-            "fileName": f"…
+            "fileName": f"{file_prefix}.modified.germline.nrm.filtered.vcf.gz",
             "sequenceType": "germline",
             "type": "shortVariant",
         },
         {
-            "fileName": f"…
+            "fileName": f"{file_prefix}.somatic.updated.bam",
             "sequenceType": "somatic",
             "type": "read",
         },
         {
-            "fileName": f"…
+            "fileName": f"{file_prefix}.germline.updated.bam",
             "sequenceType": "germline",
             "type": "read",
         },
@@ -219,7 +220,7 @@ def process_manifest(
     if include_structural:
         manifest["files"].append(
             {
-                "fileName": f"…
+                "fileName": f"{file_prefix}.structural.csv",
                 "sequenceType": "somatic",
                 "type": "structuralVariant",
             },
@@ -227,7 +228,7 @@ def process_manifest(
     if include_copy_number:
         manifest["files"].append(
             {
-                "fileName": f"…
+                "fileName": f"{file_prefix}.copynumber.csv",
                 "sequenceType": "somatic",
                 "type": "copyNumberVariant",
             }
ingestion/nextgen/util/process_structural.py
CHANGED
@@ -3,12 +3,9 @@ import re
 from typing import TypedDict
 
 from ingestion.shared_util.coords_to_genes import coords_to_genes
-from ingestion.nextgen.util.alteration_table import …
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, StructuralVariantGene
 from ingestion.nextgen.util.interpretation import map_interpretation
-from ingestion.nextgen.util.nextgen_specific_genes import …
-    maybe_get_matching_gene_for_location,
-    nextgen_specific_genes,
-)
+from ingestion.nextgen.util.nextgen_specific_genes import maybe_get_nextgen_specific_gene
 from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
 
 
@@ -46,38 +43,33 @@ def is_del_dup_or_ins(variant: list[str]) -> bool:
     return any([x in variant[2] for x in ["MantaDEL", "MantaDUP", "MantaINS"]])
 
 
-def …
-    chromosome: str, start_position: str, end_position: str, log: Logger
-) -> str:
+def get_center_position(start_position: str, end_position: str) -> int:
     """
-…
-    If a variant falls within the start and end positions of one of those genes of interest, that gene will be used.
-    Otherwise, we fall back to the standard gene lookup.
+    Calculate the center position of a variant based on its start and end positions, useful for finding genes.
     """
-…
-    gene = maybe_get_matching_gene_for_location(chromosome, center_position)
-    if gene:
-        return gene
-
-    return coords_to_genes("GRCh38", chromosome, center_position, log)
+    return int((int(start_position) + int(end_position)) / 2)
 
 
 def process_structural(
-    …
+    structural_variant_in_file: str,
+    structural_variant_table_rows: list[AlterationTableRow[StructuralVariantGene]],
+    output_dir: str,
+    case_id: str,
+    log: Logger,
 ) -> tuple[str | None, list[str]]:
-…
-    )
+    structural_variant_path_name = f"{output_dir}/{case_id}.structural.csv"
+    sample_id = case_id
 
-…
-    sample_id = prefix
-…
-    with open_maybe_gzipped(sv_in_file, "rt") as f:
+    with open_maybe_gzipped(structural_variant_in_file, "rt") as f:
         variants = [line for line in f.readlines() if not line.startswith("#")]
 
     structural_variants: list[StructuralVariant] = []
+    formatted_translocations: set[str] = set()
+
     for variant in variants:
+        gene1: str | None = None
+        gene2: str | None = None
+
         working_variant = variant.strip().split("\t")
 
         chromosome1 = f"chr{working_variant[0]}"
@@ -95,7 +87,7 @@ def process_structural(
             effect = "insertion"
 
             # Get genes from coordinates using center point of start and end positions
-            gene1 = …
+            gene1 = None
             gene2 = "N/A"
 
         else:
@@ -107,32 +99,60 @@ def process_structural(
             end_position2 = alt[1]
             effect = "translocation"
 
-…
+            gene1 = maybe_get_nextgen_specific_gene(
+                chromosome1, get_center_position(start_position1, end_position1)
+            )
+            gene2 = maybe_get_nextgen_specific_gene(
+                chromosome2, get_center_position(start_position2, end_position2)
+            )
+
+            # Maybe add this variant to the formatted translocations list
+            if (gene1 == "MYC" or gene2 == "MYC") and gene1 != gene2:
+                formatted_translocations.add("t(MYC)")
+            elif gene1 and gene2:
+                # Remove the "chr" prefix and convert to int
+                chr1, chr2 = int(chromosome1[3:]), int(chromosome2[3:])
+                # Don't add translocations between the same chromosome
+                if chr1 == chr2:
+                    continue
+                # Ensure chromosomes are in ascending order
+                if chr1 > chr2:
+                    chr1, chr2 = chr2, chr1
+                formatted_translocations.add(f"t({chr1};{chr2})")
 
         # Scrape interpretation
         interpretation = "unknown"
-…
-            [f"{chromosome1}:{start_position1}", f"{chromosome2}:{start_position2}"]
-        )
+        for row in structural_variant_table_rows:
+            is_match = (
+                row["gene"]["chr1"] == chromosome1
+                and row["gene"]["chr2"] == chromosome2
+                and row["gene"]["pos1"] == int(start_position1)
+                and row["gene"]["pos2"] == int(start_position2)
+            )
+            if not is_match:
+                continue
 
-…
+            interpretation = map_interpretation(row["info"], log)
+            # Use the gene names from the alteration table but only if they are not already set
+            gene1 = gene1 if gene1 else row["gene"]["gene1"]
+            gene2 = gene2 if gene2 else row["gene"]["gene2"]
 
         # Hard-code
         sequence_type = "Somatic"
         in_frame = "Unknown"
         attributes: dict = {}
 
+        # If genes have not been populated from the nextgen specific genes or alteration
+        # table fall back to using the default gene finding method
+        if not gene1:
+            gene1 = coords_to_genes(
+                "GRCh38", chromosome1, get_center_position(start_position1, end_position1), log
+            )
+        if not gene2:
+            gene2 = coords_to_genes(
+                "GRCh38", chromosome2, get_center_position(start_position2, end_position2), log
+            )
+
         structural_variants.append(
             {
                 "sample_id": sample_id,
@@ -163,7 +183,7 @@ def process_structural(
             deduped_structural_variants.append(sv)
 
     if not deduped_structural_variants:
-        log.info(f"Ignoring empty structural variant file {…
+        log.info(f"Ignoring empty structural variant file {structural_variant_in_file}")
         return (None, [])
 
     log.info(f"Saving file to {structural_variant_path_name}")
@@ -174,28 +194,6 @@ def process_structural(
         for sv in deduped_structural_variants:
             f.write(structural_variant_to_csv_row(sv))
 
-    log.info("Finding structural variant translocations for genes of interest")
-    translocations = [sv for sv in deduped_structural_variants if sv["effect"] == "translocation"]
-    formatted_translocations: set[str] = set()
-    for translocation in translocations:
-        gene1, gene2 = translocation["gene1"], translocation["gene2"]
-        # MYC is a special case
-        if gene1 == "MYC" or gene2 == "MYC":
-            if gene1 == gene2:
-                continue
-            formatted_translocations.add("t(MYC)")
-            continue
-        if gene1 in nextgen_specific_genes and gene2 in nextgen_specific_genes:
-            chr1, chr2 = int(translocation["position1"][0][3:]), int(
-                translocation["position2"][0][3:]
-            )
-            if chr1 == chr2:
-                continue
-            # Ensure chromosomes are in ascending order
-            if chr1 > chr2:
-                chr1, chr2 = chr2, chr1
-            formatted_translocations.add(f"t({chr1};{chr2})")
-
     log.info(f"Found {len(formatted_translocations)} translocations for genes of interest")
 
     return structural_variant_path_name, list(formatted_translocations)
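Note: the translocation labels accumulated in formatted_translocations use cytogenetics-style notation. The helper below is not part of the package; it only restates the inline labeling rules shown above as a standalone function, and the example genes and chromosomes are illustrative.

    def format_translocation(gene1, gene2, chromosome1, chromosome2):
        # MYC involvement is reported as t(MYC), regardless of the partner chromosome
        if (gene1 == "MYC" or gene2 == "MYC") and gene1 != gene2:
            return "t(MYC)"
        # Otherwise report the two chromosome numbers in ascending order, e.g. t(14;16)
        if gene1 and gene2:
            chr1, chr2 = int(chromosome1[3:]), int(chromosome2[3:])
            if chr1 == chr2:
                return None  # intra-chromosomal events are not reported
            if chr1 > chr2:
                chr1, chr2 = chr2, chr1
            return f"t({chr1};{chr2})"
        return None

    format_translocation("IGH", "MAF", "chr14", "chr16")  # -> "t(14;16)"
    format_translocation("MYC", "IGH", "chr8", "chr14")   # -> "t(MYC)"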
ingestion/nextgen/util/process_vcf.py
CHANGED
@@ -4,7 +4,7 @@ import shutil
 from logging import Logger
 from typing import Literal
 
-from ingestion.nextgen.util.alteration_table import …
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
 
 SequenceType = Literal["somatic", "germline"]
 
@@ -76,14 +76,10 @@ def transform_vcf(
     headers: list,
     variants: list,
     sequence_type: SequenceType,
-    …
+    short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
     case_id: str,
     log: Logger,
 ) -> str:
-    short_variant_table = extract_variant_table(
-        xml_in_file=xml_in_file, variant_type="short", log=log
-    )
-
     log.info(f"Performing file transformations on {vcf_in_file}")
     approved_chr_list = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
     vcf_out: list[str] = []
@@ -135,13 +131,12 @@ def transform_vcf(
             working.calculate_af()
             working.prune_var()
 
-            if sequence_type == "somatic"…
+            if sequence_type == "somatic":
                 split_var[7] = add_vendsig_to_info(
                     working.pruned_info,
-                    …
+                    short_variant_table_rows,
                     split_var[0],
                     int(split_var[1]),
-                    log,
                 )
             else:
                 split_var[7] = f"{working.pruned_info};VENDSIG=Unknown"
@@ -160,14 +155,14 @@ def export_vcf(vcf_out: str, vcf_path: str, log: Logger):
 
 def process_vcf(
     vcf_in_file: str,
-    …
+    output_dir: str,
     case_id: str,
     sequence_type: SequenceType,
-    …
+    short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
     log: Logger,
 ):
     line_count = 0
-    vcf_path = f"{…
+    vcf_path = f"{output_dir}/{case_id}.modified.{sequence_type}.vcf.gz"
 
     headers = []
     variants = []
@@ -196,7 +191,13 @@ def process_vcf(
 
     else:
         vcf_out = transform_vcf(
-            vcf_in_file,
+            vcf_in_file,
+            headers,
+            variants,
+            sequence_type,
+            short_variant_table_rows,
+            case_id,
+            log,
         )
         export_vcf(vcf_out, vcf_path, log)
 
@@ -206,7 +207,7 @@ def process_vcf(
 def map_vendsig(vendsig: str) -> str:
     if vendsig in ["Pathogenic"]:
         return "VENDSIG=Pathogenic"
-    elif vendsig in ["Likely Pathogenic"]:
+    elif vendsig in ["Likely Pathogenic", "LikelyPathogenic"]:
        return "VENDSIG=Likely pathogenic"
     elif vendsig in ["VUS"]:
         return "VENDSIG=Uncertain significance"
@@ -214,26 +215,15 @@ def map_vendsig(vendsig: str) -> str:
         raise RuntimeError(f"Unable to map vendor significance: {vendsig}")
 
 
-def …
-…
-    pattern = r"^.*\((chr\d+|chrX|chrY):(\d+).*\).*$"
-    match = re.match(pattern, chr_pos)
-    if not match:
-        raise RuntimeError(f"Failed to extract chrom and pos from gene string")
-    chrom = match.group(1)
-    pos = int(match.group(2))
-    return (chrom, pos)
-
-
-def add_vendsig_to_info(info: str, short_var_table, chrom: str, pos: int, log: Logger) -> str:
+def add_vendsig_to_info(
+    info: str,
+    short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
+    chrom: str,
+    pos: int,
+) -> str:
     mapped_vendsig = None
-    for …
-        ref_chrom, ref_pos = …
+    for row in short_variant_table_rows:
+        ref_chrom, ref_pos = row["gene"]["chr"], row["gene"]["pos"]
 
         if ref_chrom == chrom:
             if ref_pos == pos or ref_pos + 1 == pos or ref_pos - 1 == pos:
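Note: the widened map_vendsig branch accepts the space-less spelling of the vendor significance, which appears to address the same missing-space quirk that get_short_variant_types() handles for variant type names. Illustrative calls (not taken from the package's tests):

    map_vendsig("Likely Pathogenic")  # -> "VENDSIG=Likely pathogenic"
    map_vendsig("LikelyPathogenic")   # -> "VENDSIG=Likely pathogenic"  (newly accepted spelling in this diff)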
{phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/METADATA
CHANGED
@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: phc-ingestion
-Version: 0.8.31
+Version: 0.8.33
 Summary: Functions for LifeOmic PHC genomic ingestions
 License: MIT
 Author-email: LifeOmic Development <development@lifeomic.com>
-Requires-Python: >=3.…
+Requires-Python: >=3.11
 Requires-Dist: jsonschema<5.0.0,>=4.16.0
 Requires-Dist: lifeomic-logging<0.4.0,>=0.3.2
 Requires-Dist: natsort==7.1.1
{phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/RECORD
CHANGED
@@ -29,16 +29,16 @@ ingestion/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 ingestion/generic/process.py,sha256=WJHV_-SKhrDZ3JS3fm9DVMoW3Zs2t50GiraSV3vlLHE,1548
 ingestion/generic/utils.py,sha256=1MEIru7uq38IjUdL8lcHqDH0oTki9uWrz1f2e-pmRoU,2814
 ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc,59
-ingestion/nextgen/process.py,sha256=…
-ingestion/nextgen/util/alteration_table.py,sha256=…
+ingestion/nextgen/process.py,sha256=kDCnU685v7aqJ3i4HpFdb7HqgHRSBKqtYPpuyN7qWmM,3976
+ingestion/nextgen/util/alteration_table.py,sha256=OqstLK6cgoNvRWy8bW6_iABaAn5ggCi1xBM8GOU6wYQ,6060
 ingestion/nextgen/util/interpretation.py,sha256=ozuzb0vozff34zfP6AdOiUmI8Q77hI02jve_nCPZHfE,297
-ingestion/nextgen/util/manifest_helpers.py,sha256=…
-ingestion/nextgen/util/nextgen_specific_genes.py,sha256=…
-ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=…
-ingestion/nextgen/util/process_cnv.py,sha256=…
-ingestion/nextgen/util/process_manifest.py,sha256=…
-ingestion/nextgen/util/process_structural.py,sha256=…
-ingestion/nextgen/util/process_vcf.py,sha256=…
+ingestion/nextgen/util/manifest_helpers.py,sha256=2xrpEtHbCb1Kea1wJeObkDfTiBklmffQt_o2hMgOSOE,1208
+ingestion/nextgen/util/nextgen_specific_genes.py,sha256=hgam7HVE324FwOf7G4Wk4cUArch9vHIjBZRUUyF3ukg,1206
+ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=mIaUihmGLbS38D4Gy_Qtf1lFAfW0A-LgAgQmsrEiI-M,3529
+ingestion/nextgen/util/process_cnv.py,sha256=MIirc8e0k6lsaTZkRM3U3L3IvbrcHmKQ4xlIu585514,2430
+ingestion/nextgen/util/process_manifest.py,sha256=EGYaTcub4M08mFTAh4CNHPRkP8_a5r4jMJaExm9Nkko,8423
+ingestion/nextgen/util/process_structural.py,sha256=FKjkK7BkIlocnLs8rFCjrMC39FCQnD0nQCeWvi7cRoA,7539
+ingestion/nextgen/util/process_vcf.py,sha256=SN0C13F45R_N5UaMaVSUDSCtIMmpHfaMTo7_5PkFkrM,8085
 ingestion/nextgen/util/types.py,sha256=SSzt5gv-kss1PR45eQUelypWrGI-dAfQMO3GSD-T-Wg,22
 ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7lXSXkCCD8,408290
 ingestion/resources/GRCh38_map.csv.gz,sha256=qriYO2_buCCb4T6WcuZ-pCwPxMsm0TL2OxAHvJ1cEfA,612373
@@ -54,6 +54,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
 ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
 ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
 ingestion/vcf_standardization/util/read_write.py,sha256=IQotJ27To1MoQcRstc5AbHZtUuJz5cqkkZiHsDNaBvI,2471
-phc_ingestion-0.8.…
-phc_ingestion-0.8.…
-phc_ingestion-0.8.…
+phc_ingestion-0.8.33.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
+phc_ingestion-0.8.33.dist-info/METADATA,sha256=CM6kTtndCIkqq55vXl_x2vehEZ_mL29fK7TblsCsz9E,552
+phc_ingestion-0.8.33.dist-info/RECORD,,
{phc_ingestion-0.8.31.dist-info → phc_ingestion-0.8.33.dist-info}/WHEEL
File without changes