phc-ingestion 0.8.32__tar.gz → 0.8.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/PKG-INFO +2 -2
  2. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/process.py +37 -29
  3. phc-ingestion-0.8.34/ingestion/nextgen/util/alteration_table.py +184 -0
  4. phc-ingestion-0.8.34/ingestion/nextgen/util/interpretation.py +28 -0
  5. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/manifest_helpers.py +0 -11
  6. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/nextgen_specific_genes.py +2 -2
  7. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/pre_filter_somatic_vcf.py +37 -6
  8. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_cnv.py +18 -22
  9. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_manifest.py +12 -11
  10. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_structural.py +64 -66
  11. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_vcf.py +23 -43
  12. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/Variant.py +14 -14
  13. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/standardize.py +4 -4
  14. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/util/read_write.py +3 -2
  15. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/pyproject.toml +2 -2
  16. phc-ingestion-0.8.32/ingestion/nextgen/util/alteration_table.py +0 -138
  17. phc-ingestion-0.8.32/ingestion/nextgen/util/interpretation.py +0 -11
  18. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/PYPI.md +0 -0
  19. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/__init__.py +0 -0
  20. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/__init__.py +0 -0
  21. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/process.py +0 -0
  22. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/__init__.py +0 -0
  23. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/cnv.py +0 -0
  24. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/detect_genome_ref.py +0 -0
  25. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/ga4gh.py +0 -0
  26. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/hla.py +0 -0
  27. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/ihc.py +0 -0
  28. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/interpretation.py +0 -0
  29. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/json.py +0 -0
  30. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/metadata.py +0 -0
  31. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/specimen_details.py +0 -0
  32. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/structural.py +0 -0
  33. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/tar.py +0 -0
  34. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/tests.py +0 -0
  35. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/tmb.py +0 -0
  36. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/tsv.py +0 -0
  37. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/caris/util/vcf.py +0 -0
  38. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/__init__.py +0 -0
  39. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/process.py +0 -0
  40. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/__init__.py +0 -0
  41. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/cnv.py +0 -0
  42. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/fnv.py +0 -0
  43. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/ga4gh.py +0 -0
  44. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/interpretation.py +0 -0
  45. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/foundation/util/vcf_etl.py +0 -0
  46. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/generic/__init__.py +0 -0
  47. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/generic/process.py +0 -0
  48. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/generic/utils.py +0 -0
  49. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/__init__.py +0 -0
  50. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/types.py +0 -0
  51. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/resources/GRCh37_map.csv.gz +0 -0
  52. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/resources/GRCh38_map.csv.gz +0 -0
  53. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/shared_util/__init__.py +0 -0
  54. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/shared_util/coords_to_genes.py +0 -0
  55. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/shared_util/gene_to_coords.py +0 -0
  56. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/shared_util/open_maybe_gzipped.py +0 -0
  57. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/shared_util/types.py +0 -0
  58. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/__init__.py +0 -0
  59. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/util/__init__.py +0 -0
  60. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/util/af_helpers.py +0 -0
  61. {phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/vcf_standardization/util/dp_helpers.py +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phc-ingestion
3
- Version: 0.8.32
3
+ Version: 0.8.34
4
4
  Summary: Functions for LifeOmic PHC genomic ingestions
5
5
  License: MIT
6
6
  Author-email: LifeOmic Development <development@lifeomic.com>
7
- Requires-Python: >=3.10
7
+ Requires-Python: >=3.11
8
8
  Description-Content-Type: text/markdown
9
9
 
10
10
  # phc-ingestion
@@ -2,6 +2,7 @@ from lifeomic_logging import scoped_logger
2
2
  from typing import Any, TypedDict
3
3
  from ruamel.yaml import YAML
4
4
 
5
+ from ingestion.nextgen.util.alteration_table import extract_variant_table_rows_and_hyperdiploidy
5
6
  from ingestion.nextgen.util.pre_filter_somatic_vcf import pre_filter_somatic_vcf
6
7
  from ingestion.nextgen.util.process_cnv import process_cnv
7
8
  from ingestion.nextgen.util.process_manifest import process_manifest
@@ -36,54 +37,61 @@ def process(
36
37
  "projectId": project_id,
37
38
  "archiveFileId": source_file_id,
38
39
  "caseId": case_id,
39
- "ingestion_id": ingestion_id,
40
+ "ingestionId": ingestion_id,
40
41
  }
41
42
  with scoped_logger(__name__, log_context) as log:
43
+ (
44
+ short_variant_table_rows,
45
+ copy_number_variant_table_rows,
46
+ structural_variant_table_rows,
47
+ hyperdiploidy_chromosomes,
48
+ ) = extract_variant_table_rows_and_hyperdiploidy(vendor_files["xmlFile"], log)
42
49
  cnv_path_name = process_cnv(
43
- xml_in_file=vendor_files["xmlFile"],
44
- cnv_in_file=vendor_files["somaticCnvTxtFile"],
45
- root_path=local_output_dir,
46
- prefix=case_id,
47
- log=log,
50
+ vendor_files["somaticCnvTxtFile"],
51
+ copy_number_variant_table_rows,
52
+ local_output_dir,
53
+ case_id,
54
+ log,
48
55
  )
49
56
  structural_path_name, translocations = process_structural(
50
- xml_in_file=vendor_files["xmlFile"],
51
- sv_in_file=vendor_files["somaticSvVcfFile"],
52
- root_path=local_output_dir,
53
- prefix=case_id,
54
- log=log,
57
+ vendor_files["somaticSvVcfFile"],
58
+ structural_variant_table_rows,
59
+ local_output_dir,
60
+ case_id,
61
+ log,
55
62
  )
56
63
  manifest = process_manifest(
57
- xml_in_file=vendor_files["xmlFile"],
58
- source_file_id=source_file_id,
59
- prefix=case_id,
60
- include_copy_number=bool(cnv_path_name),
61
- include_structural=bool(structural_path_name),
62
- somatic_translocations=translocations,
63
- log=log,
64
+ vendor_files["xmlFile"],
65
+ source_file_id,
66
+ case_id,
67
+ bool(cnv_path_name),
68
+ bool(structural_path_name),
69
+ translocations,
70
+ hyperdiploidy_chromosomes,
64
71
  )
65
72
  pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
66
73
  vendor_files["somaticVcfFile"],
67
74
  vendor_files["somaticVcfSnvFile"],
68
75
  vendor_files["somaticVcfIndelFile"],
76
+ short_variant_table_rows,
69
77
  local_output_dir,
70
78
  log,
71
79
  )
72
80
  somatic_vcf_meta_data = process_vcf(
73
- vcf_in_file=pre_filtered_somatic_vcf_path,
74
- root_path=local_output_dir,
75
- case_id=case_id,
76
- sequence_type="somatic",
77
- xml_in_file=vendor_files["xmlFile"],
81
+ pre_filtered_somatic_vcf_path,
82
+ local_output_dir,
83
+ case_id,
84
+ "somatic",
85
+ short_variant_table_rows,
78
86
  log=log,
79
87
  )
80
88
  germline_vcf_meta_data = process_vcf(
81
- vcf_in_file=vendor_files["germlineVcfFile"],
82
- root_path=local_output_dir,
83
- case_id=case_id,
84
- sequence_type="germline",
85
- xml_in_file=vendor_files["xmlFile"],
86
- log=log,
89
+ vendor_files["germlineVcfFile"],
90
+ local_output_dir,
91
+ case_id,
92
+ "germline",
93
+ short_variant_table_rows,
94
+ log,
87
95
  )
88
96
 
89
97
  manifest_path_name = f"{local_output_dir}/{case_id}.ga4gh.genomics.yml"
@@ -0,0 +1,184 @@
1
+ from logging import Logger
2
+ import re
3
+ from typing import TypedDict, Generic, TypeVar
4
+
5
+
6
+ T = TypeVar("T")
7
+
8
+
9
+ class AlterationTableRow(Generic[T], TypedDict):
10
+ gene: T
11
+ type: str
12
+ description: str
13
+ vaf: str
14
+ info: str
15
+
16
+
17
class ShortVariantGene(TypedDict):
    """Parsed ``gene`` field of a short-variant row: one chromosome and position."""

    # Chromosome name including the "chr" prefix.
    chr: str
    # Position on that chromosome.
    pos: int
20
+
21
+
22
class CopyNumberVariantGene(TypedDict):
    """Parsed ``gene`` field of a copy-number row: gene symbol plus a chromosome range."""

    # Gene symbol taken from the start of the field.
    gene: str
    # Chromosome name including the "chr" prefix.
    chr: str
    # First coordinate of the ``start_end`` range.
    start: int
    # Second coordinate of the ``start_end`` range.
    end: int
27
+
28
+
29
class StructuralVariantGene(TypedDict):
    """Parsed ``gene`` field of a structural-variant row: two gene/breakpoint pairs."""

    # Symbol, chromosome and position of the first partner.
    gene1: str
    chr1: str
    pos1: int
    # Symbol, chromosome and position of the second partner.
    gene2: str
    chr2: str
    pos2: int
36
+
37
+
38
# Canonical spellings of the alteration-table `type` values that denote short variants.
base_short_variant_types: list[str] = [
    "Missense",
    "Frameshift",
    "Stop gained",
    "Stop lost",
    "Inframe deletion",
    "Inframe insertion",
    "Inframe",
    "Splice site",
    "Splice region",
    "Nonsense",
    "Splice acceptor",
    "Splice donor",
]


def get_short_variant_types() -> list[str]:
    """
    Return every accepted spelling of the short variant types.

    Each base type is followed directly by its space-less form when it is
    a multi-word type, because the spaces are sometimes not included.
    """
    accepted: list[str] = []
    for name in base_short_variant_types:
        accepted.append(name)
        squashed = name.replace(" ", "")
        # Single-word types have no second spelling to add
        if squashed != name:
            accepted.append(squashed)

    return accepted
63
+
64
+
65
def extract_all_table_lines(xml_in_file: str) -> list[str]:
    """
    Return the raw lines of the alteration table found in the XML report.

    Collection starts at the first line containing the table header text
    "Gene (Chr. Position, hg38)" (that line is included) and stops at the
    first following line containing "</Table>" (that line is excluded).
    Returns an empty list when the header text never appears.
    """
    table_lines: list[str] = []
    in_table = False
    # Stream the file instead of loading it whole: everything after the end of
    # the table is irrelevant, so we stop reading as soon as we reach it.
    with open(xml_in_file, "r") as f:
        for line in f:
            if not in_table:
                if "Gene (Chr. Position, hg38)" not in line:
                    continue
                in_table = True
            if "</Table>" in line:
                break
            table_lines.append(line)

    return table_lines
80
+
81
+
82
def extract_alteration_table_rows(xml_in_file: str, log: Logger) -> list[AlterationTableRow[str]]:
    """
    Parse the alteration table of the XML report into one dict per data row.

    Lines are grouped into rows at each "</TR>" line, table tags are removed
    from every cell, and empty cells as well as the lone "p." cell are dropped.
    The first row (the header) is skipped, as are rows whose `type` is "NA".
    """
    tag_pattern = re.compile(r"<\/?T.\/?>")

    grouped_rows: list[list[str]] = []
    pending_cells: list[str] = []
    for raw_line in extract_all_table_lines(xml_in_file):
        stripped = raw_line.strip()
        # Completely empty lines carry no information
        if stripped == "":
            continue
        if stripped == "</TR>":
            # End of a row: flush whatever cells were gathered for it
            if pending_cells:
                grouped_rows.append(pending_cells)
                pending_cells = []
            continue
        cell_text = tag_pattern.sub("", raw_line).strip()
        if cell_text and cell_text != "p.":
            pending_cells.append(cell_text)

    alteration_table_rows: list[AlterationTableRow[str]] = []

    # The first row is the header, so start from the second one
    for cells in grouped_rows[1:]:
        # An "empty" alteration table only has "NA" values in the `type` column
        if cells[1] == "NA":
            continue
        # The info column is sometimes missing entirely. Such rows have not been
        # useful so far and their data is not used anywhere, so an empty string
        # stands in for the missing value.
        info = cells[4] if len(cells) > 4 else ""
        alteration_table_rows.append(
            {
                "gene": cells[0],
                "type": cells[1],
                "description": cells[2],
                "vaf": cells[3],
                "info": info,
            }
        )

    return alteration_table_rows
119
+
120
+
121
def parse_short_variant_gene(gene: str) -> ShortVariantGene:
    """
    Extract chromosome and position from the `gene` field of a short variant row.

    The field must contain a parenthesised "<chr>:<pos>" location, where the
    chromosome is "chr" followed by digits, "chrX" or "chrY"; anything else
    inside or around the parentheses is ignored.

    Raises:
        RuntimeError: if the field does not contain such a location.
    """
    pattern = r"^.*\((?P<chr>chr\d+|chrX|chrY):(?P<pos>\d+).*\).*$"
    match = re.match(pattern, gene)
    if not match:
        # Include the offending value so a failure can be traced to a table row
        raise RuntimeError(f"Failed to parse gene field for short variant: {gene!r}")
    return {"chr": match.group("chr"), "pos": int(match.group("pos"))}
127
+
128
+
129
def parse_copy_number_variant_gene(gene: str) -> CopyNumberVariantGene:
    """
    Extract gene symbol, chromosome and range from the `gene` field of a CNV row.

    The field must start with the gene symbol (upper-case letters and digits)
    and contain a parenthesised "<chr>:<start>_<end>" range, where the
    chromosome is "chr" followed by digits, "chrX" or "chrY".

    Raises:
        RuntimeError: if the field does not match that layout.
    """
    # The symbol class must include "0": with "1-9" a symbol such as CD70 was
    # silently cut short to CD7 and would then never match the CNV file.
    pattern = r"^(?P<gene>[A-Z0-9]*).*?\((?P<chr>chr\d+|chrX|chrY):(?P<start>\d+)_(?P<end>\d+)\).*$"
    match = re.match(pattern, gene)
    if not match:
        # Include the offending value so a failure can be traced to a table row
        raise RuntimeError(f"Failed to parse gene field for copy number variant: {gene!r}")
    return {
        "gene": match.group("gene"),
        "chr": match.group("chr"),
        "start": int(match.group("start")),
        "end": int(match.group("end")),
    }
140
+
141
+
142
def parse_structural_variant_gene(gene: str) -> StructuralVariantGene:
    """
    Extract both partners from the `gene` field of a structural variant row.

    The field must start with two gene symbols (upper-case letters and digits)
    joined by "-" or "/", followed by parentheses holding two "<chr>:<pos>"
    locations separated by ";". Chromosomes are "chr" followed by digits,
    "chrX" or "chrY".

    Raises:
        RuntimeError: if the field does not match that layout.
    """
    # The symbol classes must include "0": with "1-9" a first symbol such as
    # IL10 made the whole match fail, and a second symbol such as CD70 was
    # silently cut short to CD7.
    pattern = r"^(?P<gene1>[A-Z0-9]*)(-|\/)(?P<gene2>[A-Z0-9]*).*\(.*(?P<chr1>chr\d+|chrX|chrY):(?P<pos1>\d+).*;.*(?P<chr2>chr\d+|chrX|chrY):(?P<pos2>\d+).*\).*$"
    match = re.match(pattern, gene)
    if not match:
        # Include the offending value so a failure can be traced to a table row
        raise RuntimeError(f"Failed to parse gene field for structural variant: {gene!r}")
    return {
        "gene1": match.group("gene1"),
        "chr1": match.group("chr1"),
        "pos1": int(match.group("pos1")),
        "gene2": match.group("gene2"),
        "chr2": match.group("chr2"),
        "pos2": int(match.group("pos2")),
    }
155
+
156
+
157
def extract_variant_table_rows_and_hyperdiploidy(xml_in_file: str, log: Logger) -> tuple[
    list[AlterationTableRow[ShortVariantGene]],
    list[AlterationTableRow[CopyNumberVariantGene]],
    list[AlterationTableRow[StructuralVariantGene]],
    list[str] | None,
]:
    """
    Split the alteration table of the XML report by variant kind.

    Returns the short variant rows, the copy number rows, the structural
    variant rows (each with its `gene` field parsed) and, when a
    "Hyperdiploidy" row exists, the numbers found in its `gene` field
    (otherwise None). Rows of any other type are ignored.
    """
    known_short_types = get_short_variant_types()

    short_rows: list[AlterationTableRow[ShortVariantGene]] = []
    cnv_rows: list[AlterationTableRow[CopyNumberVariantGene]] = []
    sv_rows: list[AlterationTableRow[StructuralVariantGene]] = []
    trisomies: list[str] | None = None

    for entry in extract_alteration_table_rows(xml_in_file, log):
        kind = entry["type"]
        raw_gene = entry["gene"]

        if kind in known_short_types:
            short_rows.append({**entry, "gene": parse_short_variant_gene(raw_gene)})
            continue
        if kind == "CNV":
            cnv_rows.append({**entry, "gene": parse_copy_number_variant_gene(raw_gene)})
            continue
        if kind == "Translocation":
            sv_rows.append({**entry, "gene": parse_structural_variant_gene(raw_gene)})
            continue
        if kind == "Hyperdiploidy":
            trisomies = re.findall(r"\d+", raw_gene)

    return short_rows, cnv_rows, sv_rows, trisomies
@@ -0,0 +1,28 @@
1
+ from logging import Logger
2
+
3
+
4
def map_interpretation(status: str, log: Logger):
    """
    Translate a status for structural and copy number variants.

    "Pathogenic" is returned unchanged and any status containing "VUS"
    becomes "Uncertain significance". Every other status is logged as an
    error and yields an empty string.
    """
    if status == "Pathogenic":
        return "Pathogenic"
    if "VUS" in status:
        return "Uncertain significance"

    log.error(f"Failed to resolve interpretation: {status}")
    return ""
15
+
16
+
17
def map_vendsig(vendsig: str) -> str:
    """
    Translate a vendor significance for short variants into its VENDSIG entry.

    Raises a RuntimeError for any value that has no known translation.
    """
    known_values = {
        "Pathogenic": "VENDSIG=Pathogenic",
        "Likely Pathogenic": "VENDSIG=Likely pathogenic",
        "LikelyPathogenic": "VENDSIG=Likely pathogenic",
        "VUS": "VENDSIG=Uncertain significance",
    }
    mapped = known_values.get(vendsig)
    if mapped is None:
        raise RuntimeError(f"Unable to map vendor significance: {vendsig}")
    return mapped
@@ -1,5 +1,3 @@
1
- from ingestion.nextgen.util.alteration_table import extract_hyperdiploidy_row
2
-
3
1
  from logging import Logger
4
2
  import re
5
3
 
@@ -42,12 +40,3 @@ def parse_report_date(line: str) -> str:
42
40
  return parse_pattern(
43
41
  r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
44
42
  )
45
-
46
-
47
- def extract_hyperdiploidy_chromosomes(xml_in_file: str, log: Logger) -> list[str] | None:
48
- hyperdiploidy_row_dict = extract_hyperdiploidy_row(xml_in_file, log)
49
-
50
- if not hyperdiploidy_row_dict:
51
- return None
52
-
53
- return re.findall(r"\d+", hyperdiploidy_row_dict["gene"])
@@ -14,7 +14,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
14
14
  {"gene": "CCND3", "chr": "chr6", "start": 41920534, "end": 42562008},
15
15
  {"gene": "MYC", "chr": "chr8", "start": 125309416, "end": 129673293},
16
16
  {"gene": "CCND1", "chr": "chr11", "start": 69090733, "end": 69656860},
17
- {"gene": "IGH", "chr": "chr14", "start": 105578834, "end": 109902208},
17
+ {"gene": "IGH", "chr": "chr14", "start": 105325507, "end": 109902208},
18
18
  {"gene": "MAF", "chr": "chr16", "start": 78428398, "end": 79615096},
19
19
  {"gene": "MAFB", "chr": "chr20", "start": 39039005, "end": 40688948},
20
20
  {"gene": "IGL", "chr": "chr22", "start": 22012552, "end": 22965858},
@@ -22,7 +22,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
22
22
  nextgen_specific_genes: set[str] = {gene["gene"] for gene in nextgen_specific_genes_with_location}
23
23
 
24
24
 
25
- def maybe_get_matching_gene_for_location(chr: str, position: int) -> str | None:
25
+ def maybe_get_nextgen_specific_gene(chr: str, position: int) -> str | None:
26
26
  for gene in nextgen_specific_genes_with_location:
27
27
  if gene["chr"] == chr and gene["start"] <= position <= gene["end"]:
28
28
  return gene["gene"]
@@ -1,5 +1,6 @@
1
1
  from logging import Logger
2
2
 
3
+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
3
4
  from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
4
5
 
5
6
 
@@ -14,26 +15,54 @@ def extract_filter_from_vcf_line(line: str) -> str:
14
15
  return split_line[6]
15
16
 
16
17
 
17
- def replace_filter_in_vcf_line(line: str, new_filter: str) -> str:
18
def replace_filter_in_line(line: str, new_filter: str) -> str:
    """
    Return the tab-separated line with its seventh column set to `new_filter`.

    Surrounding whitespace is stripped first and the result always ends with
    a single newline.
    """
    columns = line.strip().split("\t")
    columns[6] = new_filter
    rebuilt = "\t".join(columns)
    return rebuilt + "\n"
21
22
 
22
23
 
24
def is_line_in_alteration_table(
    line: str, short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]]
) -> bool:
    """
    Tell whether a VCF line corresponds to a row of the alteration table.

    The alteration table is matched more loosely than the VCF files:
    only chromosome and position are compared.

    The position is allowed to be off by one in either direction, because
    deletions and insertions are positioned differently in the VCF and
    in the alteration table.
    """
    fields = line.strip().split("\t")
    chrom = fields[0]
    pos = int(fields[1])

    table_locations = (
        (row["gene"]["chr"], row["gene"]["pos"]) for row in short_variant_table_rows
    )
    return any(
        ref_chrom == chrom and abs(ref_pos - pos) <= 1 for ref_chrom, ref_pos in table_locations
    )
47
+
48
+
23
49
  def pre_filter_somatic_vcf(
24
50
  somatic_vcf_file: str,
25
51
  somatic_vcf_snv_file: str,
26
52
  somatic_vcf_indel_file: str,
53
+ short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
27
54
  working_dir: str,
28
55
  log: Logger,
29
56
  ) -> str:
30
57
  """
31
58
  Removes all variants from the `somatic_vcf_file` that are not
32
- also in the `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
59
+ also in the `somatic_vcf_snv_file`, the `somatic_vcf_indel_file`,
60
+ or the alteration table.
33
61
 
34
62
  Also updates the FILTER field in the `somatic_vcf_file` to match
35
63
  the FILTER field of the corresponding variant in the
36
64
  `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
65
+ For variants in the alteration table, the original FILTER field is kept.
37
66
  """
38
67
  log.info("Pre-filtering somatic VCF file")
39
68
 
@@ -48,20 +77,22 @@ def pre_filter_somatic_vcf(
48
77
  extract_filter_from_vcf_line(line)
49
78
  )
50
79
 
51
- log.info(f"Found {len(valid_variants_with_filters)} valid variants")
80
+ log.info(f"Found {len(valid_variants_with_filters)} valid variants in the SNV and INDEL files")
52
81
 
53
82
  output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
54
83
  with (
55
- open_maybe_gzipped(somatic_vcf_file, "rt") as f,
84
+ open_maybe_gzipped(somatic_vcf_file, "rt") as r,
56
85
  open_maybe_gzipped(output_vcf_path, "wt") as w,
57
86
  ):
58
- for line in f:
87
+ for line in r:
59
88
  if line.startswith("#"):
60
89
  w.write(line)
61
90
  else:
62
91
  key = build_variant_key_from_vcf_line(line)
63
92
  if key in valid_variants_with_filters:
64
- w.write(replace_filter_in_vcf_line(line, valid_variants_with_filters[key]))
93
+ w.write(replace_filter_in_line(line, valid_variants_with_filters[key]))
94
+ elif is_line_in_alteration_table(line, short_variant_table_rows):
95
+ w.write(line)
65
96
 
66
97
  log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
67
98
  return output_vcf_path
@@ -1,20 +1,21 @@
1
1
  import pandas as pd
2
2
  from logging import Logger
3
3
 
4
- from ingestion.nextgen.util.alteration_table import extract_variant_table
4
+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, CopyNumberVariantGene
5
5
  from ingestion.nextgen.util.interpretation import map_interpretation
6
6
 
7
7
 
8
8
  def process_cnv(
9
- xml_in_file: str, cnv_in_file: str, root_path: str, prefix: str, log: Logger
9
+ cnv_in_file: str,
10
+ copy_number_variant_table_rows: list[AlterationTableRow[CopyNumberVariantGene]],
11
+ output_dir: str,
12
+ case_id: str,
13
+ log: Logger,
10
14
  ) -> str | None:
11
- copy_number_path_name = f"{root_path}/{prefix}.copynumber.csv"
12
- sample_id = prefix
15
+ copy_number_path_name = f"{output_dir}/{case_id}.copynumber.csv"
16
+ sample_id = case_id
13
17
 
14
- copy_number_variant_rows = []
15
- copy_number_variant_table = extract_variant_table(
16
- xml_in_file=xml_in_file, variant_type="copy number", log=log
17
- )
18
+ copy_number_variant_rows: list[str] = []
18
19
 
19
20
  with open(cnv_in_file, "r") as f:
20
21
  cnv_rows = f.readlines()
@@ -45,20 +46,15 @@ def process_cnv(
45
46
  attributes = {}
46
47
 
47
48
  # Scrape interpretation
48
- interpretation = None
49
- if not copy_number_variant_table.empty:
50
- for index, row in copy_number_variant_table.iterrows():
51
- ref_gene = row["gene"].split(" ")[0]
52
- ref_coord = row["gene"].split(" ")[1]
53
-
54
- if (
55
- ref_gene == gene_id_only
56
- and ref_coord == f"({chromosome}:{start_position}_{end_position})"
57
- ):
58
- interpretation = map_interpretation(row["info"], log)
59
-
60
- if not interpretation:
61
- interpretation = "unknown"
49
+ interpretation = "unknown"
50
+ for row in copy_number_variant_table_rows:
51
+ if (
52
+ row["gene"]["gene"] == gene_id_only
53
+ and row["gene"]["chr"] == chromosome
54
+ and row["gene"]["start"] <= int(start_position)
55
+ and row["gene"]["end"] >= int(end_position)
56
+ ):
57
+ interpretation = map_interpretation(row["info"], log)
62
58
 
63
59
  copy_number_variant_rows.append(
64
60
  f"{sample_id},{gene_id_only},{copy_number},{status},{attributes},{chromosome},{start_position},{end_position},{interpretation}\n"
@@ -65,7 +65,7 @@ def get_cell_purity(interpretation_lines: list):
65
65
  return float(00.00)
66
66
 
67
67
 
68
- def extract_patient_data(patient_info_lines: list):
68
+ def extract_patient_data(patient_info_lines: list[str]):
69
69
  patient_data: dict = {}
70
70
  patient_data["patientInfo"] = {}
71
71
 
@@ -173,45 +173,46 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
173
173
  def process_manifest(
174
174
  xml_in_file: str,
175
175
  source_file_id: str,
176
- prefix: str,
176
+ case_id: str,
177
177
  include_copy_number: bool,
178
178
  include_structural: bool,
179
179
  somatic_translocations: list[str],
180
- log: Logger,
180
+ hyperdiploidy_chromosomes: list[str] | None,
181
181
  ):
182
182
  test_text = extract_xml_text(xml_in_file)
183
183
  interpretation_text = extract_interpretation_text(xml_in_file)
184
184
  manifest = extract_test_data(test_text, interpretation_text)
185
185
  manifest.update(extract_patient_data(test_text))
186
186
 
187
- hyperdiploidy_chromosomes = manifest_helpers.extract_hyperdiploidy_chromosomes(xml_in_file, log)
187
+ file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"
188
+
188
189
  if hyperdiploidy_chromosomes:
189
190
  manifest["hyperdiploidyTrisomies"] = hyperdiploidy_chromosomes
190
191
  if somatic_translocations:
191
192
  manifest["somaticTranslocations"] = somatic_translocations
192
193
 
193
- manifest["reportFile"] = f".lifeomic/nextgen/{prefix}/{prefix}.pdf"
194
+ manifest["reportFile"] = f"{file_prefix}.pdf"
194
195
  manifest["sourceFileId"] = source_file_id
195
196
  manifest["resources"] = []
196
197
 
197
198
  manifest["files"] = [
198
199
  {
199
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
200
+ "fileName": f"{file_prefix}.modified.somatic.nrm.filtered.vcf.gz",
200
201
  "sequenceType": "somatic",
201
202
  "type": "shortVariant",
202
203
  },
203
204
  {
204
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.germline.nrm.filtered.vcf.gz",
205
+ "fileName": f"{file_prefix}.modified.germline.nrm.filtered.vcf.gz",
205
206
  "sequenceType": "germline",
206
207
  "type": "shortVariant",
207
208
  },
208
209
  {
209
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.somatic.updated.bam",
210
+ "fileName": f"{file_prefix}.somatic.updated.bam",
210
211
  "sequenceType": "somatic",
211
212
  "type": "read",
212
213
  },
213
214
  {
214
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.germline.updated.bam",
215
+ "fileName": f"{file_prefix}.germline.updated.bam",
215
216
  "sequenceType": "germline",
216
217
  "type": "read",
217
218
  },
@@ -219,7 +220,7 @@ def process_manifest(
219
220
  if include_structural:
220
221
  manifest["files"].append(
221
222
  {
222
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.structural.csv",
223
+ "fileName": f"{file_prefix}.structural.csv",
223
224
  "sequenceType": "somatic",
224
225
  "type": "structuralVariant",
225
226
  },
@@ -227,7 +228,7 @@ def process_manifest(
227
228
  if include_copy_number:
228
229
  manifest["files"].append(
229
230
  {
230
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.copynumber.csv",
231
+ "fileName": f"{file_prefix}.copynumber.csv",
231
232
  "sequenceType": "somatic",
232
233
  "type": "copyNumberVariant",
233
234
  }