phc-ingestion 0.8.32-py3-none-any.whl → 0.8.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ from lifeomic_logging import scoped_logger
  from typing import Any, TypedDict
  from ruamel.yaml import YAML

+ from ingestion.nextgen.util.alteration_table import extract_variant_table_rows_and_hyperdiploidy
  from ingestion.nextgen.util.pre_filter_somatic_vcf import pre_filter_somatic_vcf
  from ingestion.nextgen.util.process_cnv import process_cnv
  from ingestion.nextgen.util.process_manifest import process_manifest
@@ -36,54 +37,61 @@ def process(
  "projectId": project_id,
  "archiveFileId": source_file_id,
  "caseId": case_id,
- "ingestion_id": ingestion_id,
+ "ingestionId": ingestion_id,
  }
  with scoped_logger(__name__, log_context) as log:
+ (
+ short_variant_table_rows,
+ copy_number_variant_table_rows,
+ structural_variant_table_rows,
+ hyperdiploidy_chromosomes,
+ ) = extract_variant_table_rows_and_hyperdiploidy(vendor_files["xmlFile"], log)
  cnv_path_name = process_cnv(
- xml_in_file=vendor_files["xmlFile"],
- cnv_in_file=vendor_files["somaticCnvTxtFile"],
- root_path=local_output_dir,
- prefix=case_id,
- log=log,
+ vendor_files["somaticCnvTxtFile"],
+ copy_number_variant_table_rows,
+ local_output_dir,
+ case_id,
+ log,
  )
  structural_path_name, translocations = process_structural(
- xml_in_file=vendor_files["xmlFile"],
- sv_in_file=vendor_files["somaticSvVcfFile"],
- root_path=local_output_dir,
- prefix=case_id,
- log=log,
+ vendor_files["somaticSvVcfFile"],
+ structural_variant_table_rows,
+ local_output_dir,
+ case_id,
+ log,
  )
  manifest = process_manifest(
- xml_in_file=vendor_files["xmlFile"],
- source_file_id=source_file_id,
- prefix=case_id,
- include_copy_number=bool(cnv_path_name),
- include_structural=bool(structural_path_name),
- somatic_translocations=translocations,
- log=log,
+ vendor_files["xmlFile"],
+ source_file_id,
+ case_id,
+ bool(cnv_path_name),
+ bool(structural_path_name),
+ translocations,
+ hyperdiploidy_chromosomes,
  )
  pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
  vendor_files["somaticVcfFile"],
  vendor_files["somaticVcfSnvFile"],
  vendor_files["somaticVcfIndelFile"],
+ short_variant_table_rows,
  local_output_dir,
  log,
  )
  somatic_vcf_meta_data = process_vcf(
- vcf_in_file=pre_filtered_somatic_vcf_path,
- root_path=local_output_dir,
- case_id=case_id,
- sequence_type="somatic",
- xml_in_file=vendor_files["xmlFile"],
+ pre_filtered_somatic_vcf_path,
+ local_output_dir,
+ case_id,
+ "somatic",
+ short_variant_table_rows,
  log=log,
  )
  germline_vcf_meta_data = process_vcf(
- vcf_in_file=vendor_files["germlineVcfFile"],
- root_path=local_output_dir,
- case_id=case_id,
- sequence_type="germline",
- xml_in_file=vendor_files["xmlFile"],
- log=log,
+ vendor_files["germlineVcfFile"],
+ local_output_dir,
+ case_id,
+ "germline",
+ short_variant_table_rows,
+ log,
  )

  manifest_path_name = f"{local_output_dir}/{case_id}.ga4gh.genomics.yml"
@@ -1,10 +1,41 @@
  from logging import Logger
- import pandas as pd
  import re
- from typing import cast, Literal, TypedDict
+ from typing import TypedDict, Generic, TypeVar


- short_variant_types: list[str] = [
+ T = TypeVar("T")
+
+
+ class AlterationTableRow(Generic[T], TypedDict):
+ gene: T
+ type: str
+ description: str
+ vaf: str
+ info: str
+
+
+ class ShortVariantGene(TypedDict):
+ chr: str
+ pos: int
+
+
+ class CopyNumberVariantGene(TypedDict):
+ gene: str
+ chr: str
+ start: int
+ end: int
+
+
+ class StructuralVariantGene(TypedDict):
+ gene1: str
+ chr1: str
+ pos1: int
+ gene2: str
+ chr2: str
+ pos2: int
+
+
+ base_short_variant_types: list[str] = [
  "Missense",
  "Frameshift",
  "Stop gained",
@@ -20,12 +51,23 @@ short_variant_types: list[str] = [
  ]


+ def get_short_variant_types() -> list[str]:
+ # For multi-word short variant types, sometimes the spaces are not included
+ short_variant_types: list[str] = []
+ for short_variant_type in base_short_variant_types:
+ short_variant_types.append(short_variant_type)
+ if " " in short_variant_type:
+ short_variant_types.append(short_variant_type.replace(" ", ""))
+
+ return short_variant_types
+
+
  def extract_all_table_lines(xml_in_file: str) -> list[str]:
  with open(xml_in_file, "r") as f:
  xml_lines = f.readlines()

  in_range_trigger = False
- table_lines = []
+ table_lines: list[str] = []
  for line in xml_lines:
  if "Gene (Chr. Position, hg38)" in line:
  in_range_trigger = True
@@ -37,7 +79,7 @@ def extract_all_table_lines(xml_in_file: str) -> list[str]:
  return table_lines


- def extract_alteration_table(xml_in_file: str, log: Logger) -> pd.DataFrame:
+ def extract_alteration_table_rows(xml_in_file: str, log: Logger) -> list[AlterationTableRow[str]]:
  table_lines = extract_all_table_lines(xml_in_file)
  # Remove completely empty lines
  table_lines = [line for line in table_lines if line.strip() != ""]
@@ -49,90 +91,94 @@ def extract_alteration_table(xml_in_file: str, log: Logger) -> pd.DataFrame:
  if current_row:
  table_row_lines.append(current_row)
  current_row = []
- line = re.sub(r"<T.>", "", line)
- line = re.sub(r"</T.>", "", line)
- line = re.sub(r"<T./>", "", line)
- if line.strip() not in ["", "p."]:
- current_row.append(line.strip())
-
- gene_column = []
- type_column = []
- description_column = []
- vaf_column = []
- info_column = []
-
- for row in table_row_lines:
- gene_column.append(row[0])
- type_column.append(row[1])
- description_column.append(row[2])
- vaf_column.append(row[3])
- # Sometimes the info column is empty, so we need to check if it actually exists
- # So far, it seems like rows with empty "info" columns are generally not useful for us
- # and the data in them will not be used anywhere, so we just fill in an empty string
- if len(row) > 4:
- info_column.append(row[4])
- else:
- info_column.append("")
-
- # If the test is negative we will have a type column with only NA values
- # We return an empty df which we check for later when scraping annotations
- # Ignore the first row which is the header
- if set(type_column[1:]) == {"NA"}:
- log.info("Alteration table is empty")
- return pd.DataFrame()
-
- alteration_df = pd.DataFrame(
- {
- "gene": gene_column,
- "type": type_column,
- "description": description_column,
- "vaf": vaf_column,
- "info": info_column,
- }
- )
-
- return alteration_df
-
-
- def extract_variant_table(
- xml_in_file: str, variant_type: Literal["copy number", "structural", "short"], log: Logger
- ) -> pd.DataFrame:
- alteration_table = extract_alteration_table(xml_in_file, log)
- if alteration_table.empty:
- return alteration_table
-
- # Drop by variant type
- if variant_type == "copy number":
- variant_df = alteration_table[alteration_table["type"] == "CNV"]
- elif variant_type == "structural":
- variant_df = alteration_table[alteration_table["type"] == "Translocation"]
- elif variant_type == "short":
- variant_df = alteration_table[alteration_table["type"].isin(short_variant_types)]
-
- return variant_df
-
-
- class AlterationTableRow(TypedDict):
- gene: str
- type: str
- description: str
- vaf: str
- info: str
-
-
- def extract_hyperdiploidy_row(xml_in_file: str, log: Logger) -> None | AlterationTableRow:
- alteration_table = extract_alteration_table(xml_in_file, log)
- if alteration_table.empty:
- return None
-
- hyperdiploidy_df = alteration_table[alteration_table["type"] == "Hyperdiploidy"]
-
- if hyperdiploidy_df.empty:
- return None
- # We only expect one hyperdiploidy row. If we get more than 1, just fail the ingestion so we can investigate
- if hyperdiploidy_df.shape[0] > 1:
- raise ValueError("More than one hyperdiploidy row found")
-
- hyperdiploidy_row = cast(AlterationTableRow, hyperdiploidy_df.iloc[0].to_dict())
-
- return hyperdiploidy_row
+ line = re.sub(r"<\/?T.\/?>", "", line).strip()
+ if line and line != "p.":
+ current_row.append(line)
+
+ alteration_table_rows: list[AlterationTableRow[str]] = []
+
+ # Skip the first row which is the header
+ for row in table_row_lines[1:]:
+ # Sometimes the alteration table is "empty", in which case the `type` column will only contain "NA" values
+ if row[1] == "NA":
+ continue
+ alteration_table_rows.append(
+ {
+ "gene": row[0],
+ "type": row[1],
+ "description": row[2],
+ "vaf": row[3],
+ # Sometimes the info column is empty, so we need to check if it actually exists
+ # So far, it seems like rows with empty "info" columns are generally not useful for us
+ # and the data in them will not be used anywhere, so we just fill in an empty string
+ "info": row[4] if len(row) > 4 else "",
+ }
+ )
+
+ return alteration_table_rows
+
+
+ def parse_short_variant_gene(gene: str) -> ShortVariantGene:
+ pattern = r"^.*\((?P<chr>chr\d+|chrX|chrY):(?P<pos>\d+).*\).*$"
+ match = re.match(pattern, gene)
+ if not match:
+ raise RuntimeError(f"Failed to parse gene field for short variant")
+ return {"chr": match.group("chr"), "pos": int(match.group("pos"))}
+
+
+ def parse_copy_number_variant_gene(gene: str) -> CopyNumberVariantGene:
+ pattern = r"^(?P<gene>[A-Z1-9]*).*?\((?P<chr>chr\d+|chrX|chrY):(?P<start>\d+)_(?P<end>\d+)\).*$"
+ match = re.match(pattern, gene)
+ if not match:
+ raise RuntimeError(f"Failed to parse gene field for copy number variant")
+ return {
+ "gene": match.group("gene"),
+ "chr": match.group("chr"),
+ "start": int(match.group("start")),
+ "end": int(match.group("end")),
+ }
+
+
+ def parse_structural_variant_gene(gene: str) -> StructuralVariantGene:
+ pattern = r"^(?P<gene1>[A-Z1-9]*)(-|\/)(?P<gene2>[A-Z1-9]*).*\(.*(?P<chr1>chr\d+|chrX|chrY):(?P<pos1>\d+).*;.*(?P<chr2>chr\d+|chrX|chrY):(?P<pos2>\d+).*\).*$"
+ match = re.match(pattern, gene)
+ if not match:
+ raise RuntimeError(f"Failed to parse gene field for structural variant")
+ return {
+ "gene1": match.group("gene1"),
+ "chr1": match.group("chr1"),
+ "pos1": int(match.group("pos1")),
+ "gene2": match.group("gene2"),
+ "chr2": match.group("chr2"),
+ "pos2": int(match.group("pos2")),
+ }
+
+
+ def extract_variant_table_rows_and_hyperdiploidy(xml_in_file: str, log: Logger) -> tuple[
+ list[AlterationTableRow[ShortVariantGene]],
+ list[AlterationTableRow[CopyNumberVariantGene]],
+ list[AlterationTableRow[StructuralVariantGene]],
+ list[str] | None,
+ ]:
+ alteration_table_rows = extract_alteration_table_rows(xml_in_file, log)
+
+ short_variant_rows: list[AlterationTableRow[ShortVariantGene]] = []
+ copy_number_rows: list[AlterationTableRow[CopyNumberVariantGene]] = []
+ structural_variant_rows: list[AlterationTableRow[StructuralVariantGene]] = []
+ hyperdiploidy_chromosomes: list[str] | None = None
+
+ short_variant_types = get_short_variant_types()
+
+ for row in alteration_table_rows:
+ if row["type"] in short_variant_types:
+ short_variant_rows.append({**row, "gene": parse_short_variant_gene(row["gene"])})
+ elif row["type"] == "CNV":
+ copy_number_rows.append({**row, "gene": parse_copy_number_variant_gene(row["gene"])})
+ elif row["type"] == "Translocation":
+ structural_variant_rows.append(
+ {**row, "gene": parse_structural_variant_gene(row["gene"])}
+ )
+ elif row["type"] == "Hyperdiploidy":
+ hyperdiploidy_chromosomes = re.findall(r"\d+", row["gene"])
+
+ return short_variant_rows, copy_number_rows, structural_variant_rows, hyperdiploidy_chromosomes
@@ -2,6 +2,9 @@ from logging import Logger


  def map_interpretation(status: str, log: Logger):
+ """
+ Map interpretation for structural and copy number variants
+ """
  if status == "Pathogenic":
  return "Pathogenic"
  elif "VUS" in status:
@@ -9,3 +12,17 @@ def map_interpretation(status: str, log: Logger):
  else:
  log.error(f"Failed to resolve interpretation: {status}")
  return ""
+
+
+ def map_vendsig(vendsig: str) -> str:
+ """
+ Map vendor significance for short variants
+ """
+ if vendsig in ["Pathogenic"]:
+ return "VENDSIG=Pathogenic"
+ elif vendsig in ["Likely Pathogenic", "LikelyPathogenic"]:
+ return "VENDSIG=Likely pathogenic"
+ elif vendsig in ["VUS"]:
+ return "VENDSIG=Uncertain significance"
+ else:
+ raise RuntimeError(f"Unable to map vendor significance: {vendsig}")
@@ -1,5 +1,3 @@
- from ingestion.nextgen.util.alteration_table import extract_hyperdiploidy_row
-
  from logging import Logger
  import re

@@ -42,12 +40,3 @@ def parse_report_date(line: str) -> str:
  return parse_pattern(
  r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
  )
-
-
- def extract_hyperdiploidy_chromosomes(xml_in_file: str, log: Logger) -> list[str] | None:
- hyperdiploidy_row_dict = extract_hyperdiploidy_row(xml_in_file, log)
-
- if not hyperdiploidy_row_dict:
- return None
-
- return re.findall(r"\d+", hyperdiploidy_row_dict["gene"])
@@ -14,7 +14,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
  {"gene": "CCND3", "chr": "chr6", "start": 41920534, "end": 42562008},
  {"gene": "MYC", "chr": "chr8", "start": 125309416, "end": 129673293},
  {"gene": "CCND1", "chr": "chr11", "start": 69090733, "end": 69656860},
- {"gene": "IGH", "chr": "chr14", "start": 105578834, "end": 109902208},
+ {"gene": "IGH", "chr": "chr14", "start": 105325507, "end": 109902208},
  {"gene": "MAF", "chr": "chr16", "start": 78428398, "end": 79615096},
  {"gene": "MAFB", "chr": "chr20", "start": 39039005, "end": 40688948},
  {"gene": "IGL", "chr": "chr22", "start": 22012552, "end": 22965858},
@@ -22,7 +22,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
  nextgen_specific_genes: set[str] = {gene["gene"] for gene in nextgen_specific_genes_with_location}


- def maybe_get_matching_gene_for_location(chr: str, position: int) -> str | None:
+ def maybe_get_nextgen_specific_gene(chr: str, position: int) -> str | None:
  for gene in nextgen_specific_genes_with_location:
  if gene["chr"] == chr and gene["start"] <= position <= gene["end"]:
  return gene["gene"]
@@ -1,5 +1,6 @@
  from logging import Logger

+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
  from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped


@@ -14,26 +15,54 @@ def extract_filter_from_vcf_line(line: str) -> str:
  return split_line[6]


- def replace_filter_in_vcf_line(line: str, new_filter: str) -> str:
+ def replace_filter_in_line(line: str, new_filter: str) -> str:
  split_line = line.strip().split("\t")
  split_line[6] = new_filter
  return "\t".join(split_line) + "\n"


+ def is_line_in_alteration_table(
+ line: str, short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]]
+ ) -> bool:
+ """
+ Returns True if the line in the VCF appears in
+ the alteration table, False otherwise.
+
+ Matching in the alteration table is less strict than in the
+ VCF files; we only need to match chromosome and position.
+
+ Also position may differ by +1 or -1, as deletion and insertion positions
+ are represented differently in the VCF and the alteration table.
+ """
+ split_line = line.strip().split("\t")
+ chrom, pos = split_line[0], int(split_line[1])
+
+ for row in short_variant_table_rows:
+ ref_chrom, ref_pos = row["gene"]["chr"], row["gene"]["pos"]
+
+ if ref_chrom == chrom and (abs(ref_pos - pos) <= 1):
+ return True
+
+ return False
+
+
  def pre_filter_somatic_vcf(
  somatic_vcf_file: str,
  somatic_vcf_snv_file: str,
  somatic_vcf_indel_file: str,
+ short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
  working_dir: str,
  log: Logger,
  ) -> str:
  """
  Removes all variants from the `somatic_vcf_file` that are not
- also in the `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
+ also in the `somatic_vcf_snv_file`, the `somatic_vcf_indel_file`,
+ or the alteration table.

  Also updates the FILTER field in the `somatic_vcf_file` to match
  the FILTER field of the corresponding variant in the
  `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
+ For variants in the alteration table, the original FILTER field is kept.
  """
  log.info("Pre-filtering somatic VCF file")

@@ -48,20 +77,22 @@ def pre_filter_somatic_vcf(
  extract_filter_from_vcf_line(line)
  )

- log.info(f"Found {len(valid_variants_with_filters)} valid variants")
+ log.info(f"Found {len(valid_variants_with_filters)} valid variants in the SNV and INDEL files")

  output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
  with (
- open_maybe_gzipped(somatic_vcf_file, "rt") as f,
+ open_maybe_gzipped(somatic_vcf_file, "rt") as r,
  open_maybe_gzipped(output_vcf_path, "wt") as w,
  ):
- for line in f:
+ for line in r:
  if line.startswith("#"):
  w.write(line)
  else:
  key = build_variant_key_from_vcf_line(line)
  if key in valid_variants_with_filters:
- w.write(replace_filter_in_vcf_line(line, valid_variants_with_filters[key]))
+ w.write(replace_filter_in_line(line, valid_variants_with_filters[key]))
+ elif is_line_in_alteration_table(line, short_variant_table_rows):
+ w.write(line)

  log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
  return output_vcf_path
@@ -1,20 +1,21 @@
  import pandas as pd
  from logging import Logger

- from ingestion.nextgen.util.alteration_table import extract_variant_table
+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, CopyNumberVariantGene
  from ingestion.nextgen.util.interpretation import map_interpretation


  def process_cnv(
- xml_in_file: str, cnv_in_file: str, root_path: str, prefix: str, log: Logger
+ cnv_in_file: str,
+ copy_number_variant_table_rows: list[AlterationTableRow[CopyNumberVariantGene]],
+ output_dir: str,
+ case_id: str,
+ log: Logger,
  ) -> str | None:
- copy_number_path_name = f"{root_path}/{prefix}.copynumber.csv"
- sample_id = prefix
+ copy_number_path_name = f"{output_dir}/{case_id}.copynumber.csv"
+ sample_id = case_id

- copy_number_variant_rows = []
- copy_number_variant_table = extract_variant_table(
- xml_in_file=xml_in_file, variant_type="copy number", log=log
- )
+ copy_number_variant_rows: list[str] = []

  with open(cnv_in_file, "r") as f:
  cnv_rows = f.readlines()
@@ -45,20 +46,15 @@ def process_cnv(
  attributes = {}

  # Scrape interpretation
- interpretation = None
- if not copy_number_variant_table.empty:
- for index, row in copy_number_variant_table.iterrows():
- ref_gene = row["gene"].split(" ")[0]
- ref_coord = row["gene"].split(" ")[1]
-
- if (
- ref_gene == gene_id_only
- and ref_coord == f"({chromosome}:{start_position}_{end_position})"
- ):
- interpretation = map_interpretation(row["info"], log)
-
- if not interpretation:
- interpretation = "unknown"
+ interpretation = "unknown"
+ for row in copy_number_variant_table_rows:
+ if (
+ row["gene"]["gene"] == gene_id_only
+ and row["gene"]["chr"] == chromosome
+ and row["gene"]["start"] <= int(start_position)
+ and row["gene"]["end"] >= int(end_position)
+ ):
+ interpretation = map_interpretation(row["info"], log)

  copy_number_variant_rows.append(
  f"{sample_id},{gene_id_only},{copy_number},{status},{attributes},{chromosome},{start_position},{end_position},{interpretation}\n"
@@ -65,7 +65,7 @@ def get_cell_purity(interpretation_lines: list):
  return float(00.00)


- def extract_patient_data(patient_info_lines: list):
+ def extract_patient_data(patient_info_lines: list[str]):
  patient_data: dict = {}
  patient_data["patientInfo"] = {}

@@ -173,45 +173,46 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
  def process_manifest(
  xml_in_file: str,
  source_file_id: str,
- prefix: str,
+ case_id: str,
  include_copy_number: bool,
  include_structural: bool,
  somatic_translocations: list[str],
- log: Logger,
+ hyperdiploidy_chromosomes: list[str] | None,
  ):
  test_text = extract_xml_text(xml_in_file)
  interpretation_text = extract_interpretation_text(xml_in_file)
  manifest = extract_test_data(test_text, interpretation_text)
  manifest.update(extract_patient_data(test_text))

- hyperdiploidy_chromosomes = manifest_helpers.extract_hyperdiploidy_chromosomes(xml_in_file, log)
+ file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"
+
  if hyperdiploidy_chromosomes:
  manifest["hyperdiploidyTrisomies"] = hyperdiploidy_chromosomes
  if somatic_translocations:
  manifest["somaticTranslocations"] = somatic_translocations

- manifest["reportFile"] = f".lifeomic/nextgen/{prefix}/{prefix}.pdf"
+ manifest["reportFile"] = f"{file_prefix}.pdf"
  manifest["sourceFileId"] = source_file_id
  manifest["resources"] = []

  manifest["files"] = [
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
+ "fileName": f"{file_prefix}.modified.somatic.nrm.filtered.vcf.gz",
  "sequenceType": "somatic",
  "type": "shortVariant",
  },
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.germline.nrm.filtered.vcf.gz",
+ "fileName": f"{file_prefix}.modified.germline.nrm.filtered.vcf.gz",
  "sequenceType": "germline",
  "type": "shortVariant",
  },
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.somatic.updated.bam",
+ "fileName": f"{file_prefix}.somatic.updated.bam",
  "sequenceType": "somatic",
  "type": "read",
  },
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.germline.updated.bam",
+ "fileName": f"{file_prefix}.germline.updated.bam",
  "sequenceType": "germline",
  "type": "read",
  },
@@ -219,7 +220,7 @@ def process_manifest(
  if include_structural:
  manifest["files"].append(
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.structural.csv",
+ "fileName": f"{file_prefix}.structural.csv",
  "sequenceType": "somatic",
  "type": "structuralVariant",
  },
@@ -227,7 +228,7 @@ def process_manifest(
  if include_copy_number:
  manifest["files"].append(
  {
- "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.copynumber.csv",
+ "fileName": f"{file_prefix}.copynumber.csv",
  "sequenceType": "somatic",
  "type": "copyNumberVariant",
  }
@@ -3,12 +3,9 @@ import re
  from typing import TypedDict

  from ingestion.shared_util.coords_to_genes import coords_to_genes
- from ingestion.nextgen.util.alteration_table import extract_variant_table
+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, StructuralVariantGene
  from ingestion.nextgen.util.interpretation import map_interpretation
- from ingestion.nextgen.util.nextgen_specific_genes import (
- maybe_get_matching_gene_for_location,
- nextgen_specific_genes,
- )
+ from ingestion.nextgen.util.nextgen_specific_genes import maybe_get_nextgen_specific_gene
  from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped


@@ -46,38 +43,33 @@ def is_del_dup_or_ins(variant: list[str]) -> bool:
  return any([x in variant[2] for x in ["MantaDEL", "MantaDUP", "MantaINS"]])


- def get_gene_from_coords(
- chromosome: str, start_position: str, end_position: str, log: Logger
- ) -> str:
+ def get_center_position(start_position: str, end_position: str) -> int:
  """
- A number of genes of interest with specific start and end positions have been provided.
- If a variant falls within the start and end positions of one of those genes of interest, that gene will be used.
- Otherwise, we fall back to the standard gene lookup.
+ Calculate the center position of a variant based on its start and end positions, useful for finding genes.
  """
- center_position = int((int(start_position) + int(end_position)) / 2)
-
- gene = maybe_get_matching_gene_for_location(chromosome, center_position)
- if gene:
- return gene
-
- return coords_to_genes("GRCh38", chromosome, center_position, log)
+ return int((int(start_position) + int(end_position)) / 2)


  def process_structural(
- sv_in_file: str, xml_in_file, root_path: str, prefix: str, log: Logger
+ structural_variant_in_file: str,
+ structural_variant_table_rows: list[AlterationTableRow[StructuralVariantGene]],
+ output_dir: str,
+ case_id: str,
+ log: Logger,
  ) -> tuple[str | None, list[str]]:
- structural_variant_table = extract_variant_table(
- xml_in_file=xml_in_file, variant_type="structural", log=log
- )
+ structural_variant_path_name = f"{output_dir}/{case_id}.structural.csv"
+ sample_id = case_id

- structural_variant_path_name = f"{root_path}/{prefix}.structural.csv"
- sample_id = prefix
-
- with open_maybe_gzipped(sv_in_file, "rt") as f:
+ with open_maybe_gzipped(structural_variant_in_file, "rt") as f:
  variants = [line for line in f.readlines() if not line.startswith("#")]

  structural_variants: list[StructuralVariant] = []
+ formatted_translocations: set[str] = set()
+
  for variant in variants:
+ gene1: str | None = None
+ gene2: str | None = None
+
  working_variant = variant.strip().split("\t")

  chromosome1 = f"chr{working_variant[0]}"
@@ -95,7 +87,7 @@ def process_structural(
  effect = "insertion"

  # Get genes from coordinates using center point of start and end positions
- gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
+ gene1 = None
  gene2 = "N/A"

  else:
@@ -107,32 +99,60 @@ def process_structural(
  end_position2 = alt[1]
  effect = "translocation"

- # Get genes from coordinates using center point of start and end positions
- gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
- gene2 = get_gene_from_coords(chromosome2, start_position2, end_position2, log)
+ gene1 = maybe_get_nextgen_specific_gene(
+ chromosome1, get_center_position(start_position1, end_position1)
+ )
+ gene2 = maybe_get_nextgen_specific_gene(
+ chromosome2, get_center_position(start_position2, end_position2)
+ )
+
+ # Maybe add this variant to the formatted translocations list
+ if (gene1 == "MYC" or gene2 == "MYC") and gene1 != gene2:
+ formatted_translocations.add("t(MYC)")
+ elif gene1 and gene2:
+ # Remove the "chr" prefix and convert to int
+ chr1, chr2 = int(chromosome1[3:]), int(chromosome2[3:])
+ # Don't add translocations between the same chromosome
+ if chr1 == chr2:
+ continue
+ # Ensure chromosomes are in ascending order
+ if chr1 > chr2:
+ chr1, chr2 = chr2, chr1
+ formatted_translocations.add(f"t({chr1};{chr2})")

  # Scrape interpretation
  interpretation = "unknown"
- if not structural_variant_table.empty:
- for _, row in structural_variant_table.iterrows():
- pattern = r"^.*\(.*(chr\d+:\d+).*;.*(chr\d+:\d+).*\).*$"
- match = re.match(pattern, row["gene"])
- if not match:
- log.warn(f"Failed to parse gene field for structural variant")
- continue
- ref_coords = set(match.groups())
- variant_coords = set(
- [f"{chromosome1}:{start_position1}", f"{chromosome2}:{start_position2}"]
- )
+ for row in structural_variant_table_rows:
+ is_match = (
+ row["gene"]["chr1"] == chromosome1
+ and row["gene"]["chr2"] == chromosome2
+ and row["gene"]["pos1"] == int(start_position1)
+ and row["gene"]["pos2"] == int(start_position2)
+ )
+ if not is_match:
+ continue

- if ref_coords == variant_coords:
- interpretation = map_interpretation(row["info"], log)
+ interpretation = map_interpretation(row["info"], log)
+ # Use the gene names from the alteration table but only if they are not already set
+ gene1 = gene1 if gene1 else row["gene"]["gene1"]
+ gene2 = gene2 if gene2 else row["gene"]["gene2"]

  # Hard-code
  sequence_type = "Somatic"
  in_frame = "Unknown"
  attributes: dict = {}

+ # If genes have not been populated from the nextgen specific genes or alteration
+ # table fall back to using the default gene finding method
+ if not gene1:
+ gene1 = coords_to_genes(
+ "GRCh38", chromosome1, get_center_position(start_position1, end_position1), log
+ )
+ if not gene2:
+ gene2 = coords_to_genes(
+ "GRCh38", chromosome2, get_center_position(start_position2, end_position2), log
+ )
+
  structural_variants.append(
  {
  "sample_id": sample_id,
@@ -163,7 +183,7 @@ def process_structural(
  deduped_structural_variants.append(sv)

  if not deduped_structural_variants:
- log.info(f"Ignoring empty structural variant file {sv_in_file}")
+ log.info(f"Ignoring empty structural variant file {structural_variant_in_file}")
  return (None, [])

  log.info(f"Saving file to {structural_variant_path_name}")
@@ -174,28 +194,6 @@ def process_structural(
  for sv in deduped_structural_variants:
  f.write(structural_variant_to_csv_row(sv))

- log.info("Finding structural variant translocations for genes of interest")
- translocations = [sv for sv in deduped_structural_variants if sv["effect"] == "translocation"]
- formatted_translocations: set[str] = set()
- for translocation in translocations:
- gene1, gene2 = translocation["gene1"], translocation["gene2"]
- # MYC is a special case
- if gene1 == "MYC" or gene2 == "MYC":
- if gene1 == gene2:
- continue
- formatted_translocations.add("t(MYC)")
- continue
- if gene1 in nextgen_specific_genes and gene2 in nextgen_specific_genes:
- chr1, chr2 = int(translocation["position1"][0][3:]), int(
- translocation["position2"][0][3:]
- )
- if chr1 == chr2:
- continue
- # Ensure chromosomes are in ascending order
- if chr1 > chr2:
- chr1, chr2 = chr2, chr1
- formatted_translocations.add(f"t({chr1};{chr2})")
-
  log.info(f"Found {len(formatted_translocations)} translocations for genes of interest")

  return structural_variant_path_name, list(formatted_translocations)
@@ -4,7 +4,8 @@ import shutil
  from logging import Logger
  from typing import Literal

- from ingestion.nextgen.util.alteration_table import extract_variant_table
+ from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
+ from ingestion.nextgen.util.interpretation import map_vendsig

  SequenceType = Literal["somatic", "germline"]

@@ -76,14 +77,10 @@ def transform_vcf(
  headers: list,
  variants: list,
  sequence_type: SequenceType,
- xml_in_file: str,
+ short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
  case_id: str,
  log: Logger,
  ) -> str:
- short_variant_table = extract_variant_table(
- xml_in_file=xml_in_file, variant_type="short", log=log
- )
-
  log.info(f"Performing file transformations on {vcf_in_file}")
  approved_chr_list = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
  vcf_out: list[str] = []
@@ -135,13 +132,12 @@ def transform_vcf(
  working.calculate_af()
  working.prune_var()

- if sequence_type == "somatic" and not short_variant_table.empty:
+ if sequence_type == "somatic":
  split_var[7] = add_vendsig_to_info(
  working.pruned_info,
- short_variant_table,
+ short_variant_table_rows,
  split_var[0],
  int(split_var[1]),
- log,
  )
  else:
  split_var[7] = f"{working.pruned_info};VENDSIG=Unknown"
@@ -160,14 +156,14 @@ def export_vcf(vcf_out: str, vcf_path: str, log: Logger):

  def process_vcf(
  vcf_in_file: str,
- root_path: str,
+ output_dir: str,
  case_id: str,
  sequence_type: SequenceType,
- xml_in_file: str,
+ short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
  log: Logger,
  ):
  line_count = 0
- vcf_path = f"{root_path}/{case_id}.modified.{sequence_type}.vcf.gz"
+ vcf_path = f"{output_dir}/{case_id}.modified.{sequence_type}.vcf.gz"

  headers = []
  variants = []
@@ -196,44 +192,28 @@ def process_vcf(

  else:
  vcf_out = transform_vcf(
- vcf_in_file, headers, variants, sequence_type, xml_in_file, case_id, log
+ vcf_in_file,
+ headers,
+ variants,
+ sequence_type,
+ short_variant_table_rows,
+ case_id,
+ log,
  )
  export_vcf(vcf_out, vcf_path, log)

  return {"vcf_path_name": vcf_path, "vcf_line_count": line_count}


- def map_vendsig(vendsig: str) -> str:
- if vendsig in ["Pathogenic"]:
- return "VENDSIG=Pathogenic"
- elif vendsig in ["Likely Pathogenic", "LikelyPathogenic"]:
- return "VENDSIG=Likely pathogenic"
- elif vendsig in ["VUS"]:
- return "VENDSIG=Uncertain significance"
- else:
- raise RuntimeError(f"Unable to map vendor significance: {vendsig}")
-
-
- def extract_chrom_pos_from_gene_string(chr_pos: str, log: Logger) -> tuple[str, int]:
- """
- Parses chromosome and position from a gene string from the alteration table.
-
- Raises if no match found.
- """
-
- pattern = r"^.*\((chr\d+|chrX|chrY):(\d+).*\).*$"
- match = re.match(pattern, chr_pos)
- if not match:
- raise RuntimeError(f"Failed to extract chrom and pos from gene string")
- chrom = match.group(1)
- pos = int(match.group(2))
- return (chrom, pos)
-
-
- def add_vendsig_to_info(info: str, short_var_table, chrom: str, pos: int, log: Logger) -> str:
+ def add_vendsig_to_info(
+ info: str,
+ short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
+ chrom: str,
+ pos: int,
+ ) -> str:
  mapped_vendsig = None
- for index, row in short_var_table.iterrows():
- ref_chrom, ref_pos = extract_chrom_pos_from_gene_string(row["gene"], log)
+ for row in short_variant_table_rows:
+ ref_chrom, ref_pos = row["gene"]["chr"], row["gene"]["pos"]

  if ref_chrom == chrom:
  if ref_pos == pos or ref_pos + 1 == pos or ref_pos - 1 == pos:
@@ -18,7 +18,7 @@ class Variant:
  self.info = {x.split("=")[0]: x.split("=")[1] for x in fields[7].split(";") if "=" in x}
  self.frmt = fields[8].split(":")
  self.smpl = fields[9].split(":")
- self.ad_af_dp = {"AD": False, "AF": False, "DP": False}
+ self.ad_af_dp: dict[str, bool | str] = {"AD": False, "AF": False, "DP": False}

  def standardize_allele_frequency(self, log):
  # Detect if allele frequency is present either in the INFO or FORMAT/SAMPLE fields
@@ -131,19 +131,19 @@ class Variant:
  )
  return updated_variant

+ @classmethod
+ def check_formatting(cls, var: str):
+ # Loose formatting check, return as Variant class object
+ split_var = var.split("\t")
+ if len(split_var) < 8 or not split_var[1].isdigit():
+ raise RuntimeError(f"Variant contains incorrect number, or invalid fields: {var}")

- def check_formatting(var: str) -> Variant:
- # Loose formatting check, return as Variant class object
- split_var = var.split("\t")
- if len(split_var) < 8 or not split_var[1].isdigit():
- raise RuntimeError(f"Variant contains incorrect number, or invalid fields: {var}")
+ if len(split_var) == 8:
+ split_var.append(".") # Add placeholder for FORMAT
+ split_var.append(".") # Add placeholder for SAMPLE

- if len(split_var) == 8:
- split_var.append(".") # Add placeholder for FORMAT
- split_var.append(".") # Add placeholder for SAMPLE
+ elif len(split_var) == 9:
+ split_var.append(".") # Add placeholder for SAMPLE

- elif len(split_var) == 9:
- split_var.append(".") # Add placeholder for SAMPLE
-
- working_variant = Variant(split_var)
- return working_variant
+ working_variant = cls(split_var)
+ return working_variant
@@ -9,15 +9,15 @@ from ingestion.vcf_standardization.util.read_write import (
  read_headers,
  read_variants,
  )
- from ingestion.vcf_standardization.Variant import check_formatting
+ from ingestion.vcf_standardization.Variant import Variant


- def format_variant(variant: str, log: Logger, vendsig_dict: dict = None) -> Optional[str]:
+ def format_variant(variant: str, log: Logger, vendsig_dict: dict | None = None) -> Optional[str]:
  # Ignore structural variants
  if "SVTYPE" in variant:
  return None
  # Working variant
- wv = check_formatting(variant)
+ wv = Variant.check_formatting(variant)

  # Only process variants that aren't multiallelic
  if len(wv.alt.split(",")) == 1:
@@ -48,7 +48,7 @@ def standardize_vcf(
  out_path: str,
  case_id: str,
  log: Logger,
- vendsig_dict: dict = None,
+ vendsig_dict: dict | None = None,
  compression: bool = False,
  ) -> Optional[int]:
  check_vcf(infile, log)
@@ -1,10 +1,11 @@
  import gzip
+ from logging import Logger
  import re
  import os
  from typing import Iterator, Optional


- def check_vcf(infile, log):
+ def check_vcf(infile: str, log: Logger) -> None:
  log.info("Checking VCF file")
  # Check if file exists. Raise if it doesn't.
  if os.path.exists(infile) == False:
@@ -68,7 +69,7 @@ def write_vcf(
  compression: bool,
  line_count: int,
  log,
- ):
+ ) -> int:
  log.info(f"Writing standardized VCF to {outfile}")

  with gzip.open(outfile, "wt") if compression else open(outfile, "w") as w:
@@ -1,10 +1,10 @@
  Metadata-Version: 2.1
  Name: phc-ingestion
- Version: 0.8.32
+ Version: 0.8.34
  Summary: Functions for LifeOmic PHC genomic ingestions
  License: MIT
  Author-email: LifeOmic Development <development@lifeomic.com>
- Requires-Python: >=3.10
+ Requires-Python: >=3.11
  Requires-Dist: jsonschema<5.0.0,>=4.16.0
  Requires-Dist: lifeomic-logging<0.4.0,>=0.3.2
  Requires-Dist: natsort==7.1.1
@@ -29,16 +29,16 @@ ingestion/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
  ingestion/generic/process.py,sha256=WJHV_-SKhrDZ3JS3fm9DVMoW3Zs2t50GiraSV3vlLHE,1548
  ingestion/generic/utils.py,sha256=1MEIru7uq38IjUdL8lcHqDH0oTki9uWrz1f2e-pmRoU,2814
  ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc,59
- ingestion/nextgen/process.py,sha256=F0Ms8rTr_4boWPpE13D39C3ljFtyIVtw9XIIjCVI6f8,3849
- ingestion/nextgen/util/alteration_table.py,sha256=h3nqImVRGgMV62P5_8wZBbaD06lr7kJA9JOBqtW3fco,4263
- ingestion/nextgen/util/interpretation.py,sha256=ozuzb0vozff34zfP6AdOiUmI8Q77hI02jve_nCPZHfE,297
- ingestion/nextgen/util/manifest_helpers.py,sha256=PpSay-pe62jk735nom1tVD9nDE8-CxmzzCrgpBhgtjY,1571
- ingestion/nextgen/util/nextgen_specific_genes.py,sha256=II_E2AgAqv35u_ga25geRn6UHuZy_Uk9itfyu_HybFY,1211
- ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=K_gH4EnUXrKB22u_f8FqQVGrOS5LxXNsNO3VBn381eY,2301
- ingestion/nextgen/util/process_cnv.py,sha256=m-AhsXFlYw4LTzgJJaj5vXYbK5n3H7cImzBxD2To6M0,2598
- ingestion/nextgen/util/process_manifest.py,sha256=FOa-m78layb5TFTktaDHkHT9hAGUeH9ZPGeqgBncz64,8585
- ingestion/nextgen/util/process_structural.py,sha256=fUhoVGY5XHXPqLjC9bKD8JpXNDF9VbUQGl3Y3d7vG6E,7703
- ingestion/nextgen/util/process_vcf.py,sha256=H2Jd5CzQiTBOjk3w95LuPp2rN79bKHBrMRDjG-YNTXE,8452
+ ingestion/nextgen/process.py,sha256=kDCnU685v7aqJ3i4HpFdb7HqgHRSBKqtYPpuyN7qWmM,3976
+ ingestion/nextgen/util/alteration_table.py,sha256=OqstLK6cgoNvRWy8bW6_iABaAn5ggCi1xBM8GOU6wYQ,6060
+ ingestion/nextgen/util/interpretation.py,sha256=tQ3qzAUwBzwK1tQzH9ujZk_VCQ2wP8HzSZY0fImJ5Wo,818
+ ingestion/nextgen/util/manifest_helpers.py,sha256=2xrpEtHbCb1Kea1wJeObkDfTiBklmffQt_o2hMgOSOE,1208
+ ingestion/nextgen/util/nextgen_specific_genes.py,sha256=1jFcqvtYAlJ7eBwOBm1UC2TzAbjHjdlvPBUzxr1G8dY,1206
+ ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=mIaUihmGLbS38D4Gy_Qtf1lFAfW0A-LgAgQmsrEiI-M,3529
+ ingestion/nextgen/util/process_cnv.py,sha256=MIirc8e0k6lsaTZkRM3U3L3IvbrcHmKQ4xlIu585514,2430
+ ingestion/nextgen/util/process_manifest.py,sha256=EnV9I90vnanDvuoErbMfz6yAfjzM5LdhhUF4q5DJd8w,8428
+ ingestion/nextgen/util/process_structural.py,sha256=FKjkK7BkIlocnLs8rFCjrMC39FCQnD0nQCeWvi7cRoA,7539
+ ingestion/nextgen/util/process_vcf.py,sha256=ZZURSMnZhHDpFahzijZ4MvCfSWTPdIktzmnCKVVUbGs,7768
  ingestion/nextgen/util/types.py,sha256=SSzt5gv-kss1PR45eQUelypWrGI-dAfQMO3GSD-T-Wg,22
  ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7lXSXkCCD8,408290
  ingestion/resources/GRCh38_map.csv.gz,sha256=qriYO2_buCCb4T6WcuZ-pCwPxMsm0TL2OxAHvJ1cEfA,612373
@@ -47,13 +47,13 @@ ingestion/shared_util/coords_to_genes.py,sha256=vz9EfgFm3BS6pEPnslbEka8cJKlQZtHJ
  ingestion/shared_util/gene_to_coords.py,sha256=M-q5ateLSQ4fCF0uMk5TX2uBLRrcZzXqXEf05TPaLsU,876
  ingestion/shared_util/open_maybe_gzipped.py,sha256=FrOPJ4OgfpQGyT3f1Su1rFeuuYYu6QJ-nVIBIosbfhw,232
  ingestion/shared_util/types.py,sha256=u9AD2OrTQWMBtK_7VXHsD8Rv6HFs-7ZUItNl4KXdL7k,68
- ingestion/vcf_standardization/Variant.py,sha256=MBT8x25Ub1GRkTtnvPMV9SN7LUoF4Xkons-m3kYDkeY,5398
+ ingestion/vcf_standardization/Variant.py,sha256=aoSqT4XAECxCF0JZgv1YRmxuw20WGeWkwFTw0x6FmWc,5475
  ingestion/vcf_standardization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestion/vcf_standardization/standardize.py,sha256=PaRqQRSrnI79WFgNvbvnXFcvXHYqshOOxCXup5eMZ_M,2289
+ ingestion/vcf_standardization/standardize.py,sha256=zYzZxncq8USA1bUs26L-ByLPTnUlGyVvS3LJVGD19BM,2302
  ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
  ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
- ingestion/vcf_standardization/util/read_write.py,sha256=IQotJ27To1MoQcRstc5AbHZtUuJz5cqkkZiHsDNaBvI,2471
- phc_ingestion-0.8.32.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
- phc_ingestion-0.8.32.dist-info/METADATA,sha256=dOeze5ldcBfhTefz5dF0weoa71CgORwhYWFGl8JAEPg,552
- phc_ingestion-0.8.32.dist-info/RECORD,,
+ ingestion/vcf_standardization/util/read_write.py,sha256=x3Pf6Dq8tmolblbCS5CrNmrcHS3FGfqBSFpFgvFGC4g,2526
+ phc_ingestion-0.8.34.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
+ phc_ingestion-0.8.34.dist-info/METADATA,sha256=5afEPjKh3tGTH02AjkVcQ-AluTJ-JuEqSBiZkD7HHAU,552
+ phc_ingestion-0.8.34.dist-info/RECORD,,