phc-ingestion 0.8.25__py3-none-any.whl → 0.8.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ from typing import TypedDict
2
+
3
+
4
class GeneWithLocation(TypedDict):
    """A gene symbol paired with the chromosome interval assigned to it."""

    # Gene symbol returned when a position matches this entry.
    gene: str
    # Chromosome name; the lookup compares it to the caller's value with `==`.
    chr: str
    # Lower bound of the interval (the lookup treats it as inclusive).
    start: int
    # Upper bound of the interval (the lookup treats it as inclusive).
    end: int
9
+
10
+
11
# Genes of interest with fixed coordinate ranges. Entries are scanned in list
# order by `maybe_get_matching_gene_for_location`, which returns the first hit.
# NOTE(review): coordinates are presumably GRCh38 — confirm against the data source.
nextgen_specific_genes: list[GeneWithLocation] = [
    GeneWithLocation(gene=symbol, chr=chromosome, start=first, end=last)
    for symbol, chromosome, first, last in (
        ("IGK", "chr2", 88852034, 90258119),
        ("NSD2", "chr4", 1792518, 1940193),
        ("CCND3", "chr6", 41920534, 42562008),
        ("MYC", "chr8", 125309416, 129673293),
        ("CCND1", "chr11", 69090733, 69656860),
        ("IGH", "chr14", 105578834, 109902208),
        ("MAF", "chr16", 78428398, 79615096),
        ("MAFB", "chr20", 39039005, 40688948),
        ("IGL", "chr22", 22012552, 22965858),
    )
]
22
+
23
+
24
def maybe_get_matching_gene_for_location(chr: str, position: int) -> str | None:
    """
    Finds the gene of interest whose interval contains `position` on `chr`.

    Entries of `nextgen_specific_genes` are checked in list order; the first
    entry with an equal chromosome name and `start <= position <= end` wins.
    Returns that entry's gene symbol, or None when nothing matches.
    """
    return next(
        (
            entry["gene"]
            for entry in nextgen_specific_genes
            if entry["chr"] == chr and entry["start"] <= position <= entry["end"]
        ),
        None,
    )
@@ -9,6 +9,17 @@ def build_variant_key_from_vcf_line(line: str) -> str:
9
9
  return f"{chrom}:{pos}:{ref}:{alt}"
10
10
 
11
11
 
12
def extract_filter_from_vcf_line(line: str) -> str:
    """
    Returns the FILTER value (the 7th tab-separated column) of a VCF data line.

    Only the trailing line terminator is removed before splitting, so an empty
    FILTER column at the end of the line is returned as "" rather than being
    swallowed together with its tab delimiter.

    Raises IndexError if the line has fewer than 7 columns.
    """
    # Columns after FILTER are never inspected, so stop splitting once it is isolated.
    columns = line.rstrip("\r\n").split("\t", 7)
    return columns[6]
15
+
16
+
17
def replace_filter_in_vcf_line(line: str, new_filter: str) -> str:
    """
    Returns `line` with its FILTER value (the 7th tab-separated column)
    replaced by `new_filter`, terminated by a single "\\n".

    Only the trailing line terminator is removed before splitting, so every
    other column — including empty trailing columns — is written back
    unchanged.

    Raises IndexError if the line has fewer than 7 columns.
    """
    # Everything after FILTER stays in one untouched chunk; only 7 splits are needed.
    columns = line.rstrip("\r\n").split("\t", 7)
    columns[6] = new_filter
    return "\t".join(columns) + "\n"
21
+
22
+
12
23
  def pre_filter_somatic_vcf(
13
24
  somatic_vcf_file: str,
14
25
  somatic_vcf_snv_file: str,
@@ -19,22 +30,25 @@ def pre_filter_somatic_vcf(
19
30
  """
20
31
  Removes all variants from the `somatic_vcf_file` that are not
21
32
  also in the `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
33
+
34
+ Also updates the FILTER field in the `somatic_vcf_file` to match
35
+ the FILTER field of the corresponding variant in the
36
+ `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
22
37
  """
23
38
  log.info("Pre-filtering somatic VCF file")
24
39
 
25
- valid_variant_keys = set()
26
- with open_maybe_gzipped(somatic_vcf_snv_file, "rt") as f:
27
- for line in f:
28
- if line.startswith("#"):
29
- continue
30
- valid_variant_keys.add(build_variant_key_from_vcf_line(line))
31
- with open_maybe_gzipped(somatic_vcf_indel_file, "rt") as f:
32
- for line in f:
33
- if line.startswith("#"):
34
- continue
35
- valid_variant_keys.add(build_variant_key_from_vcf_line(line))
40
+ valid_variants_with_filters: dict[str, str] = {}
41
+
42
+ for file in [somatic_vcf_snv_file, somatic_vcf_indel_file]:
43
+ with open_maybe_gzipped(file, "rt") as f:
44
+ for line in f:
45
+ if line.startswith("#"):
46
+ continue
47
+ valid_variants_with_filters[build_variant_key_from_vcf_line(line)] = (
48
+ extract_filter_from_vcf_line(line)
49
+ )
36
50
 
37
- log.info(f"Found {len(valid_variant_keys)} valid variants")
51
+ log.info(f"Found {len(valid_variants_with_filters)} valid variants")
38
52
 
39
53
  output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
40
54
  with (
@@ -45,8 +59,9 @@ def pre_filter_somatic_vcf(
45
59
  if line.startswith("#"):
46
60
  w.write(line)
47
61
  else:
48
- if build_variant_key_from_vcf_line(line) in valid_variant_keys:
49
- w.write(line)
62
+ key = build_variant_key_from_vcf_line(line)
63
+ if key in valid_variants_with_filters:
64
+ w.write(replace_filter_in_vcf_line(line, valid_variants_with_filters[key]))
50
65
 
51
66
  log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
52
67
  return output_vcf_path
@@ -5,6 +5,7 @@ from typing import TypedDict
5
5
  from ingestion.shared_util.coords_to_genes import coords_to_genes
6
6
  from ingestion.nextgen.util.alteration_table import extract_variant_table
7
7
  from ingestion.nextgen.util.interpretation import map_interpretation
8
+ from ingestion.nextgen.util.nextgen_specific_genes import maybe_get_matching_gene_for_location
8
9
  from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
9
10
 
10
11
 
@@ -42,6 +43,23 @@ def is_del_dup_or_ins(variant: list[str]) -> bool:
42
43
  return any([x in variant[2] for x in ["MantaDEL", "MantaDUP", "MantaINS"]])
43
44
 
44
45
 
46
def get_gene_from_coords(
    chromosome: str, start_position: str, end_position: str, log: Logger
) -> str:
    """
    Resolves the gene for a coordinate range on a chromosome.

    A number of genes of interest with specific start and end positions have been provided.
    If the center point of the range falls within one of those genes of interest, that gene is used.
    Otherwise, we fall back to the standard gene lookup at the same center point.

    `start_position` and `end_position` must be integer strings; `int()` raises
    ValueError for anything else.
    """
    # Integer floor division keeps the midpoint exact and avoids a float round-trip.
    center_position = (int(start_position) + int(end_position)) // 2

    gene = maybe_get_matching_gene_for_location(chromosome, center_position)
    if gene:
        return gene

    return coords_to_genes("GRCh38", chromosome, center_position, log)
61
+
62
+
45
63
  def process_structural(
46
64
  sv_in_file: str, xml_in_file, root_path: str, prefix: str, log: Logger
47
65
  ) -> str | None:
@@ -74,9 +92,7 @@ def process_structural(
74
92
  effect = "insertion"
75
93
 
76
94
  # Get genes from coordinates using center point of start and end positions
77
- gene1 = coords_to_genes(
78
- "GRCh38", chromosome1, int((int(start_position1) + int(end_position1)) / 2), log
79
- )
95
+ gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
80
96
  gene2 = "N/A"
81
97
 
82
98
  else:
@@ -89,12 +105,8 @@ def process_structural(
89
105
  effect = "translocation"
90
106
 
91
107
  # Get genes from coordinates using center point of start and end positions
92
- gene1 = coords_to_genes(
93
- "GRCh38", chromosome1, int((int(start_position1) + int(end_position1)) / 2), log
94
- )
95
- gene2 = coords_to_genes(
96
- "GRCh38", chromosome2, int((int(start_position2) + int(end_position2)) / 2), log
97
- )
108
+ gene1 = get_gene_from_coords(chromosome1, start_position1, end_position1, log)
109
+ gene2 = get_gene_from_coords(chromosome2, start_position2, end_position2, log)
98
110
 
99
111
  # Scrape interpretation
100
112
  interpretation = "unknown"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phc-ingestion
3
- Version: 0.8.25
3
+ Version: 0.8.27
4
4
  Summary: Functions for LifeOmic PHC genomic ingestions
5
5
  License: MIT
6
6
  Author-email: LifeOmic Development <development@lifeomic.com>
@@ -33,10 +33,11 @@ ingestion/nextgen/process.py,sha256=GKiQ2dCxrR7tBD8TSP6Wk-TyoX3xBGip5zfpeT2buiQ,
33
33
  ingestion/nextgen/util/alteration_table.py,sha256=KwpJCQv_rVsL30jkzgZn0bKdd205fjVodYBNTcK3D1s,4220
34
34
  ingestion/nextgen/util/interpretation.py,sha256=ozuzb0vozff34zfP6AdOiUmI8Q77hI02jve_nCPZHfE,297
35
35
  ingestion/nextgen/util/manifest_helpers.py,sha256=PpSay-pe62jk735nom1tVD9nDE8-CxmzzCrgpBhgtjY,1571
36
- ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=beAtOpznVtUOWKCUNNv1UukNBSa0LKmhXsJ_3K5xk2E,1759
36
+ ingestion/nextgen/util/nextgen_specific_genes.py,sha256=KcZ9HzUUobsLFM7z7oaXdnpVnY5-niYR-VSMKY985Po,1084
37
+ ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=K_gH4EnUXrKB22u_f8FqQVGrOS5LxXNsNO3VBn381eY,2301
37
38
  ingestion/nextgen/util/process_cnv.py,sha256=m-AhsXFlYw4LTzgJJaj5vXYbK5n3H7cImzBxD2To6M0,2598
38
39
  ingestion/nextgen/util/process_manifest.py,sha256=TAbCHwN_2m08jySn1J4qEd9Nrxjw9CrsspwFWack0V4,8448
39
- ingestion/nextgen/util/process_structural.py,sha256=BXhwbRtFLTZsDi4ioSna1qmMtxVcsB0R-xNIvymm5Vw,5947
40
+ ingestion/nextgen/util/process_structural.py,sha256=9xdJnSYtIT86lbhEIsPop39NGMwLHuqQGqlcQttlt64,6520
40
41
  ingestion/nextgen/util/process_vcf.py,sha256=TvyV5wyXXSpFy2lc6h3ljqVQ5VeDPGxihHl2_K6LalQ,8432
41
42
  ingestion/nextgen/util/types.py,sha256=SSzt5gv-kss1PR45eQUelypWrGI-dAfQMO3GSD-T-Wg,22
42
43
  ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7lXSXkCCD8,408290
@@ -53,6 +54,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
53
54
  ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
54
55
  ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
55
56
  ingestion/vcf_standardization/util/read_write.py,sha256=IQotJ27To1MoQcRstc5AbHZtUuJz5cqkkZiHsDNaBvI,2471
56
- phc_ingestion-0.8.25.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
57
- phc_ingestion-0.8.25.dist-info/METADATA,sha256=AU2E4U0eLh5aXCGiCmVsVloyruqmkF_oNdkv7rmAfBQ,552
58
- phc_ingestion-0.8.25.dist-info/RECORD,,
57
+ phc_ingestion-0.8.27.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
58
+ phc_ingestion-0.8.27.dist-info/METADATA,sha256=vNzI7puyTHjuXFU7CBTQ4tUOiXxHF3aYUWAPgnFnA0Y,552
59
+ phc_ingestion-0.8.27.dist-info/RECORD,,