PyPI - phc-ingestion - Versions diffs - 0.8.32__tar.gz → 0.8.34__tar.gz - Mend

phc-ingestion 0.8.32__tar.gz → 0.8.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: phc-ingestion
-Version: 0.8.32
+Version: 0.8.34
 Summary: Functions for LifeOmic PHC genomic ingestions
 License: MIT
 Author-email: LifeOmic Development <development@lifeomic.com>
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 # phc-ingestion

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/process.py RENAMED Viewed

@@ -2,6 +2,7 @@ from lifeomic_logging import scoped_logger
 from typing import Any, TypedDict
 from ruamel.yaml import YAML
+from ingestion.nextgen.util.alteration_table import extract_variant_table_rows_and_hyperdiploidy
 from ingestion.nextgen.util.pre_filter_somatic_vcf import pre_filter_somatic_vcf
 from ingestion.nextgen.util.process_cnv import process_cnv
 from ingestion.nextgen.util.process_manifest import process_manifest
@@ -36,54 +37,61 @@ def process(
         "projectId": project_id,
         "archiveFileId": source_file_id,
         "caseId": case_id,
-        "ingestion_id": ingestion_id,
+        "ingestionId": ingestion_id,
     }
     with scoped_logger(__name__, log_context) as log:
+        (
+            short_variant_table_rows,
+            copy_number_variant_table_rows,
+            structural_variant_table_rows,
+            hyperdiploidy_chromosomes,
+        ) = extract_variant_table_rows_and_hyperdiploidy(vendor_files["xmlFile"], log)
         cnv_path_name = process_cnv(
-            xml_in_file=vendor_files["xmlFile"],
-            cnv_in_file=vendor_files["somaticCnvTxtFile"],
-            root_path=local_output_dir,
-            prefix=case_id,
-            log=log,
+            vendor_files["somaticCnvTxtFile"],
+            copy_number_variant_table_rows,
+            local_output_dir,
+            case_id,
+            log,
         )
         structural_path_name, translocations = process_structural(
-            xml_in_file=vendor_files["xmlFile"],
-            sv_in_file=vendor_files["somaticSvVcfFile"],
-            root_path=local_output_dir,
-            prefix=case_id,
-            log=log,
+            vendor_files["somaticSvVcfFile"],
+            structural_variant_table_rows,
+            local_output_dir,
+            case_id,
+            log,
         )
         manifest = process_manifest(
-            xml_in_file=vendor_files["xmlFile"],
-            source_file_id=source_file_id,
-            prefix=case_id,
-            include_copy_number=bool(cnv_path_name),
-            include_structural=bool(structural_path_name),
-            somatic_translocations=translocations,
-            log=log,
+            vendor_files["xmlFile"],
+            source_file_id,
+            case_id,
+            bool(cnv_path_name),
+            bool(structural_path_name),
+            translocations,
+            hyperdiploidy_chromosomes,
         )
         pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
             vendor_files["somaticVcfFile"],
             vendor_files["somaticVcfSnvFile"],
             vendor_files["somaticVcfIndelFile"],
+            short_variant_table_rows,
             local_output_dir,
             log,
         )
         somatic_vcf_meta_data = process_vcf(
-            vcf_in_file=pre_filtered_somatic_vcf_path,
-            root_path=local_output_dir,
-            case_id=case_id,
-            sequence_type="somatic",
-            xml_in_file=vendor_files["xmlFile"],
+            pre_filtered_somatic_vcf_path,
+            local_output_dir,
+            case_id,
+            "somatic",
+            short_variant_table_rows,
             log=log,
         )
         germline_vcf_meta_data = process_vcf(
-            vcf_in_file=vendor_files["germlineVcfFile"],
-            root_path=local_output_dir,
-            case_id=case_id,
-            sequence_type="germline",
-            xml_in_file=vendor_files["xmlFile"],
-            log=log,
+            vendor_files["germlineVcfFile"],
+            local_output_dir,
+            case_id,
+            "germline",
+            short_variant_table_rows,
+            log,
         )
     manifest_path_name = f"{local_output_dir}/{case_id}.ga4gh.genomics.yml"

phc-ingestion-0.8.34/ingestion/nextgen/util/alteration_table.py ADDED Viewed

@@ -0,0 +1,184 @@
+from logging import Logger
+import re
+from typing import TypedDict, Generic, TypeVar
+T = TypeVar("T")
+class AlterationTableRow(Generic[T], TypedDict):
+    gene: T
+    type: str
+    description: str
+    vaf: str
+    info: str
+class ShortVariantGene(TypedDict):
+    chr: str
+    pos: int
+class CopyNumberVariantGene(TypedDict):
+    gene: str
+    chr: str
+    start: int
+    end: int
+class StructuralVariantGene(TypedDict):
+    gene1: str
+    chr1: str
+    pos1: int
+    gene2: str
+    chr2: str
+    pos2: int
+base_short_variant_types: list[str] = [
+    "Missense",
+    "Frameshift",
+    "Stop gained",
+    "Stop lost",
+    "Inframe deletion",
+    "Inframe insertion",
+    "Inframe",
+    "Splice site",
+    "Splice region",
+    "Nonsense",
+    "Splice acceptor",
+    "Splice donor",
+]
+def get_short_variant_types() -> list[str]:
+    # For multi-word short variant types, sometimes the spaces are not included
+    short_variant_types: list[str] = []
+    for short_variant_type in base_short_variant_types:
+        short_variant_types.append(short_variant_type)
+        if " " in short_variant_type:
+            short_variant_types.append(short_variant_type.replace(" ", ""))
+    return short_variant_types
+def extract_all_table_lines(xml_in_file: str) -> list[str]:
+    with open(xml_in_file, "r") as f:
+        xml_lines = f.readlines()
+    in_range_trigger = False
+    table_lines: list[str] = []
+    for line in xml_lines:
+        if "Gene (Chr. Position, hg38)" in line:
+            in_range_trigger = True
+        if in_range_trigger:
+            if "</Table>" in line:
+                break
+            table_lines.append(line)
+    return table_lines
+def extract_alteration_table_rows(xml_in_file: str, log: Logger) -> list[AlterationTableRow[str]]:
+    table_lines = extract_all_table_lines(xml_in_file)
+    # Remove completely empty lines
+    table_lines = [line for line in table_lines if line.strip() != ""]
+    table_row_lines: list[list[str]] = []
+    current_row: list[str] = []
+    for line in table_lines:
+        if line.strip() == "</TR>":
+            if current_row:
+                table_row_lines.append(current_row)
+                current_row = []
+        line = re.sub(r"<\/?T.\/?>", "", line).strip()
+        if line and line != "p.":
+            current_row.append(line)
+    alteration_table_rows: list[AlterationTableRow[str]] = []
+    # Skip the first row which is the header
+    for row in table_row_lines[1:]:
+        # Sometimes the alteration table is "empty", in which case the `type` column will only contain "NA" values
+        if row[1] == "NA":
+            continue
+        alteration_table_rows.append(
+            {
+                "gene": row[0],
+                "type": row[1],
+                "description": row[2],
+                "vaf": row[3],
+                # Sometimes the info column is empty, so we need to check if it actually exists
+                # So far, it seems like rows with empty "info" columns are generally not useful for us
+                # and the data in them will not be used anywhere, so we just fill in an empty string
+                "info": row[4] if len(row) > 4 else "",
+            }
+        )
+    return alteration_table_rows
+def parse_short_variant_gene(gene: str) -> ShortVariantGene:
+    pattern = r"^.*\((?P<chr>chr\d+|chrX|chrY):(?P<pos>\d+).*\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for short variant")
+    return {"chr": match.group("chr"), "pos": int(match.group("pos"))}
+def parse_copy_number_variant_gene(gene: str) -> CopyNumberVariantGene:
+    pattern = r"^(?P<gene>[A-Z1-9]*).*?\((?P<chr>chr\d+|chrX|chrY):(?P<start>\d+)_(?P<end>\d+)\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for copy number variant")
+    return {
+        "gene": match.group("gene"),
+        "chr": match.group("chr"),
+        "start": int(match.group("start")),
+        "end": int(match.group("end")),
+    }
+def parse_structural_variant_gene(gene: str) -> StructuralVariantGene:
+    pattern = r"^(?P<gene1>[A-Z1-9]*)(-|\/)(?P<gene2>[A-Z1-9]*).*\(.*(?P<chr1>chr\d+|chrX|chrY):(?P<pos1>\d+).*;.*(?P<chr2>chr\d+|chrX|chrY):(?P<pos2>\d+).*\).*$"
+    match = re.match(pattern, gene)
+    if not match:
+        raise RuntimeError(f"Failed to parse gene field for structural variant")
+    return {
+        "gene1": match.group("gene1"),
+        "chr1": match.group("chr1"),
+        "pos1": int(match.group("pos1")),
+        "gene2": match.group("gene2"),
+        "chr2": match.group("chr2"),
+        "pos2": int(match.group("pos2")),
+    }
+def extract_variant_table_rows_and_hyperdiploidy(xml_in_file: str, log: Logger) -> tuple[
+    list[AlterationTableRow[ShortVariantGene]],
+    list[AlterationTableRow[CopyNumberVariantGene]],
+    list[AlterationTableRow[StructuralVariantGene]],
+    list[str] | None,
+]:
+    alteration_table_rows = extract_alteration_table_rows(xml_in_file, log)
+    short_variant_rows: list[AlterationTableRow[ShortVariantGene]] = []
+    copy_number_rows: list[AlterationTableRow[CopyNumberVariantGene]] = []
+    structural_variant_rows: list[AlterationTableRow[StructuralVariantGene]] = []
+    hyperdiploidy_chromosomes: list[str] | None = None
+    short_variant_types = get_short_variant_types()
+    for row in alteration_table_rows:
+        if row["type"] in short_variant_types:
+            short_variant_rows.append({**row, "gene": parse_short_variant_gene(row["gene"])})
+        elif row["type"] == "CNV":
+            copy_number_rows.append({**row, "gene": parse_copy_number_variant_gene(row["gene"])})
+        elif row["type"] == "Translocation":
+            structural_variant_rows.append(
+                {**row, "gene": parse_structural_variant_gene(row["gene"])}
+            )
+        elif row["type"] == "Hyperdiploidy":
+            hyperdiploidy_chromosomes = re.findall(r"\d+", row["gene"])
+    return short_variant_rows, copy_number_rows, structural_variant_rows, hyperdiploidy_chromosomes

phc-ingestion-0.8.34/ingestion/nextgen/util/interpretation.py ADDED Viewed

@@ -0,0 +1,28 @@
+from logging import Logger
+def map_interpretation(status: str, log: Logger):
+    """
+    Map interpretation for structural and copy number variants
+    """
+    if status == "Pathogenic":
+        return "Pathogenic"
+    elif "VUS" in status:
+        return "Uncertain significance"
+    else:
+        log.error(f"Failed to resolve interpretation: {status}")
+        return ""
+def map_vendsig(vendsig: str) -> str:
+    """
+    Map vendor significance for short variants
+    """
+    if vendsig in ["Pathogenic"]:
+        return "VENDSIG=Pathogenic"
+    elif vendsig in ["Likely Pathogenic", "LikelyPathogenic"]:
+        return "VENDSIG=Likely pathogenic"
+    elif vendsig in ["VUS"]:
+        return "VENDSIG=Uncertain significance"
+    else:
+        raise RuntimeError(f"Unable to map vendor significance: {vendsig}")

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/manifest_helpers.py RENAMED Viewed

@@ -1,5 +1,3 @@
-from ingestion.nextgen.util.alteration_table import extract_hyperdiploidy_row
 from logging import Logger
 import re
@@ -42,12 +40,3 @@ def parse_report_date(line: str) -> str:
     return parse_pattern(
         r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
     )
-def extract_hyperdiploidy_chromosomes(xml_in_file: str, log: Logger) -> list[str] | None:
-    hyperdiploidy_row_dict = extract_hyperdiploidy_row(xml_in_file, log)
-    if not hyperdiploidy_row_dict:
-        return None
-    return re.findall(r"\d+", hyperdiploidy_row_dict["gene"])

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/nextgen_specific_genes.py RENAMED Viewed

@@ -14,7 +14,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
     {"gene": "CCND3", "chr": "chr6", "start": 41920534, "end": 42562008},
     {"gene": "MYC", "chr": "chr8", "start": 125309416, "end": 129673293},
     {"gene": "CCND1", "chr": "chr11", "start": 69090733, "end": 69656860},
-    {"gene": "IGH", "chr": "chr14", "start": 105578834, "end": 109902208},
+    {"gene": "IGH", "chr": "chr14", "start": 105325507, "end": 109902208},
     {"gene": "MAF", "chr": "chr16", "start": 78428398, "end": 79615096},
     {"gene": "MAFB", "chr": "chr20", "start": 39039005, "end": 40688948},
     {"gene": "IGL", "chr": "chr22", "start": 22012552, "end": 22965858},
@@ -22,7 +22,7 @@ nextgen_specific_genes_with_location: list[GeneWithLocation] = [
 nextgen_specific_genes: set[str] = {gene["gene"] for gene in nextgen_specific_genes_with_location}
-def maybe_get_matching_gene_for_location(chr: str, position: int) -> str | None:
+def maybe_get_nextgen_specific_gene(chr: str, position: int) -> str | None:
     for gene in nextgen_specific_genes_with_location:
         if gene["chr"] == chr and gene["start"] <= position <= gene["end"]:
             return gene["gene"]

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/pre_filter_somatic_vcf.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from logging import Logger
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, ShortVariantGene
 from ingestion.shared_util.open_maybe_gzipped import open_maybe_gzipped
@@ -14,26 +15,54 @@ def extract_filter_from_vcf_line(line: str) -> str:
     return split_line[6]
-def replace_filter_in_vcf_line(line: str, new_filter: str) -> str:
+def replace_filter_in_line(line: str, new_filter: str) -> str:
     split_line = line.strip().split("\t")
     split_line[6] = new_filter
     return "\t".join(split_line) + "\n"
+def is_line_in_alteration_table(
+    line: str, short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]]
+) -> bool:
+    """
+    Returns True if the line in the VCF appears in
+    the alteration table, False otherwise.
+    Matching in the alteration table is less strict than in the
+    VCF files; we only need to match chromosome and position.
+    Also position may differ by +1 or -1, as deletion and insertion positions
+    are represented differently in the VCF and the alteration table.
+    """
+    split_line = line.strip().split("\t")
+    chrom, pos = split_line[0], int(split_line[1])
+    for row in short_variant_table_rows:
+        ref_chrom, ref_pos = row["gene"]["chr"], row["gene"]["pos"]
+        if ref_chrom == chrom and (abs(ref_pos - pos) <= 1):
+            return True
+    return False
 def pre_filter_somatic_vcf(
     somatic_vcf_file: str,
     somatic_vcf_snv_file: str,
     somatic_vcf_indel_file: str,
+    short_variant_table_rows: list[AlterationTableRow[ShortVariantGene]],
     working_dir: str,
     log: Logger,
 ) -> str:
     """
     Removes all variants from the `somatic_vcf_file` that are not
-    also in the `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
+    also in the `somatic_vcf_snv_file`, the `somatic_vcf_indel_file`,
+    or the alteration table.
     Also updates the FILTER field in the `somatic_vcf_file` to match
     the FILTER field of the corresponding variant in the
     `somatic_vcf_snv_file` or `somatic_vcf_indel_file`.
+    For variants in the alteration table, the original FILTER field is kept.
     """
     log.info("Pre-filtering somatic VCF file")
@@ -48,20 +77,22 @@ def pre_filter_somatic_vcf(
                     extract_filter_from_vcf_line(line)
                 )
-    log.info(f"Found {len(valid_variants_with_filters)} valid variants")
+    log.info(f"Found {len(valid_variants_with_filters)} valid variants in the SNV and INDEL files")
     output_vcf_path = f"{working_dir}/filtered_somatic.vcf.gz"
     with (
-        open_maybe_gzipped(somatic_vcf_file, "rt") as f,
+        open_maybe_gzipped(somatic_vcf_file, "rt") as r,
         open_maybe_gzipped(output_vcf_path, "wt") as w,
     ):
-        for line in f:
+        for line in r:
             if line.startswith("#"):
                 w.write(line)
             else:
                 key = build_variant_key_from_vcf_line(line)
                 if key in valid_variants_with_filters:
-                    w.write(replace_filter_in_vcf_line(line, valid_variants_with_filters[key]))
+                    w.write(replace_filter_in_line(line, valid_variants_with_filters[key]))
+                elif is_line_in_alteration_table(line, short_variant_table_rows):
+                    w.write(line)
     log.info(f"Successfully pre-filtered somatic VCF file to {output_vcf_path}")
     return output_vcf_path

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_cnv.py RENAMED Viewed

@@ -1,20 +1,21 @@
 import pandas as pd
 from logging import Logger
-from ingestion.nextgen.util.alteration_table import extract_variant_table
+from ingestion.nextgen.util.alteration_table import AlterationTableRow, CopyNumberVariantGene
 from ingestion.nextgen.util.interpretation import map_interpretation
 def process_cnv(
-    xml_in_file: str, cnv_in_file: str, root_path: str, prefix: str, log: Logger
+    cnv_in_file: str,
+    copy_number_variant_table_rows: list[AlterationTableRow[CopyNumberVariantGene]],
+    output_dir: str,
+    case_id: str,
+    log: Logger,
 ) -> str | None:
-    copy_number_path_name = f"{root_path}/{prefix}.copynumber.csv"
-    sample_id = prefix
+    copy_number_path_name = f"{output_dir}/{case_id}.copynumber.csv"
+    sample_id = case_id
-    copy_number_variant_rows = []
-    copy_number_variant_table = extract_variant_table(
-        xml_in_file=xml_in_file, variant_type="copy number", log=log
-    )
+    copy_number_variant_rows: list[str] = []
     with open(cnv_in_file, "r") as f:
         cnv_rows = f.readlines()
@@ -45,20 +46,15 @@ def process_cnv(
         attributes = {}
         # Scrape interpretation
-        interpretation = None
-        if not copy_number_variant_table.empty:
-            for index, row in copy_number_variant_table.iterrows():
-                ref_gene = row["gene"].split(" ")[0]
-                ref_coord = row["gene"].split(" ")[1]
-                if (
-                    ref_gene == gene_id_only
-                    and ref_coord == f"({chromosome}:{start_position}_{end_position})"
-                ):
-                    interpretation = map_interpretation(row["info"], log)
-        if not interpretation:
-            interpretation = "unknown"
+        interpretation = "unknown"
+        for row in copy_number_variant_table_rows:
+            if (
+                row["gene"]["gene"] == gene_id_only
+                and row["gene"]["chr"] == chromosome
+                and row["gene"]["start"] <= int(start_position)
+                and row["gene"]["end"] >= int(end_position)
+            ):
+                interpretation = map_interpretation(row["info"], log)
         copy_number_variant_rows.append(
             f"{sample_id},{gene_id_only},{copy_number},{status},{attributes},{chromosome},{start_position},{end_position},{interpretation}\n"

{phc-ingestion-0.8.32 → phc-ingestion-0.8.34}/ingestion/nextgen/util/process_manifest.py RENAMED Viewed

@@ -65,7 +65,7 @@ def get_cell_purity(interpretation_lines: list):
         return float(00.00)
-def extract_patient_data(patient_info_lines: list):
+def extract_patient_data(patient_info_lines: list[str]):
     patient_data: dict = {}
     patient_data["patientInfo"] = {}
@@ -173,45 +173,46 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
 def process_manifest(
     xml_in_file: str,
     source_file_id: str,
-    prefix: str,
+    case_id: str,
     include_copy_number: bool,
     include_structural: bool,
     somatic_translocations: list[str],
-    log: Logger,
+    hyperdiploidy_chromosomes: list[str] | None,
 ):
     test_text = extract_xml_text(xml_in_file)
     interpretation_text = extract_interpretation_text(xml_in_file)
     manifest = extract_test_data(test_text, interpretation_text)
     manifest.update(extract_patient_data(test_text))
-    hyperdiploidy_chromosomes = manifest_helpers.extract_hyperdiploidy_chromosomes(xml_in_file, log)
+    file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"
     if hyperdiploidy_chromosomes:
         manifest["hyperdiploidyTrisomies"] = hyperdiploidy_chromosomes
     if somatic_translocations:
         manifest["somaticTranslocations"] = somatic_translocations
-    manifest["reportFile"] = f".lifeomic/nextgen/{prefix}/{prefix}.pdf"
+    manifest["reportFile"] = f"{file_prefix}.pdf"
     manifest["sourceFileId"] = source_file_id
     manifest["resources"] = []
     manifest["files"] = [
         {
-            "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
+            "fileName": f"{file_prefix}.modified.somatic.nrm.filtered.vcf.gz",
             "sequenceType": "somatic",
             "type": "shortVariant",
         },
         {
-            "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.modified.germline.nrm.filtered.vcf.gz",
+            "fileName": f"{file_prefix}.modified.germline.nrm.filtered.vcf.gz",
             "sequenceType": "germline",
             "type": "shortVariant",
         },
         {
-            "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.somatic.updated.bam",
+            "fileName": f"{file_prefix}.somatic.updated.bam",
             "sequenceType": "somatic",
             "type": "read",
         },
         {
-            "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.germline.updated.bam",
+            "fileName": f"{file_prefix}.germline.updated.bam",
             "sequenceType": "germline",
             "type": "read",
         },
@@ -219,7 +220,7 @@ def process_manifest(
     if include_structural:
         manifest["files"].append(
             {
-                "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.structural.csv",
+                "fileName": f"{file_prefix}.structural.csv",
                 "sequenceType": "somatic",
                 "type": "structuralVariant",
             },
@@ -227,7 +228,7 @@ def process_manifest(
     if include_copy_number:
         manifest["files"].append(
             {
-                "fileName": f".lifeomic/nextgen/{prefix}/{prefix}.copynumber.csv",
+                "fileName": f"{file_prefix}.copynumber.csv",
                 "sequenceType": "somatic",
                 "type": "copyNumberVariant",
             }

phc-ingestion 0.8.32__tar.gz → 0.8.34__tar.gz

phc-ingestion 0.8.32__tar.gz → 0.8.34__tar.gz