PyPI - phc-ingestion - Versions diffs - 0.8.36__py3-none-any.whl → 0.8.38__py3-none-any.whl - Mend

phc-ingestion 0.8.36__py3-none-any.whl → 0.8.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

ingestion/caris/util/json.py +10 -2
ingestion/caris/util/metadata.py +9 -1
ingestion/caris/util/vcf.py +42 -7
ingestion/foundation/util/vcf_etl.py +0 -1
ingestion/nextgen/process.py +1 -0
ingestion/nextgen/util/manifest_helpers.py +47 -1
ingestion/nextgen/util/process_manifest.py +9 -9
{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/METADATA +4 -3
{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/RECORD +10 -10
{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/WHEEL +0 -0

ingestion/caris/util/json.py CHANGED Viewed

@@ -84,11 +84,17 @@ def process_caris_json(infile: str, outpath: str, file_name: str, source_file_id
                 ]
     # Get patient
-    metadata = extract_metadata(data, file_name, files, source_file_id, log)
+    metadata, is_test_cancelled_permit_vcf_skip = extract_metadata(
+        data, file_name, files, source_file_id, log
+    )
     structural_results = extract_structural(file_name, data, log)
     cnv_results = extract_cnv(file_name, data, log)
     rgel_results = convert_tsv_to_rgel(file_name, files, log)
-    vcf_results = extract_sv(file_name, bool(somatic_filename), bool(germline_filename))
+    include_empty = metadata["ihcTests"] and is_test_cancelled_permit_vcf_skip
+    vcf_results = extract_sv(
+        file_name, bool(somatic_filename), bool(germline_filename), include_empty
+    )
     # We might not have any of these files but we need an empty json object here.
     file_genome_references = {}
@@ -117,5 +123,7 @@ def process_caris_json(infile: str, outpath: str, file_name: str, source_file_id
         result["somatic_vcf"] = f"{outpath}/{somatic_filename}"
     if germline_filename is not None:
         result["germline_vcf"] = f"{outpath}/{germline_filename}"
+    if not germline_filename and not somatic_filename and include_empty:
+        result["somatic_vcf"] = f"{outpath}/{file_name}.modified.somatic.vcf.gz"
     return (result, germline_case_id, file_genome_references, data)

ingestion/caris/util/metadata.py CHANGED Viewed

@@ -80,6 +80,7 @@ def is_valid_test_entry(test: dict):
 # Build up the manifest iteratively because almost everything is optional
 def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
+    is_test_cancelled_permit_vcf_skip = False
     metadata = {}
     test_details = data["testDetails"]
@@ -158,6 +159,13 @@ def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
     # if not sufficient quantity we won't have test results
     if test_details["reportType"] != "QNS":
         for test in tests:
+            if "test_cancellation_reason" in test:
+                if test["test_cancellation_reason"] == "Quantitation quantity not sufficient":
+                    # capture cancellation reason before bailing
+                    # this is so we can generate an empty vcf so present biomarkers are
+                    # still ingested: https://lifeomic.atlassian.net/browse/PHC-5748
+                    is_test_cancelled_permit_vcf_skip = True
             if not is_valid_test_entry(test):
                 continue
             # Sometimes, if there is only a single test result,
@@ -244,4 +252,4 @@ def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
                 )
     active_metadata = {k: v for k, v in metadata.items() if v is not None}
-    return active_metadata
+    return (active_metadata, is_test_cancelled_permit_vcf_skip)

ingestion/caris/util/vcf.py CHANGED Viewed

@@ -1,11 +1,5 @@
 import datetime
 import gzip
-import io
-import os
-import re
-import subprocess
-import sys
-import zipfile
 from logging import Logger
@@ -13,8 +7,38 @@ from ingestion.caris.util.tests import safely_extract_tests_from_json_data
 from ingestion.vcf_standardization.standardize import standardize_vcf
+def create_empty_vcf_zip(prefix):
+    vcf_gzip_path = f"{prefix}.modified.somatic.vcf.gz"
+    content = (
+        """##fileformat=VCFv4.1
+##filedate="""
+        + datetime.datetime.now().isoformat()
+        + """
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=R8,Description="IndelRepeatLength is greater than 8">
+##FILTER=<ID=R8.1,Description="IndelRepeatLength of a monomer is greater than 8">
+##FILTER=<ID=R8.2,Description="IndelRepeatLength of a dimer is greater than 8">
+##FILTER=<ID=sb,Description="Variant strand bias high">
+##FILTER=<ID=sb.s,Description="Variant strand bias significantly high (only for SNV)">
+##FILTER=<ID=rs,Description="Variant with rs (dbSNP) number in a non-core gene">
+##FILTER=<ID=FP,Description="Possibly false positives due to high similarity to off-target regions">
+##FILTER=<ID=NC,Description="Noncoding INDELs on non-core genes">
+##FILTER=<ID=lowDP,Description="low depth variant">
+##FILTER=<ID=Benign,Description="Benign variant">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AF,Number=1,Type=String,Description="Variant Allele Frequency">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	"""
+        + prefix
+        + """
+"""
+    )
+    with gzip.open(vcf_gzip_path, "wb") as f:
+        f.write(content.encode("utf-8"))
 # This is done in next step, we are just adding to yaml
-def extract_sv(prefix, include_somatic: bool, include_germline: bool):
+def extract_sv(prefix, include_somatic: bool, include_germline: bool, include_empty: bool):
     vcfs = []
     # Hard-code genome reference for Caris VCFs
@@ -40,6 +64,17 @@ def extract_sv(prefix, include_somatic: bool, include_germline: bool):
             }
         )
+    if not vcfs and include_empty:
+        create_empty_vcf_zip(prefix)
+        vcfs.append(
+            {
+                "fileName": f".lifeomic/caris/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
+                "sequenceType": "somatic",
+                "type": "shortVariant",
+                "reference": genome_reference,
+            }
+        )
     return vcfs

ingestion/foundation/util/vcf_etl.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gzip
 import xmltodict
-from natsort import natsorted
 from logging import Logger
 import re
 import os

ingestion/nextgen/process.py CHANGED Viewed

@@ -68,6 +68,7 @@ def process(
             bool(structural_path_name),
             translocations,
             hyperdiploidy_chromosomes,
+            log,
         )
         pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
             vendor_files["somaticVcfFile"],

ingestion/nextgen/util/manifest_helpers.py CHANGED Viewed

@@ -36,7 +36,53 @@ def parse_report_id(line: str) -> str:
     return parse_pattern(r"^.*Accession #: (.*?) .*$", line, "report ID")
-def parse_report_date(line: str) -> str:
+def parse_report_date_single_line(line: str) -> str:
     return parse_pattern(
         r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
     )
+def parse_report_date_multiline(patient_info_lines: list[str]) -> str:
+    in_range_trigger = False
+    for line in patient_info_lines:
+        if "Laboratory" in line:
+            in_range_trigger = True
+            continue
+        if in_range_trigger:
+            formatted_line = re.sub(r"<\/?T.\/?>", "", line).strip()
+            if not formatted_line:
+                continue
+            return parse_pattern(
+                r"^.*(\d{2}\/\d{2}\/\d{4}).*$", formatted_line, "report date from multiline"
+            )
+    raise ValueError("Could not parse report date from lines")
+def parse_report_date(patient_info_lines: list[str], log: Logger) -> str:
+    """
+    Typically, the report date is in a form like:
+    ```
+    Diagnostic Genomics Laboratory 01/01/2021
+    ```
+    However, sometimes the date is split across multiple lines, like:
+    ```
+    Diagnostic Genomics Laboratory
+    ...random empty lines or lines with only tags...
+    01/01/2021
+    ```
+    This function attempts to first parse the date from a single line, and if that fails,
+    it will attempt to parse it from multiple lines.
+    """
+    for line in patient_info_lines:
+        if "Laboratory" in line:
+            try:
+                report_date = parse_report_date_single_line(line)
+                return report_date
+            except ValueError:
+                log.warning("Could not parse report date from single line")
+                break
+    return parse_report_date_multiline(patient_info_lines)

ingestion/nextgen/util/process_manifest.py CHANGED Viewed

@@ -17,7 +17,7 @@ def search_and_grab(array: list, search_item: str, grab_index: int):
     return array[array.index([i for i in array if re.search(search_item, i)][0]) + grab_index]
-def extract_xml_text(xml_in_file: str):
+def extract_xml_text(xml_in_file: str) -> list[str]:
     with open(xml_in_file, "r") as f:
         xml_lines = f.readlines()
@@ -35,7 +35,7 @@ def extract_xml_text(xml_in_file: str):
     return patient_info_lines
-def extract_interpretation_text(xml_in_file: str):
+def extract_interpretation_text(xml_in_file: str) -> list[str]:
     with open(xml_in_file, "r") as f:
         xml_lines = f.readlines()
@@ -111,7 +111,7 @@ def extract_patient_data(patient_info_lines: list[str]):
     return patient_data
-def extract_test_data(patient_info_lines: list, interpretation_lines: list):
+def extract_test_data(patient_info_lines: list[str], interpretation_lines: list[str], log: Logger):
     # Initialize manifest and hard-code some values
     manifest: dict[str, Any] = {}
     manifest["testType"] = "Plasma Cell Myeloma Panel"
@@ -128,12 +128,11 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
     manifest["medFacilID"] = ""
     manifest["medFacilName"] = "IU Health"
-    for line in patient_info_lines:
-        if "reportDate" not in manifest and "Laboratory" in line:
-            report_date = manifest_helpers.parse_report_date(line)
-            manifest["reportDate"] = transform_date(report_date)
-            manifest["indexedDate"] = manifest["reportDate"]
+    report_date = manifest_helpers.parse_report_date(patient_info_lines, log)
+    manifest["reportDate"] = transform_date(report_date)
+    manifest["indexedDate"] = manifest["reportDate"]
+    for line in patient_info_lines:
         if "collDate" not in manifest and "Collected" in line:
             collArray = line.split(" ")
             coll_date = search_and_grab(collArray, "Collected", 1)
@@ -178,10 +177,11 @@ def process_manifest(
     include_structural: bool,
     somatic_translocations: list[str],
     hyperdiploidy_chromosomes: list[str] | None,
+    log: Logger,
 ):
     test_text = extract_xml_text(xml_in_file)
     interpretation_text = extract_interpretation_text(xml_in_file)
-    manifest = extract_test_data(test_text, interpretation_text)
+    manifest = extract_test_data(test_text, interpretation_text, log)
     manifest.update(extract_patient_data(test_text))
     file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"

{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: phc-ingestion
-Version: 0.8.36
+Version: 0.8.38
 Summary: Functions for LifeOmic PHC genomic ingestions
 License: MIT
 Author-email: LifeOmic Development <development@lifeomic.com>
@@ -8,11 +8,12 @@ Requires-Python: >=3.11
 Requires-Dist: jsonschema<5.0.0,>=4.16.0
 Requires-Dist: lifeomic-logging<0.4.0,>=0.3.2
 Requires-Dist: natsort==7.1.1
+Requires-Dist: numpy>=2.1.2
 Requires-Dist: packaging>=23.1
-Requires-Dist: pandas<1.6.0,>=1.5.0
+Requires-Dist: pandas>=2.2.3
 Requires-Dist: ruamel.yaml==0.17.21
 Requires-Dist: schema>=0.7.5
-Requires-Dist: xmltodict==0.13.0
+Requires-Dist: xmltodict>=0.14.2
 Description-Content-Type: text/markdown
 # phc-ingestion

{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/RECORD RENAMED Viewed

@@ -8,15 +8,15 @@ ingestion/caris/util/ga4gh.py,sha256=-jNQj79zspxG67MxHzOfwAhLbb9je55M1h4-i5ri-tU
 ingestion/caris/util/hla.py,sha256=X_t6ngBRvmdG3m4I2_KnPFeWn3BaH-3IWHtOvDbS32A,770
 ingestion/caris/util/ihc.py,sha256=vegxudxHj7tLihrXGbEx_ptwkSsu3YCCB1nZVwoiYXg,12312
 ingestion/caris/util/interpretation.py,sha256=CghNurqeVA5VTBBorU8-ZTN-PVNPnR8wrmTwKCH3568,555
-ingestion/caris/util/json.py,sha256=xVEfwKOwyRCEXZWKr9zXooPIrxGkUtSxcAYuDUHkAxw,4706
-ingestion/caris/util/metadata.py,sha256=629-yPh_qIZKJCJzJHEvo6EUGxIBW3Gw31RSytF4v08,9541
+ingestion/caris/util/json.py,sha256=HBU3Tf-XSi9fGHANYUtD8maXNYqmmnpncGh0KCDaPEU,5018
+ingestion/caris/util/metadata.py,sha256=C50e5a6zqYeUG_RcZvFvN-UEXWNJb0q03dMOGWkDgO0,10070
 ingestion/caris/util/specimen_details.py,sha256=R3uKHlLR056XcQbUPI6IO2dLr-z5Z5AJi866DJ379Qw,2105
 ingestion/caris/util/structural.py,sha256=EUcMIea_WnafoVmFLIyEqlJ_HtYIj_g6qkekXa7QNQs,4628
 ingestion/caris/util/tar.py,sha256=BGR_2vBbxyMgF-GzJ3SrihsPdOzII4SFVz9tvKV5vo0,482
 ingestion/caris/util/tests.py,sha256=mcG3A8TW81_sn2Bfoa-Gd6Q1sR3_R4FX2BNskD4DkJk,372
 ingestion/caris/util/tmb.py,sha256=DVi1wPSjVr_32ZCc6Yb51tGqUlcxUx40yCvuqvNuDx4,1027
 ingestion/caris/util/tsv.py,sha256=xeIfDUtqG_5ewkbaPLakqm4kQlu6ClgkAf4tefqKlJA,1595
-ingestion/caris/util/vcf.py,sha256=btqaWhIXjjpm7uoXeLuAMnxVO7tlwjuia4m9ZbtdHMQ,3440
+ingestion/caris/util/vcf.py,sha256=Lkr4HnjMmMvEBVkD-9EkxRI3HpFmgCkgj6CXN4lBfIg,5058
 ingestion/foundation/__init__.py,sha256=CuUMsxSvWPAVzvnxx4hois632HpXwhwpjtMtiM98UoM,49
 ingestion/foundation/process.py,sha256=T8YTvXRiThqE1LTERhrzvvD69mP4qJ7soJ1ZIbu8Y1Y,3151
 ingestion/foundation/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,19 +24,19 @@ ingestion/foundation/util/cnv.py,sha256=YSKCaOBhjZDNXth_GxC-50crDURpTNCMefoHo0uO
 ingestion/foundation/util/fnv.py,sha256=-VstGsBKXM0duC-IpwUkektoTZ9yQUR0IQcDb1HibY0,5937
 ingestion/foundation/util/ga4gh.py,sha256=nc14JStpT7tG7v-dXTrbpPZi29I-HbKKBGNxEZAudhg,10987
 ingestion/foundation/util/interpretation.py,sha256=LVVUmMyD6Un1rIKXqiyQDUC6oIJUd8cU3I9YHD5fsXg,405
-ingestion/foundation/util/vcf_etl.py,sha256=vljaXq8KIjp6oYqXq1FyP3bDhHM8THLCDFGGFp9igv0,2163
+ingestion/foundation/util/vcf_etl.py,sha256=ZBrX1XGRz-ymLUEiVcjjqmPZPb-AfD9On8UkZJDa1Dk,2133
 ingestion/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestion/generic/process.py,sha256=WJHV_-SKhrDZ3JS3fm9DVMoW3Zs2t50GiraSV3vlLHE,1548
 ingestion/generic/utils.py,sha256=1MEIru7uq38IjUdL8lcHqDH0oTki9uWrz1f2e-pmRoU,2814
 ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc,59
-ingestion/nextgen/process.py,sha256=kDCnU685v7aqJ3i4HpFdb7HqgHRSBKqtYPpuyN7qWmM,3976
+ingestion/nextgen/process.py,sha256=5Z0RfclwTAYZruGDiLPutjPCYFh1DJpoWY9dnttghT4,3993
 ingestion/nextgen/util/alteration_table.py,sha256=bvTrXEhIRye5BzSXjZEBy1AvXLZgG0rNkNt3e3rcvv0,6127
 ingestion/nextgen/util/interpretation.py,sha256=56wVk9j-w59gM-11iODXbKuUtZcEwe8zJQSXpjyCguw,872
-ingestion/nextgen/util/manifest_helpers.py,sha256=2xrpEtHbCb1Kea1wJeObkDfTiBklmffQt_o2hMgOSOE,1208
+ingestion/nextgen/util/manifest_helpers.py,sha256=LH5em0xsu9Hrs175vfx6SX8W1Ww2FFRp2wIBSfIEMUM,2725
 ingestion/nextgen/util/nextgen_specific_genes.py,sha256=1jFcqvtYAlJ7eBwOBm1UC2TzAbjHjdlvPBUzxr1G8dY,1206
 ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=mIaUihmGLbS38D4Gy_Qtf1lFAfW0A-LgAgQmsrEiI-M,3529
 ingestion/nextgen/util/process_cnv.py,sha256=MIirc8e0k6lsaTZkRM3U3L3IvbrcHmKQ4xlIu585514,2430
-ingestion/nextgen/util/process_manifest.py,sha256=EnV9I90vnanDvuoErbMfz6yAfjzM5LdhhUF4q5DJd8w,8428
+ingestion/nextgen/util/process_manifest.py,sha256=RsHzDGL1OBea2raoEHACo8owRodIFpX3xVE-aFOoyrg,8428
 ingestion/nextgen/util/process_structural.py,sha256=FKjkK7BkIlocnLs8rFCjrMC39FCQnD0nQCeWvi7cRoA,7539
 ingestion/nextgen/util/process_vcf.py,sha256=ZZURSMnZhHDpFahzijZ4MvCfSWTPdIktzmnCKVVUbGs,7768
 ingestion/nextgen/util/types.py,sha256=SSzt5gv-kss1PR45eQUelypWrGI-dAfQMO3GSD-T-Wg,22
@@ -54,6 +54,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
 ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
 ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
 ingestion/vcf_standardization/util/read_write.py,sha256=x3Pf6Dq8tmolblbCS5CrNmrcHS3FGfqBSFpFgvFGC4g,2526
-phc_ingestion-0.8.36.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
-phc_ingestion-0.8.36.dist-info/METADATA,sha256=Fe445KhsY-t71Eb7bJ67HuJB96Tl-8lvsS16x8UEJaw,552
-phc_ingestion-0.8.36.dist-info/RECORD,,
+phc_ingestion-0.8.38.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
+phc_ingestion-0.8.38.dist-info/METADATA,sha256=zGjelnmcC8NGRJa7qsZmKB6ifrwH1v6PjlR3RDwZ9vs,573
+phc_ingestion-0.8.38.dist-info/RECORD,,

{phc_ingestion-0.8.36.dist-info → phc_ingestion-0.8.38.dist-info}/WHEEL RENAMED Viewed

File without changes

phc-ingestion 0.8.36__py3-none-any.whl → 0.8.38__py3-none-any.whl

phc-ingestion 0.8.36__py3-none-any.whl → 0.8.38__py3-none-any.whl