phc-ingestion 0.10.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestion/caris/util/ihc.py +18 -4
- ingestion/caris/util/metadata.py +3 -3
- ingestion/foundation/util/cnv.py +4 -2
- ingestion/foundation/util/fnv.py +4 -1
- ingestion/foundation/util/ga4gh.py +28 -12
- ingestion/foundation/util/vcf_etl.py +3 -0
- ingestion/nebula/manifest_assembler.py +62 -32
- ingestion/nebula/process.py +8 -4
- ingestion/shared_util/fetch_patient.py +76 -0
- ingestion/vcf_standardization/util/read_write.py +32 -6
- {phc_ingestion-0.10.6.dist-info → phc_ingestion-1.0.0.dist-info}/METADATA +2 -1
- {phc_ingestion-0.10.6.dist-info → phc_ingestion-1.0.0.dist-info}/RECORD +13 -12
- {phc_ingestion-0.10.6.dist-info → phc_ingestion-1.0.0.dist-info}/WHEEL +0 -0
ingestion/caris/util/ihc.py
CHANGED
|
@@ -121,9 +121,16 @@ def get_ihc_results(data, log) -> list[dict]:
|
|
|
121
121
|
)
|
|
122
122
|
trigger = True
|
|
123
123
|
if test_results.get("stainPercent"):
|
|
124
|
-
|
|
124
|
+
try:
|
|
125
|
+
stain_percent = float(test_results["stainPercent"])
|
|
126
|
+
if np.isnan(stain_percent) == False:
|
|
127
|
+
log.warning(
|
|
128
|
+
f'IHC test {test_results["biomarkerName"]} has an unexpected pattern for "stainPercent": value of "{test_results["stainPercent"]}" was given when None was expected'
|
|
129
|
+
)
|
|
130
|
+
trigger = True
|
|
131
|
+
except (ValueError, TypeError):
|
|
125
132
|
log.warning(
|
|
126
|
-
f'IHC test {test_results["biomarkerName"]} has an
|
|
133
|
+
f'IHC test {test_results["biomarkerName"]} has an invalid value for "stainPercent": "{test_results["stainPercent"]}"'
|
|
127
134
|
)
|
|
128
135
|
trigger = True
|
|
129
136
|
|
|
@@ -193,9 +200,16 @@ def get_ihc_results(data, log) -> list[dict]:
|
|
|
193
200
|
)
|
|
194
201
|
trigger = True
|
|
195
202
|
if test_results.get("stainPercent"):
|
|
196
|
-
|
|
203
|
+
try:
|
|
204
|
+
stain_percent = float(test_results["stainPercent"])
|
|
205
|
+
if not np.isnan(stain_percent):
|
|
206
|
+
log.warning(
|
|
207
|
+
f'IHC test {test_results["biomarkerName"]} has an unexpected pattern for "stainPercent": value of "{test_results["stainPercent"]}" was given when None was expected'
|
|
208
|
+
)
|
|
209
|
+
trigger = True
|
|
210
|
+
except (ValueError, TypeError):
|
|
197
211
|
log.warning(
|
|
198
|
-
f'IHC test {test_results["biomarkerName"]} has an
|
|
212
|
+
f'IHC test {test_results["biomarkerName"]} has an invalid value for "stainPercent": "{test_results["stainPercent"]}"'
|
|
199
213
|
)
|
|
200
214
|
trigger = True
|
|
201
215
|
if test_results.get("threshold"):
|
ingestion/caris/util/metadata.py
CHANGED
|
@@ -106,10 +106,10 @@ def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
|
|
|
106
106
|
patient = data["patientInformation"]
|
|
107
107
|
metadata["bodySiteSystem"] = "http://lifeomic.com/fhir/sequence-body-site"
|
|
108
108
|
metadata["reportID"] = get_report_id(test_details)
|
|
109
|
-
metadata["mrn"] = patient["mrn"]
|
|
110
|
-
metadata["patientLastName"] = patient["lastName"]
|
|
109
|
+
metadata["mrn"] = patient["mrn"].strip()
|
|
110
|
+
metadata["patientLastName"] = patient["lastName"].strip()
|
|
111
111
|
|
|
112
|
-
metadata["patientDOB"] = patient["dob"]
|
|
112
|
+
metadata["patientDOB"] = patient["dob"].strip()
|
|
113
113
|
|
|
114
114
|
# Get physician info - ordering name, NPI, and facility
|
|
115
115
|
metadata["medFacilName"] = get_med_facil_name(physician_details)
|
ingestion/foundation/util/cnv.py
CHANGED
|
@@ -42,13 +42,15 @@ def extract_copy_numbers(
|
|
|
42
42
|
log.info("Extracting copy numbers from xml")
|
|
43
43
|
copy_number_list: dict = {"CopyNumbers": []}
|
|
44
44
|
|
|
45
|
-
if
|
|
45
|
+
if (
|
|
46
|
+
"variant-report" in results_payload_dict
|
|
47
|
+
and "copy-number-alterations" in results_payload_dict["variant-report"].keys()
|
|
48
|
+
):
|
|
46
49
|
if (
|
|
47
50
|
results_payload_dict["variant-report"]["copy-number-alterations"] is not None
|
|
48
51
|
and "copy-number-alteration"
|
|
49
52
|
in results_payload_dict["variant-report"]["copy-number-alterations"].keys()
|
|
50
53
|
):
|
|
51
|
-
|
|
52
54
|
variants_dict = results_payload_dict["variant-report"]["copy-number-alterations"][
|
|
53
55
|
"copy-number-alteration"
|
|
54
56
|
]
|
ingestion/foundation/util/fnv.py
CHANGED
|
@@ -67,7 +67,10 @@ def extract_fusion_variant(
|
|
|
67
67
|
log.info("Extracting fusion variants from xml")
|
|
68
68
|
fusion_variant_list: dict = {"FusionVariants": []}
|
|
69
69
|
|
|
70
|
-
if
|
|
70
|
+
if (
|
|
71
|
+
"variant-report" in results_payload_dict
|
|
72
|
+
and "rearrangements" in results_payload_dict["variant-report"].keys()
|
|
73
|
+
):
|
|
71
74
|
if (
|
|
72
75
|
results_payload_dict["variant-report"]["rearrangements"] is not None
|
|
73
76
|
and "rearrangement" in results_payload_dict["variant-report"]["rearrangements"].keys()
|
|
@@ -111,17 +111,23 @@ def get_hrd_status(genes) -> Optional[str]:
|
|
|
111
111
|
hrd_dict = {
|
|
112
112
|
"HRD Positive": "positive",
|
|
113
113
|
"HRD Not Detected": "negative",
|
|
114
|
+
"HRDsig Positive": "positive",
|
|
115
|
+
"HRDsig Not Detected": "negative",
|
|
116
|
+
"HRDsig Negative": "negative",
|
|
114
117
|
}
|
|
115
|
-
hrd_entry =
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
118
|
+
hrd_entry = next(
|
|
119
|
+
(
|
|
120
|
+
entry
|
|
121
|
+
for entry in genes.get("Gene", {})
|
|
122
|
+
if entry.get("Name", "") == "Homologous Recombination status"
|
|
123
|
+
or entry.get("Name", "") == "HRD signature"
|
|
124
|
+
),
|
|
125
|
+
None,
|
|
126
|
+
)
|
|
127
|
+
if not hrd_entry:
|
|
124
128
|
return None
|
|
129
|
+
hrd_val = hrd_entry.get("Alterations", {}).get("Alteration", {}).get("Name", "")
|
|
130
|
+
return hrd_dict.get(hrd_val, None)
|
|
125
131
|
|
|
126
132
|
|
|
127
133
|
def get_test_yml(
|
|
@@ -155,6 +161,14 @@ def get_test_yml(
|
|
|
155
161
|
(prop for prop in properties if prop.get("@key") == "ComprehensiveTumorFractionScore"),
|
|
156
162
|
{},
|
|
157
163
|
).get("value")
|
|
164
|
+
hrd_score = next(
|
|
165
|
+
(
|
|
166
|
+
prop
|
|
167
|
+
for prop in properties
|
|
168
|
+
if prop.get("@key") == "HomologousRecombinationDeficiencyScore"
|
|
169
|
+
),
|
|
170
|
+
{},
|
|
171
|
+
).get("value")
|
|
158
172
|
plasma_tumor_fraction_score = None
|
|
159
173
|
if plasma_tumor_fraction_score_value:
|
|
160
174
|
plasma_tumor_fraction_score = get_plasma_tumor_fraction_score_from_value(
|
|
@@ -166,7 +180,7 @@ def get_test_yml(
|
|
|
166
180
|
# Hard-code genome reference for FMI
|
|
167
181
|
genome_reference = "GRCh37"
|
|
168
182
|
|
|
169
|
-
mrn = get_mrn(pmi)
|
|
183
|
+
mrn = get_mrn(pmi).strip()
|
|
170
184
|
|
|
171
185
|
trf = get_trf(sample)
|
|
172
186
|
|
|
@@ -206,8 +220,8 @@ def get_test_yml(
|
|
|
206
220
|
"reportDate": reportDate,
|
|
207
221
|
"reportID": trf,
|
|
208
222
|
"mrn": mrn,
|
|
209
|
-
"patientDOB": get_date(pmi.get("DOB")),
|
|
210
|
-
"patientLastName": pmi.get("LastName"),
|
|
223
|
+
"patientDOB": get_date(pmi.get("DOB")).strip(),
|
|
224
|
+
"patientLastName": pmi.get("LastName").strip(),
|
|
211
225
|
"medFacilName": med_facil_info[0],
|
|
212
226
|
"medFacilID": med_facil_info[1],
|
|
213
227
|
"orderingMDName": ordering_md_info[0],
|
|
@@ -325,5 +339,7 @@ def get_test_yml(
|
|
|
325
339
|
# add homologous recombination deficiency
|
|
326
340
|
if hrd_status:
|
|
327
341
|
yaml_file["hrdStatus"] = hrd_status
|
|
342
|
+
if hrd_score != None and is_number(hrd_score):
|
|
343
|
+
yaml_file["hrdScore"] = float(hrd_score)
|
|
328
344
|
|
|
329
345
|
return yaml_file
|
|
@@ -55,6 +55,9 @@ def get_xml_short_vars(xml_file: str, log):
|
|
|
55
55
|
xml_short_vars = xml_dict["rr:ResultsReport"]["rr:ResultsPayload"]["variant-report"][
|
|
56
56
|
"short-variants"
|
|
57
57
|
]["short-variant"]
|
|
58
|
+
except KeyError as e:
|
|
59
|
+
log.info(f"Missing key in XML structure for short variants: {str(e)}")
|
|
60
|
+
return vendsig_dict
|
|
58
61
|
except TypeError:
|
|
59
62
|
log.info("No short variants found in xml")
|
|
60
63
|
return vendsig_dict
|
|
@@ -1,6 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import re
|
|
2
|
+
import gzip
|
|
3
|
+
from typing import Optional, TypedDict, Any, Callable
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from dateutil import parser
|
|
3
6
|
from ingestion.shared_util.lambda_client import LambdaClient
|
|
7
|
+
from ingestion.vcf_standardization.util.read_write import read_headers
|
|
4
8
|
from ingestion.nebula.constants import (
|
|
5
9
|
DATASET_SYSTEM,
|
|
6
10
|
NEBULA_KIT_ID_SYSTEM,
|
|
@@ -9,7 +13,6 @@ from ingestion.nebula.constants import (
|
|
|
9
13
|
NEBULA_BODY_SITE,
|
|
10
14
|
NEBULA_INDICATION,
|
|
11
15
|
)
|
|
12
|
-
import re
|
|
13
16
|
|
|
14
17
|
|
|
15
18
|
class HumanName(TypedDict):
|
|
@@ -37,7 +40,7 @@ class Patient(Resource):
|
|
|
37
40
|
gender: str | None
|
|
38
41
|
birthDate: str | None
|
|
39
42
|
managingOrganization: Reference | None
|
|
40
|
-
generalPractitioner: Reference | None
|
|
43
|
+
generalPractitioner: list[Reference] | None
|
|
41
44
|
|
|
42
45
|
|
|
43
46
|
class Organization(Resource):
|
|
@@ -49,11 +52,14 @@ class Practitioner(Resource):
|
|
|
49
52
|
|
|
50
53
|
|
|
51
54
|
class ManifestAssembler:
|
|
52
|
-
def __init__(
|
|
55
|
+
def __init__(
|
|
56
|
+
self, ingestion_id: str, account_id: str, project_id: str, kit_id: str, vcf_file_path: str
|
|
57
|
+
):
|
|
53
58
|
self.ingestion_id = ingestion_id
|
|
54
59
|
self.account_id = account_id
|
|
55
60
|
self.project_id = project_id
|
|
56
61
|
self.kit_id = kit_id
|
|
62
|
+
self.vcf_file_path = vcf_file_path
|
|
57
63
|
self.client = LambdaClient(
|
|
58
64
|
"patient-service",
|
|
59
65
|
{
|
|
@@ -72,9 +78,16 @@ class ManifestAssembler:
|
|
|
72
78
|
|
|
73
79
|
response = self.client.invoke(path, "get", None, params)
|
|
74
80
|
entries = response.get("entry", [])
|
|
75
|
-
patient = entries[0]["resource"] if len(entries) > 0 else None
|
|
76
81
|
|
|
77
|
-
|
|
82
|
+
if len(entries) == 0:
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
if len(entries) > 1:
|
|
86
|
+
raise RuntimeError(
|
|
87
|
+
f"Found {len(entries)} patients with kit id {self.kit_id}. Expected 1."
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
return entries[0]["resource"]
|
|
78
91
|
|
|
79
92
|
def __fetch_resource_by_type_and_reference(
|
|
80
93
|
self,
|
|
@@ -92,28 +105,16 @@ class ManifestAssembler:
|
|
|
92
105
|
except RuntimeError:
|
|
93
106
|
return None
|
|
94
107
|
|
|
95
|
-
def __extract_identifier_from_resource(self, resource: Resource
|
|
96
|
-
if not resource:
|
|
97
|
-
return ""
|
|
98
|
-
|
|
108
|
+
def __extract_identifier_from_resource(self, resource: Resource) -> str:
|
|
99
109
|
identifiers = resource.get("identifier", [])
|
|
100
110
|
return identifiers[0].get("value", "") if identifiers else ""
|
|
101
111
|
|
|
102
112
|
def __extract_id_from_reference(self, reference: Reference) -> str:
|
|
103
|
-
|
|
104
|
-
return ""
|
|
105
|
-
|
|
106
|
-
ref_string = reference.get("reference")
|
|
107
|
-
if not ref_string:
|
|
108
|
-
return ""
|
|
109
|
-
|
|
113
|
+
ref_string = reference.get("reference", "")
|
|
110
114
|
parts = ref_string.split("/")
|
|
111
115
|
return parts[1] if len(parts) > 1 else parts[0]
|
|
112
116
|
|
|
113
117
|
def __extract_elation_mrn(self, patient: Patient) -> str:
|
|
114
|
-
if not patient:
|
|
115
|
-
return ""
|
|
116
|
-
|
|
117
118
|
identifier = next(
|
|
118
119
|
(
|
|
119
120
|
x
|
|
@@ -134,12 +135,9 @@ class ManifestAssembler:
|
|
|
134
135
|
|
|
135
136
|
def __parse_human_name(self, human_name: list[HumanName] | None):
|
|
136
137
|
if not human_name:
|
|
137
|
-
return
|
|
138
|
+
return {}
|
|
138
139
|
|
|
139
|
-
human_name = next((x for x in human_name if x.get("use") == "official"),
|
|
140
|
-
|
|
141
|
-
if not human_name:
|
|
142
|
-
return None
|
|
140
|
+
human_name = next((x for x in human_name if x.get("use") == "official"), human_name[0])
|
|
143
141
|
|
|
144
142
|
last_name = human_name.get("family", "")
|
|
145
143
|
first_name = human_name.get("given", [])[0]
|
|
@@ -150,6 +148,37 @@ class ManifestAssembler:
|
|
|
150
148
|
"fullName": f"{first_name} {last_name}",
|
|
151
149
|
}
|
|
152
150
|
|
|
151
|
+
def _safe(self, lambda_func: Callable[[], Any]) -> Any:
|
|
152
|
+
try:
|
|
153
|
+
return lambda_func()
|
|
154
|
+
except Exception as e:
|
|
155
|
+
return None
|
|
156
|
+
|
|
157
|
+
def __extract_collection_date(self) -> dict[str, str]:
|
|
158
|
+
in_file = self.vcf_file_path
|
|
159
|
+
with gzip.open(in_file, "rt") if in_file.endswith(".gz") else open(in_file, "r") as f:
|
|
160
|
+
headers = read_headers(f)
|
|
161
|
+
for header in headers:
|
|
162
|
+
epoch_parts = re.search(r"Epoch=(\d+)", header)
|
|
163
|
+
epoch_time_raw = epoch_parts.group(1) if epoch_parts else None
|
|
164
|
+
epoch_time = self._safe(
|
|
165
|
+
lambda: datetime.fromtimestamp(int(epoch_time_raw) / 1000.0, tz=timezone.utc)
|
|
166
|
+
)
|
|
167
|
+
date_parts = re.search(r"Date=\"(.*)\"", header)
|
|
168
|
+
date_string_raw = date_parts.group(1) if date_parts else None
|
|
169
|
+
date_string = self._safe(
|
|
170
|
+
lambda: parser.parse(date_string_raw, tzinfos={"GST": "UTC+4"})
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
report_date = epoch_time or date_string
|
|
174
|
+
if report_date:
|
|
175
|
+
return {
|
|
176
|
+
"collDate": report_date.astimezone(tz=timezone.utc)
|
|
177
|
+
.isoformat(timespec="milliseconds")
|
|
178
|
+
.replace("+00:00", "Z")
|
|
179
|
+
}
|
|
180
|
+
return {}
|
|
181
|
+
|
|
153
182
|
def create_manifest(self) -> dict[str, Any]:
|
|
154
183
|
patient = self.__fetch_patient_by_kit_id()
|
|
155
184
|
|
|
@@ -157,17 +186,17 @@ class ManifestAssembler:
|
|
|
157
186
|
raise RuntimeError(f"Patient with kit id {self.kit_id} not found")
|
|
158
187
|
patient_birth_date = patient.get("birthDate")
|
|
159
188
|
if not patient_birth_date:
|
|
160
|
-
raise RuntimeError("Patient birth date is
|
|
189
|
+
raise RuntimeError("Patient birth date is required to create a manifest")
|
|
161
190
|
|
|
162
191
|
organization: Organization | None = self.__fetch_resource_by_type_and_reference(
|
|
163
192
|
"Organization", patient.get("managingOrganization")
|
|
164
193
|
)
|
|
165
194
|
|
|
166
195
|
general_practitioner: Practitioner | None = self.__fetch_resource_by_type_and_reference(
|
|
167
|
-
"Practitioner", patient.get("generalPractitioner")
|
|
196
|
+
"Practitioner", next(iter(patient.get("generalPractitioner", [])), None)
|
|
168
197
|
)
|
|
169
198
|
|
|
170
|
-
patient_info = self.__parse_human_name(patient
|
|
199
|
+
patient_info = self.__parse_human_name(patient.get("name"))
|
|
171
200
|
practitioner_info = self.__parse_human_name(
|
|
172
201
|
general_practitioner.get("name") if general_practitioner else None
|
|
173
202
|
)
|
|
@@ -175,6 +204,7 @@ class ManifestAssembler:
|
|
|
175
204
|
return {
|
|
176
205
|
"name": "Nebula",
|
|
177
206
|
"indexedDate": datetime.now().strftime("%Y-%m-%d"),
|
|
207
|
+
**self.__extract_collection_date(),
|
|
178
208
|
"reference": "GRCh38",
|
|
179
209
|
"patientId": patient.get("id"),
|
|
180
210
|
"mrn": self.__extract_elation_mrn(patient),
|
|
@@ -185,10 +215,10 @@ class ManifestAssembler:
|
|
|
185
215
|
"indication": NEBULA_INDICATION,
|
|
186
216
|
"indicationDisplay": NEBULA_INDICATION,
|
|
187
217
|
"patientInfo": {
|
|
188
|
-
"lastName": patient_info.get("lastName"),
|
|
218
|
+
"lastName": patient_info.get("lastName", ""),
|
|
189
219
|
"dob": datetime.fromisoformat(patient_birth_date).strftime("%Y-%m-%d"),
|
|
190
|
-
"firstName": patient_info.get("firstName"),
|
|
191
|
-
"gender": patient
|
|
220
|
+
"firstName": patient_info.get("firstName", ""),
|
|
221
|
+
"gender": patient.get("gender", ""),
|
|
192
222
|
},
|
|
193
223
|
**(
|
|
194
224
|
{
|
ingestion/nebula/process.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
3
|
from ingestion.nebula.constants import NEBULA_TEST_TYPE
|
|
4
|
+
from ingestion.shared_util.ga4gh import create_yaml
|
|
4
5
|
from ingestion.vcf_standardization.standardize import standardize_vcf
|
|
5
6
|
from lifeomic_logging import scoped_logger
|
|
6
7
|
from ingestion.nebula.manifest_assembler import ManifestAssembler
|
|
@@ -14,8 +15,9 @@ def process(vcf_file, out_path, file_name, source_file_id, ingestion_id, account
|
|
|
14
15
|
)
|
|
15
16
|
|
|
16
17
|
case_id = file_name
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
manifest = ManifestAssembler(
|
|
19
|
+
ingestion_id, account_id, project_id, case_id, vcf_file
|
|
20
|
+
).create_manifest()
|
|
19
21
|
base_vcf_file = os.path.basename(vcf_file)
|
|
20
22
|
vcf_out = base_vcf_file.replace(".vcf", ".modified.vcf")
|
|
21
23
|
vcf_final = base_vcf_file.replace(".vcf", ".modified.nrm.filtered.vcf")
|
|
@@ -34,15 +36,17 @@ def process(vcf_file, out_path, file_name, source_file_id, ingestion_id, account
|
|
|
34
36
|
manifest["testType"] = NEBULA_TEST_TYPE
|
|
35
37
|
manifest["reportID"] = case_id
|
|
36
38
|
manifest["sourceFileId"] = source_file_id
|
|
37
|
-
manifest["resources"] = [{"fileName": f".lifeomic/
|
|
39
|
+
manifest["resources"] = [{"fileName": f".lifeomic/nebula/{case_id}/{base_vcf_file}"}]
|
|
38
40
|
manifest["files"] = [
|
|
39
41
|
{
|
|
40
|
-
"fileName": f".lifeomic/
|
|
42
|
+
"fileName": f".lifeomic/nebula/{case_id}/{vcf_final}",
|
|
41
43
|
"sequenceType": "germline",
|
|
42
44
|
"type": "shortVariant",
|
|
43
45
|
}
|
|
44
46
|
]
|
|
45
47
|
|
|
48
|
+
create_yaml(manifest, f"{out_path}/{file_name}")
|
|
49
|
+
|
|
46
50
|
# Hard-code genome reference for Nebula VCFs
|
|
47
51
|
genome_reference = "GRCh38"
|
|
48
52
|
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
from lifeomic_logging import scoped_logger
|
|
3
|
+
from ingestion.shared_util.lambda_client import LambdaClient
|
|
4
|
+
|
|
5
|
+
# Constants
|
|
6
|
+
DATASET_SYSTEM = "http://lifeomic.com/fhir/dataset"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def fetch_patient(
|
|
10
|
+
*,
|
|
11
|
+
elation_id: str,
|
|
12
|
+
given_name: str,
|
|
13
|
+
birthdate: str,
|
|
14
|
+
project_id: str,
|
|
15
|
+
account_id: str,
|
|
16
|
+
ingestion_id: str,
|
|
17
|
+
) -> dict[str, Any] | None:
|
|
18
|
+
"""
|
|
19
|
+
Fetch a patient from the patient-service lambda using Elation ID AND given name AND
|
|
20
|
+
birthdate.
|
|
21
|
+
|
|
22
|
+
Returns:
|
|
23
|
+
The patient resource if found, None otherwise
|
|
24
|
+
|
|
25
|
+
Raises:
|
|
26
|
+
RuntimeError: When multiple patients are found (ambiguous match)
|
|
27
|
+
"""
|
|
28
|
+
log_context = {
|
|
29
|
+
"accountId": account_id,
|
|
30
|
+
"projectId": project_id,
|
|
31
|
+
"elationId": elation_id,
|
|
32
|
+
"ingestionId": ingestion_id,
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
with scoped_logger(__name__, log_context) as log:
|
|
36
|
+
# Create LambdaClient instance with proper headers
|
|
37
|
+
client = LambdaClient(
|
|
38
|
+
"patient-service",
|
|
39
|
+
{
|
|
40
|
+
"Content-Type": "application/json",
|
|
41
|
+
"LifeOmic-Account": account_id,
|
|
42
|
+
"LifeOmic-Correlation-Id": ingestion_id,
|
|
43
|
+
},
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Search by Elation ID AND given name AND birthdate
|
|
47
|
+
log.info(
|
|
48
|
+
f"Searching for patient with Elation ID: {elation_id}, given name: {given_name}, birthdate: {birthdate}"
|
|
49
|
+
)
|
|
50
|
+
response = client.invoke(
|
|
51
|
+
f"/{account_id}/dstu3/Patient",
|
|
52
|
+
"get",
|
|
53
|
+
None,
|
|
54
|
+
{
|
|
55
|
+
"_tag": f"{DATASET_SYSTEM}|{project_id}",
|
|
56
|
+
"identifier": elation_id,
|
|
57
|
+
"name": given_name,
|
|
58
|
+
"birthdate": birthdate,
|
|
59
|
+
},
|
|
60
|
+
)
|
|
61
|
+
entries = response.get("entry", [])
|
|
62
|
+
|
|
63
|
+
if len(entries) == 0:
|
|
64
|
+
error_msg = f"No patient found with Elation ID: {elation_id}, given name: {given_name}, birthdate: {birthdate}"
|
|
65
|
+
log.error(error_msg)
|
|
66
|
+
raise RuntimeError(error_msg)
|
|
67
|
+
|
|
68
|
+
if len(entries) > 1:
|
|
69
|
+
error_msg = f"Found multiple patients when one was expected. Found {len(entries)}. Elation ID {elation_id}, given name {given_name}, birthdate {birthdate}."
|
|
70
|
+
log.error(error_msg)
|
|
71
|
+
raise RuntimeError(error_msg)
|
|
72
|
+
|
|
73
|
+
log.info(
|
|
74
|
+
f"Found patient with Elation ID: {elation_id}, given name: {given_name}, birthdate: {birthdate}"
|
|
75
|
+
)
|
|
76
|
+
return entries[0]["resource"]
|
|
@@ -4,6 +4,8 @@ import re
|
|
|
4
4
|
import os
|
|
5
5
|
from typing import Iterator, Optional
|
|
6
6
|
|
|
7
|
+
import pysam
|
|
8
|
+
|
|
7
9
|
|
|
8
10
|
def check_vcf(infile: str, log: Logger) -> None:
|
|
9
11
|
log.info("Checking VCF file")
|
|
@@ -72,11 +74,35 @@ def write_vcf(
|
|
|
72
74
|
) -> int:
|
|
73
75
|
log.info(f"Writing standardized VCF to {outfile}")
|
|
74
76
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
77
|
+
if compression:
|
|
78
|
+
# Write uncompressed first, then compress with bgzip (indexable format)
|
|
79
|
+
# pysam.tabix_compress requires an uncompressed input file
|
|
80
|
+
# Create a temporary uncompressed file
|
|
81
|
+
if outfile.endswith(".gz"):
|
|
82
|
+
temp_uncompressed = outfile[:-3] # Remove .gz extension
|
|
83
|
+
else:
|
|
84
|
+
temp_uncompressed = outfile + ".tmp"
|
|
85
|
+
|
|
86
|
+
with open(temp_uncompressed, "w") as w:
|
|
87
|
+
w.write("\n".join(headers) + "\n")
|
|
88
|
+
for variant in variants_gen:
|
|
89
|
+
line_count += 1
|
|
90
|
+
if variant:
|
|
91
|
+
w.write(variant + "\n")
|
|
92
|
+
|
|
93
|
+
# Compress with bgzip (indexable format) using pysam
|
|
94
|
+
log.info(f"Compressing VCF with bgzip to {outfile}")
|
|
95
|
+
pysam.tabix_compress(temp_uncompressed, outfile, force=True)
|
|
96
|
+
|
|
97
|
+
# Clean up temporary uncompressed file
|
|
98
|
+
if os.path.exists(temp_uncompressed):
|
|
99
|
+
os.remove(temp_uncompressed)
|
|
100
|
+
else:
|
|
101
|
+
with open(outfile, "w") as w:
|
|
102
|
+
w.write("\n".join(headers) + "\n")
|
|
103
|
+
for variant in variants_gen:
|
|
104
|
+
line_count += 1
|
|
105
|
+
if variant:
|
|
106
|
+
w.write(variant + "\n")
|
|
81
107
|
|
|
82
108
|
return line_count
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: phc-ingestion
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Functions for LifeOmic PHC genomic ingestions
|
|
5
5
|
License: MIT
|
|
6
6
|
Author-email: LifeOmic Development <development@lifeomic.com>
|
|
@@ -13,6 +13,7 @@ Requires-Dist: natsort==7.1.1
|
|
|
13
13
|
Requires-Dist: numpy>=2.1.2
|
|
14
14
|
Requires-Dist: packaging>=23.1
|
|
15
15
|
Requires-Dist: pandas>=2.2.3
|
|
16
|
+
Requires-Dist: pysam>=0.21.0
|
|
16
17
|
Requires-Dist: ruamel.yaml==0.17.21
|
|
17
18
|
Requires-Dist: schema>=0.7.5
|
|
18
19
|
Requires-Dist: urllib3>=1.26.16
|
|
@@ -5,10 +5,10 @@ ingestion/caris/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
|
|
|
5
5
|
ingestion/caris/util/cnv.py,sha256=Nrc0aoG2k4tmrqHb69hAuXr0adDZIVVJRRjf9_sO91E,4441
|
|
6
6
|
ingestion/caris/util/detect_genome_ref.py,sha256=MpiPa71QmlO3MWvjxPzNdbEHyOhorOcpQWWlwE5BO4c,1640
|
|
7
7
|
ingestion/caris/util/hla.py,sha256=X_t6ngBRvmdG3m4I2_KnPFeWn3BaH-3IWHtOvDbS32A,770
|
|
8
|
-
ingestion/caris/util/ihc.py,sha256=
|
|
8
|
+
ingestion/caris/util/ihc.py,sha256=1XfFJjvQq9E45cdPX4EPrg2VkXABxbMJP_BOgzA8z7g,13091
|
|
9
9
|
ingestion/caris/util/interpretation.py,sha256=CghNurqeVA5VTBBorU8-ZTN-PVNPnR8wrmTwKCH3568,555
|
|
10
10
|
ingestion/caris/util/json.py,sha256=aifO1hnZwNSS-ZtY20otyGbfIoc23w9HMWJ5D56lhFo,5020
|
|
11
|
-
ingestion/caris/util/metadata.py,sha256=
|
|
11
|
+
ingestion/caris/util/metadata.py,sha256=OaEG50FrYqyj1n9u9F3Qt17SX2K0cmIIzI6d8Assj64,10311
|
|
12
12
|
ingestion/caris/util/specimen_details.py,sha256=wDTIelrGelAuSljmE6dAoJRjb9kwrlhmB2f1_vcCiUc,2963
|
|
13
13
|
ingestion/caris/util/structural.py,sha256=EUcMIea_WnafoVmFLIyEqlJ_HtYIj_g6qkekXa7QNQs,4628
|
|
14
14
|
ingestion/caris/util/tests.py,sha256=mcG3A8TW81_sn2Bfoa-Gd6Q1sR3_R4FX2BNskD4DkJk,372
|
|
@@ -18,18 +18,18 @@ ingestion/caris/util/vcf.py,sha256=Lkr4HnjMmMvEBVkD-9EkxRI3HpFmgCkgj6CXN4lBfIg,5
|
|
|
18
18
|
ingestion/foundation/__init__.py,sha256=CuUMsxSvWPAVzvnxx4hois632HpXwhwpjtMtiM98UoM,49
|
|
19
19
|
ingestion/foundation/process.py,sha256=mzKVVefGZD1HkEi0Hbb9eD_flwGp1S6SZ7FNXZpWnvI,2464
|
|
20
20
|
ingestion/foundation/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
-
ingestion/foundation/util/cnv.py,sha256=
|
|
22
|
-
ingestion/foundation/util/fnv.py,sha256
|
|
23
|
-
ingestion/foundation/util/ga4gh.py,sha256=
|
|
21
|
+
ingestion/foundation/util/cnv.py,sha256=eyTn1lqbo0tmEacnqWuGATt1h9lllTlVlboT8n5C6Ao,4283
|
|
22
|
+
ingestion/foundation/util/fnv.py,sha256=B8tOREYsR45aIGdSj8SPpe7f3L8FlGaIhLe0DSNqCnc,6006
|
|
23
|
+
ingestion/foundation/util/ga4gh.py,sha256=GVHjMQNd7RaRg9qX7fStBcW5toLjvJCuHvu6hqm4tAo,11803
|
|
24
24
|
ingestion/foundation/util/interpretation.py,sha256=LVVUmMyD6Un1rIKXqiyQDUC6oIJUd8cU3I9YHD5fsXg,405
|
|
25
|
-
ingestion/foundation/util/vcf_etl.py,sha256=
|
|
25
|
+
ingestion/foundation/util/vcf_etl.py,sha256=GXV5JXswwdyHEEdPsM3Qq8tDPFkvZajrZn5chWgF53k,2266
|
|
26
26
|
ingestion/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
27
|
ingestion/generic/process.py,sha256=ZaVnZ_gx9faDUsuresI1A0oCegTa-dPQT7DBFMeZGyY,1777
|
|
28
28
|
ingestion/generic/utils.py,sha256=1MEIru7uq38IjUdL8lcHqDH0oTki9uWrz1f2e-pmRoU,2814
|
|
29
29
|
ingestion/nebula/__init__.py,sha256=VauK-rup_N8ZXVohx3HYqHX_PE_WoPyMUhdv2R7al4o,45
|
|
30
30
|
ingestion/nebula/constants.py,sha256=thKqSwemdaAwAmKvF4FEVI9l1Ph5ergsnMlx6nWte7E,357
|
|
31
|
-
ingestion/nebula/manifest_assembler.py,sha256=
|
|
32
|
-
ingestion/nebula/process.py,sha256=
|
|
31
|
+
ingestion/nebula/manifest_assembler.py,sha256=kcRSy6pixHkuVEK9QSoM-i6ZdLWMSYXw39eKGHvam34,7995
|
|
32
|
+
ingestion/nebula/process.py,sha256=N9OuipynGV_XgEL3nO5I_-di1tk9szOy8LqsyNTw0E0,2323
|
|
33
33
|
ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc,59
|
|
34
34
|
ingestion/nextgen/process.py,sha256=5Z0RfclwTAYZruGDiLPutjPCYFh1DJpoWY9dnttghT4,3993
|
|
35
35
|
ingestion/nextgen/util/alteration_table.py,sha256=JTWBL1Fqj_pGsH5vwuVEnCUJle2wOBk6VYImHYCF9vg,6129
|
|
@@ -46,6 +46,7 @@ ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7l
|
|
|
46
46
|
ingestion/resources/GRCh38_map.csv.gz,sha256=qriYO2_buCCb4T6WcuZ-pCwPxMsm0TL2OxAHvJ1cEfA,612373
|
|
47
47
|
ingestion/shared_util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
48
|
ingestion/shared_util/coords_to_genes.py,sha256=vz9EfgFm3BS6pEPnslbEka8cJKlQZtHJdH2WRCCUMdE,1669
|
|
49
|
+
ingestion/shared_util/fetch_patient.py,sha256=-E-C2m6-fLfKy-r7rSsjZNu_ACGKvdjl1UuB-2gvaJc,2442
|
|
49
50
|
ingestion/shared_util/ga4gh.py,sha256=-jNQj79zspxG67MxHzOfwAhLbb9je55M1h4-i5ri-tU,507
|
|
50
51
|
ingestion/shared_util/gene_to_coords.py,sha256=M-q5ateLSQ4fCF0uMk5TX2uBLRrcZzXqXEf05TPaLsU,876
|
|
51
52
|
ingestion/shared_util/lambda_client.py,sha256=0EdV5nOqe_w-OoDyi72w1P0lk30g1vlTW2sD3ci_Qqw,2695
|
|
@@ -58,7 +59,7 @@ ingestion/vcf_standardization/standardize.py,sha256=zYzZxncq8USA1bUs26L-ByLPTnUl
|
|
|
58
59
|
ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
60
|
ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
|
|
60
61
|
ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
|
|
61
|
-
ingestion/vcf_standardization/util/read_write.py,sha256=
|
|
62
|
-
phc_ingestion-0.
|
|
63
|
-
phc_ingestion-0.
|
|
64
|
-
phc_ingestion-0.
|
|
62
|
+
ingestion/vcf_standardization/util/read_write.py,sha256=xogLdqtm1xGzigY459LqP_1zM6c5X9AjAFGkfaDI-bg,3479
|
|
63
|
+
phc_ingestion-1.0.0.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
|
|
64
|
+
phc_ingestion-1.0.0.dist-info/METADATA,sha256=cBi1pg6CnOcySSTE8zTJzsvS-Tq48YTB5R5waazNTuQ,705
|
|
65
|
+
phc_ingestion-1.0.0.dist-info/RECORD,,
|
|
File without changes
|