phc-ingestion 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,50 @@
1
+ import os
2
+
3
+ from ingestion.vcf_standardization.standardize import standardize_vcf
4
+ from ingestion.broad.utils import check_manifest
5
+ from lifeomic_logging import scoped_logger
6
+
7
+
8
def process(
    manifest_file: str, vcf_file: str, source_file_id: str, out_path: str, case_id: str
) -> tuple[dict, dict]:
    """Standardize a Broad VCF and assemble its manifest and case metadata.

    Args:
        manifest_file: Path to the supplied manifest YAML; validated and
            loaded by ``check_manifest``.
        vcf_file: Path to the input VCF. Its base name drives the names of
            the intermediate and final VCF files.
        source_file_id: Value stored in the manifest under ``sourceFileId``.
        out_path: Output location handed through to ``standardize_vcf``.
        case_id: Case identifier; used to build the sample name and the
            ``.lifeomic/broad_filtered/`` file paths.

    Returns:
        A ``(case_metadata, manifest)`` tuple. ``case_metadata`` carries the
        test type, the value returned by ``standardize_vcf`` (stored as
        ``vcf_line_count``), the case id and the genome reference;
        ``manifest`` is the validated manifest with ``sourceFileId``,
        ``resources`` and ``files`` added.
    """
    with scoped_logger(__name__) as log:
        # Read in supplied manifest
        manifest = check_manifest(manifest_file, case_id, log)

        # Process VCF
        base_vcf_file = os.path.basename(vcf_file)
        vcf_out = base_vcf_file.replace(".vcf", ".modified.vcf")
        vcf_final = base_vcf_file.replace(".vcf", ".modified.nrm.filtered.vcf")
        # The final file name recorded in the manifest always ends in ".gz".
        if not vcf_final.endswith(".gz"):
            vcf_final += ".gz"

        # All broad VCF ingestions are germline, so ensure the
        # sample name is prefixed with "germline_". This matches
        # the downstream logic in genomic-manifest
        sample_name = f"germline_{case_id}"
        vcf_line_count = standardize_vcf(
            vcf_file, vcf_out, out_path, sample_name, log, compression=True
        )

        # Add to manifest
        manifest["sourceFileId"] = source_file_id
        manifest["resources"] = [
            {"fileName": f".lifeomic/broad_filtered/{case_id}/{base_vcf_file}"}
        ]
        manifest["files"] = [
            {
                "fileName": f".lifeomic/broad_filtered/{case_id}/{vcf_final}",
                "sequenceType": "germline",
                "type": "shortVariant",
            }
        ]

        case_metadata = {
            "test_type": manifest["testType"],
            "vcf_line_count": vcf_line_count,
            "case_id": manifest["reportID"],
            "germline_genome_reference": manifest["reference"],
        }

        return case_metadata, manifest
@@ -0,0 +1,82 @@
1
+ import os
2
+ from logging import Logger
3
+ from schema import Schema, SchemaError, Optional
4
+ from ruamel.yaml import YAML
5
+
6
+
7
def _build_manifest_schema():
    """Return the ``Schema`` that a supplied manifest must satisfy."""
    return Schema(
        {
            # Required fields
            "name": str,  # Vendor name
            "testType": str,  # Vendor test name
            "indexedDate": str,  # Date of ingestion YYYY-MM-DD
            "reference": str,  # Reference genome
            "mrn": str,  # Patient MRN
            "patientInfo": {
                "lastName": str,
                "dob": str,  # YYYY-MM-DD
                "firstName": str,
                "gender": str,
            },
            # Optional fields
            Optional("receivedDate"): str,
            Optional("collDate"): str,
            Optional("reportDate"): str,
            Optional("reportFile"): str,
            Optional("medFacilName"): str,
            Optional("medFacilID"): str,
            Optional("orderingMDName"): str,
            Optional("orderingMDNPI"): str,
            Optional("indicationSystem"): "http://lifeomic.com/fhir/sequence-indication",
            Optional("indication"): str,
            Optional("indicationDisplay"): str,
            Optional("bodySite"): str,
            Optional("bodySiteDisplay"): str,
            Optional("bodySiteSystem"): "http://lifeomic.com/fhir/sequence-body-site",
            Optional("tmb"): str,
            Optional("tmbScore"): float,
            Optional("msi"): str,
            Optional("lossOfHeterozygosityScore"): int,
            Optional("lossOfHeterozygosityStatus"): str,
            # `object` accepts a value of any type. The builtin `any` would be
            # called as a validator and reject empty or non-iterable values.
            Optional("ihcTests"): object,
            Optional("nonHumanContent"): object,
            Optional("plasmaTumorFraction"): str,
            Optional("cellPurity"): float,
            Optional("hrdStatus"): str,
            Optional("sampleId"): str,
        }
    )


def check_manifest(manifest_file_path: str, case_id, log: Logger):
    """Load a manifest YAML file, validate it, and add derived fields.

    Args:
        manifest_file_path: Path to the manifest YAML file.
        case_id: Vendor report ID / case ID; stored under ``reportID``.
        log: Logger used to report validation failures and success.

    Returns:
        The loaded manifest with ``patientInfo["identifiers"]``,
        ``patientLastName``, ``patientDOB`` and ``reportID`` added.

    Raises:
        FileNotFoundError: If ``manifest_file_path`` does not exist.
        SchemaError: If the manifest does not match the schema; the error
            is logged before being re-raised.
    """
    # Fail fast before doing any other work when the manifest is absent.
    if not os.path.exists(manifest_file_path):
        raise FileNotFoundError(f"Manifest file not found: {manifest_file_path}")

    # Read in manifest yaml
    with open(manifest_file_path, "r") as file:
        manifest = YAML(typ="safe").load(file)

    # Validate
    try:
        _build_manifest_schema().validate(manifest)
    except SchemaError as e:
        log.error(e)
        raise  # bare raise keeps the original traceback intact

    # Add duplicate fields from supplied ones to fit formatting
    manifest["patientInfo"]["identifiers"] = [
        {
            "codingCode": "MR",
            "codingSystem": "http://hl7.org/fhir/v2/0203",
            "value": manifest["mrn"],
        }
    ]
    manifest["patientLastName"] = manifest["patientInfo"]["lastName"]
    manifest["patientDOB"] = manifest["patientInfo"]["dob"]
    manifest["reportID"] = case_id  # Vendor report ID / Case ID

    log.info("Manifest file validated")
    return manifest
@@ -25,7 +25,9 @@ def parse_patient_name(line: str) -> str:
25
25
 
26
26
 
27
27
  def parse_sample_number(line: str) -> str:
28
- return parse_pattern(r"^.*Specimen #: (\d*-?R?) .*$", line, "sample number")
28
+ return parse_pattern(
29
+ r"^.*Specimen #: (\d+(?:-[A-Za-z0-9]+)?(?:-?R)?) .*$", line, "sample number"
30
+ )
29
31
 
30
32
 
31
33
  def parse_body_site(line: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phc-ingestion
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: Functions for LifeOmic PHC genomic ingestions
5
5
  License: MIT
6
6
  Author-email: LifeOmic Development <development@lifeomic.com>
@@ -1,4 +1,7 @@
1
1
  ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ ingestion/broad/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ ingestion/broad/process.py,sha256=IrtnANQlbPpI7aza-N_v8qMyxBOX17jq904wDNq7Nng,1803
4
+ ingestion/broad/utils.py,sha256=YHzbmB0tP-aQjvY9vRSn8iOhailleYwvctDbB0IEKb8,2853
2
5
  ingestion/caris/__init__.py,sha256=sFHl6qZWaacTbCFyikkIkuC-z459m5XQ7kwwoCw7HK8,61
3
6
  ingestion/caris/process.py,sha256=7GaSTg9LjqSn7iTNbY3FJIPFvVjkd6wgXiVC00pn-7A,1411
4
7
  ingestion/caris/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,7 +37,7 @@ ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc
34
37
  ingestion/nextgen/process.py,sha256=5Z0RfclwTAYZruGDiLPutjPCYFh1DJpoWY9dnttghT4,3993
35
38
  ingestion/nextgen/util/alteration_table.py,sha256=JTWBL1Fqj_pGsH5vwuVEnCUJle2wOBk6VYImHYCF9vg,6129
36
39
  ingestion/nextgen/util/interpretation.py,sha256=a_B8jVjJXjkrN0hNzB260WNlZdY-BkL26LyLZcYP20A,950
37
- ingestion/nextgen/util/manifest_helpers.py,sha256=zucNhbSss6IuU-kf3tpVoSxV27iSNQohGSMUXQQePSc,2729
40
+ ingestion/nextgen/util/manifest_helpers.py,sha256=Eo89CBRDrMGLIq2OBkKS9lH5yL4pgMUOysle1TL9qr8,2765
38
41
  ingestion/nextgen/util/nextgen_specific_genes.py,sha256=1jFcqvtYAlJ7eBwOBm1UC2TzAbjHjdlvPBUzxr1G8dY,1206
39
42
  ingestion/nextgen/util/pre_filter_somatic_vcf.py,sha256=mIaUihmGLbS38D4Gy_Qtf1lFAfW0A-LgAgQmsrEiI-M,3529
40
43
  ingestion/nextgen/util/process_cnv.py,sha256=MIirc8e0k6lsaTZkRM3U3L3IvbrcHmKQ4xlIu585514,2430
@@ -60,6 +63,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
60
63
  ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
61
64
  ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
62
65
  ingestion/vcf_standardization/util/read_write.py,sha256=x3Pf6Dq8tmolblbCS5CrNmrcHS3FGfqBSFpFgvFGC4g,2526
63
- phc_ingestion-1.0.2.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
64
- phc_ingestion-1.0.2.dist-info/METADATA,sha256=PrNukFYVStjbSRrBD7OjQTv0ZVOIZMwbQ2LDgWP6lNM,676
65
- phc_ingestion-1.0.2.dist-info/RECORD,,
66
+ phc_ingestion-1.0.4.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
67
+ phc_ingestion-1.0.4.dist-info/METADATA,sha256=_pBhlXnkSblAR6gqgXmPMq7FjvAZY_1y55rfdpB9rVQ,676
68
+ phc_ingestion-1.0.4.dist-info/RECORD,,