phc-ingestion 0.8.36__tar.gz → 0.8.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/PKG-INFO +1 -1
  2. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/json.py +10 -2
  3. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/metadata.py +9 -1
  4. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/vcf.py +42 -7
  5. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/vcf_etl.py +0 -1
  6. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/process.py +1 -0
  7. phc-ingestion-0.8.38/ingestion/nextgen/util/manifest_helpers.py +88 -0
  8. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/process_manifest.py +9 -9
  9. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/pyproject.toml +4 -3
  10. phc-ingestion-0.8.36/ingestion/nextgen/util/manifest_helpers.py +0 -42
  11. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/PYPI.md +0 -0
  12. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/__init__.py +0 -0
  13. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/__init__.py +0 -0
  14. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/process.py +0 -0
  15. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/__init__.py +0 -0
  16. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/cnv.py +0 -0
  17. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/detect_genome_ref.py +0 -0
  18. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/ga4gh.py +0 -0
  19. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/hla.py +0 -0
  20. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/ihc.py +0 -0
  21. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/interpretation.py +0 -0
  22. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/specimen_details.py +0 -0
  23. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/structural.py +0 -0
  24. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/tar.py +0 -0
  25. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/tests.py +0 -0
  26. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/tmb.py +0 -0
  27. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/caris/util/tsv.py +0 -0
  28. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/__init__.py +0 -0
  29. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/process.py +0 -0
  30. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/__init__.py +0 -0
  31. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/cnv.py +0 -0
  32. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/fnv.py +0 -0
  33. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/ga4gh.py +0 -0
  34. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/foundation/util/interpretation.py +0 -0
  35. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/generic/__init__.py +0 -0
  36. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/generic/process.py +0 -0
  37. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/generic/utils.py +0 -0
  38. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/__init__.py +0 -0
  39. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/alteration_table.py +0 -0
  40. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/interpretation.py +0 -0
  41. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/nextgen_specific_genes.py +0 -0
  42. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/pre_filter_somatic_vcf.py +0 -0
  43. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/process_cnv.py +0 -0
  44. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/process_structural.py +0 -0
  45. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/process_vcf.py +0 -0
  46. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/nextgen/util/types.py +0 -0
  47. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/resources/GRCh37_map.csv.gz +0 -0
  48. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/resources/GRCh38_map.csv.gz +0 -0
  49. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/shared_util/__init__.py +0 -0
  50. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/shared_util/coords_to_genes.py +0 -0
  51. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/shared_util/gene_to_coords.py +0 -0
  52. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/shared_util/open_maybe_gzipped.py +0 -0
  53. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/shared_util/types.py +0 -0
  54. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/Variant.py +0 -0
  55. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/__init__.py +0 -0
  56. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/standardize.py +0 -0
  57. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/util/__init__.py +0 -0
  58. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/util/af_helpers.py +0 -0
  59. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/util/dp_helpers.py +0 -0
  60. {phc-ingestion-0.8.36 → phc-ingestion-0.8.38}/ingestion/vcf_standardization/util/read_write.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phc-ingestion
3
- Version: 0.8.36
3
+ Version: 0.8.38
4
4
  Summary: Functions for LifeOmic PHC genomic ingestions
5
5
  License: MIT
6
6
  Author-email: LifeOmic Development <development@lifeomic.com>
@@ -84,11 +84,17 @@ def process_caris_json(infile: str, outpath: str, file_name: str, source_file_id
84
84
  ]
85
85
 
86
86
  # Get patient
87
- metadata = extract_metadata(data, file_name, files, source_file_id, log)
87
+ metadata, is_test_cancelled_permit_vcf_skip = extract_metadata(
88
+ data, file_name, files, source_file_id, log
89
+ )
88
90
  structural_results = extract_structural(file_name, data, log)
89
91
  cnv_results = extract_cnv(file_name, data, log)
90
92
  rgel_results = convert_tsv_to_rgel(file_name, files, log)
91
- vcf_results = extract_sv(file_name, bool(somatic_filename), bool(germline_filename))
93
+
94
+ include_empty = metadata["ihcTests"] and is_test_cancelled_permit_vcf_skip
95
+ vcf_results = extract_sv(
96
+ file_name, bool(somatic_filename), bool(germline_filename), include_empty
97
+ )
92
98
 
93
99
  # We might not have any of these files but we need an empty json object here.
94
100
  file_genome_references = {}
@@ -117,5 +123,7 @@ def process_caris_json(infile: str, outpath: str, file_name: str, source_file_id
117
123
  result["somatic_vcf"] = f"{outpath}/{somatic_filename}"
118
124
  if germline_filename is not None:
119
125
  result["germline_vcf"] = f"{outpath}/{germline_filename}"
126
+ if not germline_filename and not somatic_filename and include_empty:
127
+ result["somatic_vcf"] = f"{outpath}/{file_name}.modified.somatic.vcf.gz"
120
128
 
121
129
  return (result, germline_case_id, file_genome_references, data)
@@ -80,6 +80,7 @@ def is_valid_test_entry(test: dict):
80
80
 
81
81
  # Build up the manifest iteratively because almost everything is optional
82
82
  def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
83
+ is_test_cancelled_permit_vcf_skip = False
83
84
  metadata = {}
84
85
 
85
86
  test_details = data["testDetails"]
@@ -158,6 +159,13 @@ def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
158
159
  # if not sufficient quantity we won't have test results
159
160
  if test_details["reportType"] != "QNS":
160
161
  for test in tests:
162
+ if "test_cancellation_reason" in test:
163
+ if test["test_cancellation_reason"] == "Quantitation quantity not sufficient":
164
+ # capture cancellation reason before bailing
165
+ # this is so we can generate an empty vcf so present biomarkers are
166
+ # still ingested: https://lifeomic.atlassian.net/browse/PHC-5748
167
+ is_test_cancelled_permit_vcf_skip = True
168
+
161
169
  if not is_valid_test_entry(test):
162
170
  continue
163
171
  # Sometimes, if there is only a single test result,
@@ -244,4 +252,4 @@ def extract_metadata(data, prefix, files, source_file_id, log: Logger) -> dict:
244
252
  )
245
253
 
246
254
  active_metadata = {k: v for k, v in metadata.items() if v is not None}
247
- return active_metadata
255
+ return (active_metadata, is_test_cancelled_permit_vcf_skip)
@@ -1,11 +1,5 @@
1
1
  import datetime
2
2
  import gzip
3
- import io
4
- import os
5
- import re
6
- import subprocess
7
- import sys
8
- import zipfile
9
3
 
10
4
  from logging import Logger
11
5
 
@@ -13,8 +7,38 @@ from ingestion.caris.util.tests import safely_extract_tests_from_json_data
13
7
  from ingestion.vcf_standardization.standardize import standardize_vcf
14
8
 
15
9
 
10
def create_empty_vcf_zip(prefix):
    """Write a header-only, gzip-compressed somatic VCF for ``prefix``.

    The output path is ``f"{prefix}.modified.somatic.vcf.gz"``. The file holds
    the VCFv4.1 meta lines, the FILTER/FORMAT declarations and the column
    header line, with ``prefix`` used as the single sample column name. No
    variant records are written.

    Args:
        prefix: Path prefix of the output file; also used verbatim as the
            sample column name in the header line.
    """
    vcf_gzip_path = f"{prefix}.modified.somatic.vcf.gz"

    # VCF requires the column header to be TAB-delimited. Joining the column
    # names with an explicit tab escape keeps the separators from being
    # silently turned into spaces by editors, formatters or copy/paste.
    column_header = "\t".join(
        ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", prefix]
    )

    header_lines = [
        "##fileformat=VCFv4.1",
        "##filedate=" + datetime.datetime.now().isoformat(),
        '##FILTER=<ID=PASS,Description="All filters passed">',
        '##FILTER=<ID=R8,Description="IndelRepeatLength is greater than 8">',
        '##FILTER=<ID=R8.1,Description="IndelRepeatLength of a monomer is greater than 8">',
        '##FILTER=<ID=R8.2,Description="IndelRepeatLength of a dimer is greater than 8">',
        '##FILTER=<ID=sb,Description="Variant strand bias high">',
        '##FILTER=<ID=sb.s,Description="Variant strand bias significantly high (only for SNV)">',
        '##FILTER=<ID=rs,Description="Variant with rs (dbSNP) number in a non-core gene">',
        '##FILTER=<ID=FP,Description="Possibly false positives due to high similarity to off-target regions">',
        '##FILTER=<ID=NC,Description="Noncoding INDELs on non-core genes">',
        '##FILTER=<ID=lowDP,Description="low depth variant">',
        '##FILTER=<ID=Benign,Description="Benign variant">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=AF,Number=1,Type=String,Description="Variant Allele Frequency">',
        column_header,
    ]
    # Every line, including the last, is newline-terminated.
    content = "\n".join(header_lines) + "\n"

    # Binary mode plus an explicit encode avoids platform newline translation.
    with gzip.open(vcf_gzip_path, "wb") as f:
        f.write(content.encode("utf-8"))
+
39
+
16
40
  # This is done in next step, we are just adding to yaml
17
- def extract_sv(prefix, include_somatic: bool, include_germline: bool):
41
+ def extract_sv(prefix, include_somatic: bool, include_germline: bool, include_empty: bool):
18
42
  vcfs = []
19
43
 
20
44
  # Hard-code genome reference for Caris VCFs
@@ -40,6 +64,17 @@ def extract_sv(prefix, include_somatic: bool, include_germline: bool):
40
64
  }
41
65
  )
42
66
 
67
+ if not vcfs and include_empty:
68
+ create_empty_vcf_zip(prefix)
69
+ vcfs.append(
70
+ {
71
+ "fileName": f".lifeomic/caris/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
72
+ "sequenceType": "somatic",
73
+ "type": "shortVariant",
74
+ "reference": genome_reference,
75
+ }
76
+ )
77
+
43
78
  return vcfs
44
79
 
45
80
 
@@ -1,6 +1,5 @@
1
1
  import gzip
2
2
  import xmltodict
3
- from natsort import natsorted
4
3
  from logging import Logger
5
4
  import re
6
5
  import os
@@ -68,6 +68,7 @@ def process(
68
68
  bool(structural_path_name),
69
69
  translocations,
70
70
  hyperdiploidy_chromosomes,
71
+ log,
71
72
  )
72
73
  pre_filtered_somatic_vcf_path = pre_filter_somatic_vcf(
73
74
  vendor_files["somaticVcfFile"],
@@ -0,0 +1,88 @@
1
+ from logging import Logger
2
+ import re
3
+
4
+
5
def parse_pattern(pattern: str, line: str, name: str) -> str:
    """Return the first capture group of ``pattern`` matched against ``line``.

    The pattern is anchored at the start of ``line`` (``re.match``), and the
    captured text is returned with surrounding whitespace stripped.

    Args:
        pattern: Regular expression containing at least one capture group.
        line: Text to match.
        name: Human-readable field name, used only in the error message.

    Raises:
        ValueError: If ``pattern`` does not match ``line``.
    """
    found = re.match(pattern, line)
    if found is None:
        raise ValueError(f"Could not parse {name} from line")

    captured = found.group(1)
    return captured.strip()
13
+
14
+
15
def parse_indication(line: str) -> str:
    """Extract the text following "Reason for Referral:" from ``line``.

    The capture stops at the next "Patient" or "<". Matching and whitespace
    stripping are delegated to ``parse_pattern``, which raises ``ValueError``
    when the line does not match.
    """
    indication_pattern = r"^.*Reason for Referral:(.*?)(Patient|<).*$"
    return parse_pattern(indication_pattern, line, "indication")
17
+
18
+
19
def parse_ordering_md(line: str) -> str:
    """Extract the text following "Physician Name:" from ``line``.

    The capture stops at the next "Reason" or "<". Matching and whitespace
    stripping are delegated to ``parse_pattern``, which raises ``ValueError``
    when the line does not match.
    """
    physician_pattern = r"^.*Physician Name:(.*?)(Reason|<).*$"
    return parse_pattern(physician_pattern, line, "ordering MD")
21
+
22
+
23
def parse_patient_name(line: str) -> str:
    """Extract the text following "Patient Name: " from ``line``.

    The capture stops at the next "Accession" or "<". Matching and whitespace
    stripping are delegated to ``parse_pattern``, which raises ``ValueError``
    when the line does not match.
    """
    patient_pattern = r"^.*Patient Name: (.*?)(Accession|<).*$"
    return parse_pattern(patient_pattern, line, "patient name")
25
+
26
+
27
def parse_sample_number(line: str) -> str:
    """Extract the run of digits following "Specimen #: " from ``line``.

    The digits must be followed by a space. Matching is delegated to
    ``parse_pattern``, which raises ``ValueError`` when the line does not match.
    """
    specimen_pattern = r"^.*Specimen #: (\d*) .*$"
    return parse_pattern(specimen_pattern, line, "sample number")
29
+
30
+
31
def parse_body_site(line: str) -> str:
    """Extract the text following "Specimen:" from ``line``.

    The capture stops at the next "Age", "Birthdate" or "<". Matching and
    whitespace stripping are delegated to ``parse_pattern``, which raises
    ``ValueError`` when the line does not match.
    """
    body_site_pattern = r"^.*Specimen:(.*?)(Age|Birthdate|<).*$"
    return parse_pattern(body_site_pattern, line, "body site")
33
+
34
+
35
def parse_report_id(line: str) -> str:
    """Extract the token following "Accession #: " from ``line``.

    The capture runs up to the next space. Matching is delegated to
    ``parse_pattern``, which raises ``ValueError`` when the line does not match.
    """
    accession_pattern = r"^.*Accession #: (.*?) .*$"
    return parse_pattern(accession_pattern, line, "report ID")
37
+
38
+
39
def parse_report_date_single_line(line: str) -> str:
    """Extract an NN/NN/NNNN date that shares a line with the lab heading.

    The line must contain "Diagnostic Genomics Laboratory" followed, somewhere
    later on the same line, by the date. Matching is delegated to
    ``parse_pattern``, which raises ``ValueError`` when the line does not match.
    """
    single_line_pattern = r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$"
    return parse_pattern(single_line_pattern, line, "report date")
43
+
44
+
45
def parse_report_date_multiline(patient_info_lines: list[str]) -> str:
    """Find the report date on a line that follows the "Laboratory" heading.

    Lines are ignored until one containing "Laboratory" has been seen, and
    lines containing "Laboratory" are themselves never parsed. After that, each
    line has ``<T?>``-style tags removed and is stripped; blank results are
    skipped. The first non-blank line is handed to ``parse_pattern`` to pull
    out an NN/NN/NNNN date.

    Raises:
        ValueError: If no candidate line follows a "Laboratory" line, or (via
            ``parse_pattern``) if the first candidate line holds no date.
    """
    tag_regex = re.compile(r"<\/?T.\/?>")
    heading_seen = False

    for raw_line in patient_info_lines:
        if "Laboratory" in raw_line:
            heading_seen = True
        elif heading_seen:
            candidate = tag_regex.sub("", raw_line).strip()
            if candidate:
                # Only the first non-blank line after the heading is tried.
                return parse_pattern(
                    r"^.*(\d{2}\/\d{2}\/\d{4}).*$", candidate, "report date from multiline"
                )

    raise ValueError("Could not parse report date from lines")
61
+
62
+
63
def parse_report_date(patient_info_lines: list[str], log: Logger) -> str:
    """
    Return the report date found in the patient-info lines.

    The usual layout puts the date on the heading line itself:
    ```
    Diagnostic Genomics Laboratory         01/01/2021
    ```

    Occasionally the date lands on a later line instead:
    ```
    Diagnostic Genomics Laboratory
    ...random empty lines or lines with only tags...
    01/01/2021
    ```

    The first line containing "Laboratory" is tried as a single-line date. If
    that fails, a warning is logged and the multiline parser is used. The
    multiline parser is also used when no line contains "Laboratory".
    """
    heading_line = next(
        (entry for entry in patient_info_lines if "Laboratory" in entry),
        None,
    )

    if heading_line is not None:
        try:
            return parse_report_date_single_line(heading_line)
        except ValueError:
            log.warning("Could not parse report date from single line")

    return parse_report_date_multiline(patient_info_lines)
@@ -17,7 +17,7 @@ def search_and_grab(array: list, search_item: str, grab_index: int):
17
17
  return array[array.index([i for i in array if re.search(search_item, i)][0]) + grab_index]
18
18
 
19
19
 
20
- def extract_xml_text(xml_in_file: str):
20
+ def extract_xml_text(xml_in_file: str) -> list[str]:
21
21
  with open(xml_in_file, "r") as f:
22
22
  xml_lines = f.readlines()
23
23
 
@@ -35,7 +35,7 @@ def extract_xml_text(xml_in_file: str):
35
35
  return patient_info_lines
36
36
 
37
37
 
38
- def extract_interpretation_text(xml_in_file: str):
38
+ def extract_interpretation_text(xml_in_file: str) -> list[str]:
39
39
  with open(xml_in_file, "r") as f:
40
40
  xml_lines = f.readlines()
41
41
 
@@ -111,7 +111,7 @@ def extract_patient_data(patient_info_lines: list[str]):
111
111
  return patient_data
112
112
 
113
113
 
114
- def extract_test_data(patient_info_lines: list, interpretation_lines: list):
114
+ def extract_test_data(patient_info_lines: list[str], interpretation_lines: list[str], log: Logger):
115
115
  # Initialize manifest and hard-code some values
116
116
  manifest: dict[str, Any] = {}
117
117
  manifest["testType"] = "Plasma Cell Myeloma Panel"
@@ -128,12 +128,11 @@ def extract_test_data(patient_info_lines: list, interpretation_lines: list):
128
128
  manifest["medFacilID"] = ""
129
129
  manifest["medFacilName"] = "IU Health"
130
130
 
131
- for line in patient_info_lines:
132
- if "reportDate" not in manifest and "Laboratory" in line:
133
- report_date = manifest_helpers.parse_report_date(line)
134
- manifest["reportDate"] = transform_date(report_date)
135
- manifest["indexedDate"] = manifest["reportDate"]
131
+ report_date = manifest_helpers.parse_report_date(patient_info_lines, log)
132
+ manifest["reportDate"] = transform_date(report_date)
133
+ manifest["indexedDate"] = manifest["reportDate"]
136
134
 
135
+ for line in patient_info_lines:
137
136
  if "collDate" not in manifest and "Collected" in line:
138
137
  collArray = line.split(" ")
139
138
  coll_date = search_and_grab(collArray, "Collected", 1)
@@ -178,10 +177,11 @@ def process_manifest(
178
177
  include_structural: bool,
179
178
  somatic_translocations: list[str],
180
179
  hyperdiploidy_chromosomes: list[str] | None,
180
+ log: Logger,
181
181
  ):
182
182
  test_text = extract_xml_text(xml_in_file)
183
183
  interpretation_text = extract_interpretation_text(xml_in_file)
184
- manifest = extract_test_data(test_text, interpretation_text)
184
+ manifest = extract_test_data(test_text, interpretation_text, log)
185
185
  manifest.update(extract_patient_data(test_text))
186
186
 
187
187
  file_prefix = f".lifeomic/nextgen/{case_id}/{case_id}"
@@ -1,19 +1,20 @@
1
1
  [project]
2
2
  name = "phc-ingestion"
3
- version = "0.8.36"
3
+ version = "0.8.38"
4
4
  description = "Functions for LifeOmic PHC genomic ingestions"
5
5
  authors = [
6
6
  { name = "LifeOmic Development", email = "development@lifeomic.com" },
7
7
  ]
8
8
  dependencies = [
9
9
  "lifeomic-logging>=0.3.2,<0.4.0",
10
- "xmltodict==0.13.0",
10
+ "xmltodict>=0.14.2",
11
11
  "natsort==7.1.1",
12
12
  "ruamel.yaml==0.17.21",
13
- "pandas>=1.5.0,<1.6.0",
13
+ "pandas>=2.2.3",
14
14
  "jsonschema>=4.16.0,<5.0.0",
15
15
  "schema>=0.7.5",
16
16
  "packaging>=23.1",
17
+ "numpy>=2.1.2",
17
18
  ]
18
19
  requires-python = ">=3.11"
19
20
  readme = "PYPI.md"
@@ -1,42 +0,0 @@
1
- from logging import Logger
2
- import re
3
-
4
-
5
- def parse_pattern(pattern: str, line: str, name: str) -> str:
6
- regex = re.compile(pattern)
7
- match = regex.match(line)
8
-
9
- if not match:
10
- raise ValueError(f"Could not parse {name} from line")
11
-
12
- return match.group(1).strip()
13
-
14
-
15
- def parse_indication(line: str) -> str:
16
- return parse_pattern(r"^.*Reason for Referral:(.*?)(Patient|<).*$", line, "indication")
17
-
18
-
19
- def parse_ordering_md(line: str) -> str:
20
- return parse_pattern(r"^.*Physician Name:(.*?)(Reason|<).*$", line, "ordering MD")
21
-
22
-
23
- def parse_patient_name(line: str) -> str:
24
- return parse_pattern(r"^.*Patient Name: (.*?)(Accession|<).*$", line, "patient name")
25
-
26
-
27
- def parse_sample_number(line: str) -> str:
28
- return parse_pattern(r"^.*Specimen #: (\d*) .*$", line, "sample number")
29
-
30
-
31
- def parse_body_site(line: str) -> str:
32
- return parse_pattern(r"^.*Specimen:(.*?)(Age|Birthdate|<).*$", line, "body site")
33
-
34
-
35
- def parse_report_id(line: str) -> str:
36
- return parse_pattern(r"^.*Accession #: (.*?) .*$", line, "report ID")
37
-
38
-
39
- def parse_report_date(line: str) -> str:
40
- return parse_pattern(
41
- r"^.*Diagnostic Genomics Laboratory.*(\d{2}\/\d{2}\/\d{4}).*$", line, "report date"
42
- )
File without changes