phc-ingestion 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,13 @@ import json
4
4
  import os
5
5
  import shutil
6
6
 
7
- from ingestion.caris.util.tar import unpack
7
+ from ingestion.shared_util.tar import unpack
8
8
  from ingestion.caris.util.metadata import extract_metadata
9
9
  from ingestion.caris.util.structural import extract_structural
10
10
  from ingestion.caris.util.cnv import extract_cnv
11
11
  from ingestion.caris.util.tsv import convert_tsv_to_rgel
12
12
  from ingestion.caris.util.vcf import extract_sv
13
- from ingestion.caris.util.ga4gh import create_yaml
13
+ from ingestion.shared_util.ga4gh import create_yaml
14
14
  from logging import Logger
15
15
 
16
16
  CARIS_CASE_ID_LENGTH = len("TN22-000000")
@@ -1,46 +1,26 @@
1
1
  import os
2
- from pathlib import Path
3
- from ingestion.vcf_standardization.standardize import standardize_vcf
4
- from lifeomic_logging import scoped_logger
5
2
 
3
+ from ingestion.nebula.util.json import process_nebula_json
4
+ from ingestion.nebula.util.vcf import process_nebula_vcf
5
+ from lifeomic_logging import scoped_logger
6
6
 
7
- def process(vcf_file: str, source_file_id: str, out_path: str, case_id: str) -> dict:
8
- with scoped_logger(__name__) as log:
9
- # TODO: do we need to take in + process the manifest file here?
10
7
 
11
- # Process VCF
12
- base_vcf_file = os.path.basename(vcf_file)
13
- vcf_out = base_vcf_file.replace(".vcf", ".modified.vcf")
14
- vcf_final = base_vcf_file.replace(".vcf", ".modified.nrm.filtered.vcf")
15
- if not vcf_final.endswith(".gz"):
16
- vcf_final = vcf_final + ".gz"
8
+ def process(infile, outpath, file_name, source_file_id):
17
9
 
18
- # Assuming Nebula VCFs are germline
19
- sample_name = f"germline_{case_id}"
20
- vcf_line_count = standardize_vcf(
21
- vcf_file, vcf_out, out_path, sample_name, log, compression=True
10
+ with scoped_logger(__name__) as log:
11
+ log.info(f"Beginning Nebula ingestion for file: {file_name}")
12
+ os.makedirs(f"{outpath}", exist_ok=True)
13
+ result, file_genome_references, json_data = process_nebula_json(
14
+ infile, outpath, file_name, source_file_id, log
15
+ )
16
+ somatic_vcf_line_count = process_nebula_vcf(
17
+ result["somatic_vcf"], json_data, outpath, file_name, log
22
18
  )
23
-
24
- # Create a basic manifest for the Nebula VCF
25
- manifest = {
26
- "testType": "Nebula",
27
- "sourceFileId": source_file_id,
28
- "reference": "GRCh38", # Assuming GRCh38, adjust as needed
29
- "resources": [{"fileName": f".lifeomic/nebula/{case_id}/{base_vcf_file}"}],
30
- "files": [
31
- {
32
- "fileName": f".lifeomic/nebula/{case_id}/{vcf_final}",
33
- "sequenceType": "germline",
34
- "type": "shortVariant",
35
- }
36
- ],
37
- }
38
-
39
19
  case_metadata = {
40
- "test_type": "Nebula",
41
- "vcf_line_count": vcf_line_count,
42
- "case_id": case_id,
43
- "germline_genome_reference": manifest["reference"],
20
+ "somatic_vcf_line_count": somatic_vcf_line_count,
44
21
  }
45
22
 
46
- return case_metadata, manifest
23
+ if file_genome_references != {}:
24
+ case_metadata.update(file_genome_references)
25
+
26
+ return case_metadata
@@ -0,0 +1,72 @@
1
+ import glob
2
+ import gzip
3
+ import os
4
+ import shutil
5
+
6
+ from ingestion.shared_util.tar import unpack
7
+ from ingestion.nebula.util.vcf import extract_sv
8
+ from ingestion.shared_util.ga4gh import create_yaml
9
+ from logging import Logger
10
+
11
+
12
+ def handle_tsv(file: str, file_list: list[str]) -> dict[str, str]:
13
+ multiple_tsv = len([file for file in file_list if file.endswith("tsv")]) > 1
14
+
15
+ if not multiple_tsv or "Transformed" in file:
16
+ return {
17
+ "tsv": file,
18
+ }
19
+ return {}
20
+
21
+
22
+ def process_nebula_json(
23
+ infile: str, outpath: str, file_name: str, source_file_id: str, log: Logger
24
+ ):
25
+ # Unpack tarball and go into the new directory
26
+ unpack(infile, outpath)
27
+ os.chdir(outpath)
28
+
29
+ file_list = glob.glob("*")
30
+ files: dict[str, str] = {}
31
+
32
+ for file in file_list:
33
+ extension = ".".join(file.split(".")[1:])
34
+ if file.endswith("vcf"):
35
+ files["somatic.vcf"] = file
36
+ else:
37
+ # There should only be the vcf file
38
+ files[extension] = file
39
+
40
+ log.info(f"Files in tarball input: {file_list}")
41
+
42
+ somatic_filename = None
43
+ data = {}
44
+ metadata = {}
45
+
46
+ # Sometimes they don't come in gzipped
47
+ for key in files.keys():
48
+ if "somatic.vcf" in key:
49
+ somatic_filename = files["somatic.vcf"].replace(".vcf", ".somatic.vcf") + ".gz"
50
+ with open(files["somatic.vcf"], "rb") as f_in:
51
+ with gzip.open(somatic_filename, "wb") as f_out:
52
+ shutil.copyfileobj(f_in, f_out)
53
+
54
+ vcf_results = extract_sv(file_name, bool(somatic_filename), False, False)
55
+
56
+ # We might not have any of these files but we need an empty json object here.
57
+ file_genome_references = {}
58
+ if vcf_results:
59
+ metadata["files"] = metadata["files"] + vcf_results
60
+ for vcf in vcf_results:
61
+ seq_type = vcf.get("sequenceType")
62
+ file_genome_references[f"{seq_type}_genome_reference"] = vcf["reference"]
63
+
64
+ create_yaml(metadata, file_name)
65
+
66
+ # Return VCF files for immediate processing, and JSON data for adding vendsig
67
+ result = {}
68
+
69
+ if somatic_filename is not None:
70
+ result["somatic_vcf"] = f"{outpath}/{somatic_filename}"
71
+
72
+ return (result, file_genome_references, data)
@@ -0,0 +1,100 @@
1
+ import datetime
2
+ import gzip
3
+
4
+ from logging import Logger
5
+
6
+ from ingestion.vcf_standardization.standardize import standardize_vcf
7
+
8
+
9
+ def create_empty_vcf_zip(prefix):
10
+ vcf_gzip_path = f"{prefix}.modified.somatic.vcf.gz"
11
+ content = (
12
+ """##fileformat=VCFv4.1
13
+ ##filedate="""
14
+ + datetime.datetime.now().isoformat()
15
+ + """
16
+ ##FILTER=<ID=PASS,Description="All filters passed">
17
+ ##FILTER=<ID=R8,Description="IndelRepeatLength is greater than 8">
18
+ ##FILTER=<ID=R8.1,Description="IndelRepeatLength of a monomer is greater than 8">
19
+ ##FILTER=<ID=R8.2,Description="IndelRepeatLength of a dimer is greater than 8">
20
+ ##FILTER=<ID=sb,Description="Variant strand bias high">
21
+ ##FILTER=<ID=sb.s,Description="Variant strand bias significantly high (only for SNV)">
22
+ ##FILTER=<ID=rs,Description="Variant with rs (dbSNP) number in a non-core gene">
23
+ ##FILTER=<ID=FP,Description="Possibly false positives due to high similarity to off-target regions">
24
+ ##FILTER=<ID=NC,Description="Noncoding INDELs on non-core genes">
25
+ ##FILTER=<ID=lowDP,Description="low depth variant">
26
+ ##FILTER=<ID=Benign,Description="Benign variant">
27
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
28
+ ##FORMAT=<ID=AF,Number=1,Type=String,Description="Variant Allele Frequency">
29
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT """
30
+ + prefix
31
+ + """
32
+ """
33
+ )
34
+
35
+ with gzip.open(vcf_gzip_path, "wb") as f:
36
+ f.write(content.encode("utf-8"))
37
+
38
+
39
+ # This is done in next step, we are just adding to yaml
40
+ def extract_sv(prefix, include_somatic: bool):
41
+ vcfs = []
42
+
43
+ # Hard-code genome reference for Nebula VCFs
44
+ genome_reference = "GRCh38"
45
+
46
+ if include_somatic:
47
+ vcfs.append(
48
+ {
49
+ "fileName": f".lifeomic/nebula/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
50
+ "sequenceType": "somatic",
51
+ "type": "shortVariant", # todo: Is this always going to be shortVariant?
52
+ "reference": genome_reference,
53
+ }
54
+ )
55
+
56
+ return vcfs
57
+
58
+
59
+ def get_vendsig_dict(json_data, log: Logger):
60
+ # Return a dicitionary of {'chr:star_pos:ref:alt' : 'vendsig'}
61
+ vendsig_dict = {"vendor": "nebula"}
62
+
63
+ return vendsig_dict
64
+
65
+
66
+ def map_vendsig(ci: str) -> str:
67
+ if ci in ["Pathogenic Variant", "Pathogenic"]:
68
+ return "Pathogenic"
69
+ elif ci in ["Likely Pathogenic Variant", "Likely Pathogenic"]:
70
+ return "Likely pathogenic"
71
+ elif ci in ["Benign Variant", "Benign"]:
72
+ return "Benign"
73
+ elif ci in ["Likely Benign Variant", "Likely Benign"]:
74
+ return "Likely benign"
75
+ elif ci in ["Variant of Uncertain Significance", "VUS"]:
76
+ return "Uncertain significance"
77
+ else:
78
+ return "Unknown"
79
+
80
+
81
+ def process_nebula_vcf(infile, json_data, outpath, file_name, log: Logger):
82
+ line_count = 0
83
+ vendsig_dict = {"vendor": "nebula"}
84
+
85
+ outfile = f"{file_name}.modified.somatic.vcf"
86
+ sample_name = file_name
87
+ # Read in a dictionary of variants with VENDSIG from the JSON file for somatic only
88
+ vendsig_dict = get_vendsig_dict(json_data, log)
89
+
90
+ line_count = standardize_vcf(
91
+ infile=infile,
92
+ outfile=outfile,
93
+ out_path=outpath,
94
+ case_id=sample_name,
95
+ log=log,
96
+ vendsig_dict=vendsig_dict,
97
+ compression=True,
98
+ )
99
+
100
+ return line_count
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phc-ingestion
3
- Version: 0.9.3
3
+ Version: 0.10.1
4
4
  Summary: Functions for LifeOmic PHC genomic ingestions
5
5
  License: MIT
6
6
  Author-email: LifeOmic Development <development@lifeomic.com>
@@ -4,15 +4,13 @@ ingestion/caris/process.py,sha256=7GaSTg9LjqSn7iTNbY3FJIPFvVjkd6wgXiVC00pn-7A,14
4
4
  ingestion/caris/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  ingestion/caris/util/cnv.py,sha256=Nrc0aoG2k4tmrqHb69hAuXr0adDZIVVJRRjf9_sO91E,4441
6
6
  ingestion/caris/util/detect_genome_ref.py,sha256=MpiPa71QmlO3MWvjxPzNdbEHyOhorOcpQWWlwE5BO4c,1640
7
- ingestion/caris/util/ga4gh.py,sha256=-jNQj79zspxG67MxHzOfwAhLbb9je55M1h4-i5ri-tU,507
8
7
  ingestion/caris/util/hla.py,sha256=X_t6ngBRvmdG3m4I2_KnPFeWn3BaH-3IWHtOvDbS32A,770
9
8
  ingestion/caris/util/ihc.py,sha256=vegxudxHj7tLihrXGbEx_ptwkSsu3YCCB1nZVwoiYXg,12312
10
9
  ingestion/caris/util/interpretation.py,sha256=CghNurqeVA5VTBBorU8-ZTN-PVNPnR8wrmTwKCH3568,555
11
- ingestion/caris/util/json.py,sha256=HBU3Tf-XSi9fGHANYUtD8maXNYqmmnpncGh0KCDaPEU,5018
10
+ ingestion/caris/util/json.py,sha256=aifO1hnZwNSS-ZtY20otyGbfIoc23w9HMWJ5D56lhFo,5020
12
11
  ingestion/caris/util/metadata.py,sha256=a6NToMtGtIRrlMd3CQwq4IRjGGmIiBA9JFwsATjNEoQ,10287
13
12
  ingestion/caris/util/specimen_details.py,sha256=wDTIelrGelAuSljmE6dAoJRjb9kwrlhmB2f1_vcCiUc,2963
14
13
  ingestion/caris/util/structural.py,sha256=EUcMIea_WnafoVmFLIyEqlJ_HtYIj_g6qkekXa7QNQs,4628
15
- ingestion/caris/util/tar.py,sha256=BGR_2vBbxyMgF-GzJ3SrihsPdOzII4SFVz9tvKV5vo0,482
16
14
  ingestion/caris/util/tests.py,sha256=mcG3A8TW81_sn2Bfoa-Gd6Q1sR3_R4FX2BNskD4DkJk,372
17
15
  ingestion/caris/util/tmb.py,sha256=DVi1wPSjVr_32ZCc6Yb51tGqUlcxUx40yCvuqvNuDx4,1027
18
16
  ingestion/caris/util/tsv.py,sha256=xeIfDUtqG_5ewkbaPLakqm4kQlu6ClgkAf4tefqKlJA,1595
@@ -29,7 +27,9 @@ ingestion/generic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
29
27
  ingestion/generic/process.py,sha256=ZaVnZ_gx9faDUsuresI1A0oCegTa-dPQT7DBFMeZGyY,1777
30
28
  ingestion/generic/utils.py,sha256=1MEIru7uq38IjUdL8lcHqDH0oTki9uWrz1f2e-pmRoU,2814
31
29
  ingestion/nebula/__init__.py,sha256=VauK-rup_N8ZXVohx3HYqHX_PE_WoPyMUhdv2R7al4o,45
32
- ingestion/nebula/process.py,sha256=Ss2hReq9dAYFcBC4r89RYKcdE6cdUNXvgH2VMBijJPM,1687
30
+ ingestion/nebula/process.py,sha256=r4zKxVRPzl0pI-OInGIy8V_Z2K4UW_iIf9ggUvGZZlk,893
31
+ ingestion/nebula/util/json.py,sha256=ChThuJjF7o4xMQ4gkX7s2w9wgZ5_k528XxiSI3-gOCU,2230
32
+ ingestion/nebula/util/vcf.py,sha256=83JWG6ndPxHYZNXVj6-Gon9AY9JnGeR1JMivUQoSHjM,3311
33
33
  ingestion/nextgen/__init__.py,sha256=7LQ-h_Bvc5P1QcHMdzsqi1Qm4fTJn04-ozar2ty9wSc,59
34
34
  ingestion/nextgen/process.py,sha256=5Z0RfclwTAYZruGDiLPutjPCYFh1DJpoWY9dnttghT4,3993
35
35
  ingestion/nextgen/util/alteration_table.py,sha256=JTWBL1Fqj_pGsH5vwuVEnCUJle2wOBk6VYImHYCF9vg,6129
@@ -46,8 +46,10 @@ ingestion/resources/GRCh37_map.csv.gz,sha256=JOEkjtbYrJpIdyoZdCvfJhvvz2dNfkSve7l
46
46
  ingestion/resources/GRCh38_map.csv.gz,sha256=qriYO2_buCCb4T6WcuZ-pCwPxMsm0TL2OxAHvJ1cEfA,612373
47
47
  ingestion/shared_util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
48
  ingestion/shared_util/coords_to_genes.py,sha256=vz9EfgFm3BS6pEPnslbEka8cJKlQZtHJdH2WRCCUMdE,1669
49
+ ingestion/shared_util/ga4gh.py,sha256=-jNQj79zspxG67MxHzOfwAhLbb9je55M1h4-i5ri-tU,507
49
50
  ingestion/shared_util/gene_to_coords.py,sha256=M-q5ateLSQ4fCF0uMk5TX2uBLRrcZzXqXEf05TPaLsU,876
50
51
  ingestion/shared_util/open_maybe_gzipped.py,sha256=FrOPJ4OgfpQGyT3f1Su1rFeuuYYu6QJ-nVIBIosbfhw,232
52
+ ingestion/shared_util/tar.py,sha256=BGR_2vBbxyMgF-GzJ3SrihsPdOzII4SFVz9tvKV5vo0,482
51
53
  ingestion/shared_util/types.py,sha256=u9AD2OrTQWMBtK_7VXHsD8Rv6HFs-7ZUItNl4KXdL7k,68
52
54
  ingestion/vcf_standardization/Variant.py,sha256=aoSqT4XAECxCF0JZgv1YRmxuw20WGeWkwFTw0x6FmWc,5475
53
55
  ingestion/vcf_standardization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,6 +58,6 @@ ingestion/vcf_standardization/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
56
58
  ingestion/vcf_standardization/util/af_helpers.py,sha256=dpTzoeIQVeBRt0ETF3a9rp5ojZqznHg4x_hCZ8OPcOg,1061
57
59
  ingestion/vcf_standardization/util/dp_helpers.py,sha256=Nq8oLOLObu4_pv16qwwgpALRlUoJVCULrd9cFOD-eoI,823
58
60
  ingestion/vcf_standardization/util/read_write.py,sha256=x3Pf6Dq8tmolblbCS5CrNmrcHS3FGfqBSFpFgvFGC4g,2526
59
- phc_ingestion-0.9.3.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
60
- phc_ingestion-0.9.3.dist-info/METADATA,sha256=BeKjYNu_HC4PvcMS57E38mys7Q2gvpcrDTvaI0vvdPs,572
61
- phc_ingestion-0.9.3.dist-info/RECORD,,
61
+ phc_ingestion-0.10.1.dist-info/WHEEL,sha256=B19PGBCYhWaz2p_UjAoRVh767nYQfk14Sn4TpIZ-nfU,87
62
+ phc_ingestion-0.10.1.dist-info/METADATA,sha256=TKRNABWds66BCmH2r2QoLxqcqNEZzUsom92zWJdx2aw,573
63
+ phc_ingestion-0.10.1.dist-info/RECORD,,
File without changes
File without changes