phc-ingestion 0.9.2__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/PKG-INFO +1 -1
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/json.py +2 -2
- phc-ingestion-0.10.0/ingestion/nebula/__init__.py +1 -0
- phc-ingestion-0.10.0/ingestion/nebula/process.py +25 -0
- phc-ingestion-0.10.0/ingestion/nebula/util/json.py +72 -0
- phc-ingestion-0.10.0/ingestion/nebula/util/vcf.py +100 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/pyproject.toml +1 -1
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/PYPI.md +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/process.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/cnv.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/detect_genome_ref.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/hla.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/ihc.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/interpretation.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/metadata.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/specimen_details.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/structural.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/tests.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/tmb.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/tsv.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/caris/util/vcf.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/process.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/cnv.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/fnv.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/ga4gh.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/interpretation.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/foundation/util/vcf_etl.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/generic/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/generic/process.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/generic/utils.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/process.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/alteration_table.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/interpretation.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/manifest_helpers.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/nextgen_specific_genes.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/pre_filter_somatic_vcf.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/process_cnv.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/process_manifest.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/process_structural.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/process_vcf.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/types.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/resources/GRCh37_map.csv.gz +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/resources/GRCh38_map.csv.gz +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/shared_util/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/shared_util/coords_to_genes.py +0 -0
- {phc-ingestion-0.9.2/ingestion/caris/util → phc-ingestion-0.10.0/ingestion/shared_util}/ga4gh.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/shared_util/gene_to_coords.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/shared_util/open_maybe_gzipped.py +0 -0
- {phc-ingestion-0.9.2/ingestion/caris/util → phc-ingestion-0.10.0/ingestion/shared_util}/tar.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/shared_util/types.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/Variant.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/standardize.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/__init__.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/af_helpers.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/dp_helpers.py +0 -0
- {phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/read_write.py +0 -0
|
@@ -4,13 +4,13 @@ import json
|
|
|
4
4
|
import os
|
|
5
5
|
import shutil
|
|
6
6
|
|
|
7
|
-
from ingestion.caris.util.tar import unpack
|
|
7
|
+
from ingestion.shared_util.tar import unpack
|
|
8
8
|
from ingestion.caris.util.metadata import extract_metadata
|
|
9
9
|
from ingestion.caris.util.structural import extract_structural
|
|
10
10
|
from ingestion.caris.util.cnv import extract_cnv
|
|
11
11
|
from ingestion.caris.util.tsv import convert_tsv_to_rgel
|
|
12
12
|
from ingestion.caris.util.vcf import extract_sv
|
|
13
|
-
from ingestion.caris.util.ga4gh import create_yaml
|
|
13
|
+
from ingestion.shared_util.ga4gh import create_yaml
|
|
14
14
|
from logging import Logger
|
|
15
15
|
|
|
16
16
|
CARIS_CASE_ID_LENGTH = len("TN22-000000")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# ingestion/nebula/process.py defines ``process_nebula`` (there is no ``process``
# function in that module), so import the name that actually exists; importing
# ``process`` would raise ImportError as soon as the package is loaded.
from ingestion.nebula.process import process_nebula

# Keep the originally advertised name available as an alias for the entry point.
process = process_nebula
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from ingestion.nebula.util.json import process_nebula_json
|
|
4
|
+
from ingestion.nebula.util.vcf import process_nebula_vcf
|
|
5
|
+
from lifeomic_logging import scoped_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def process_nebula(infile, outpath, file_name, source_file_id):
    """Run the Nebula ingestion steps and return the case metadata.

    Creates ``outpath`` if needed, hands the input to ``process_nebula_json``
    and then passes the resulting somatic VCF to ``process_nebula_vcf``.

    Returns:
        A dict holding ``"somatic_vcf_line_count"`` plus every entry of the
        genome-reference mapping returned by ``process_nebula_json``.

    Raises:
        KeyError: if ``process_nebula_json`` reports no ``"somatic_vcf"``.
    """
    with scoped_logger(__name__) as log:
        os.makedirs(f"{outpath}", exist_ok=True)

        json_outputs = process_nebula_json(infile, outpath, file_name, source_file_id, log)
        result, file_genome_references, json_data = json_outputs

        vcf_path = result["somatic_vcf"]
        somatic_vcf_line_count = process_nebula_vcf(
            vcf_path, json_data, outpath, file_name, log
        )

        # Merging an empty mapping is a no-op, so no emptiness check is needed.
        return {
            "somatic_vcf_line_count": somatic_vcf_line_count,
            **file_genome_references,
        }
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import gzip
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
|
|
6
|
+
from ingestion.shared_util.tar import unpack
|
|
7
|
+
from ingestion.nebula.util.vcf import extract_sv
|
|
8
|
+
from ingestion.shared_util.ga4gh import create_yaml
|
|
9
|
+
from logging import Logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def handle_tsv(file: str, file_list: list[str]) -> dict[str, str]:
    """Decide whether ``file`` should be recorded as the TSV input.

    Args:
        file: Name of the candidate file.
        file_list: Every file name found alongside it.

    Returns:
        ``{"tsv": file}`` when ``file_list`` holds at most one name ending in
        ``"tsv"`` or when ``file`` contains ``"Transformed"``; otherwise ``{}``.
    """
    tsv_total = sum(1 for name in file_list if name.endswith("tsv"))

    # With several TSVs present, only the "Transformed" one is accepted.
    if tsv_total > 1 and "Transformed" not in file:
        return {}
    return {"tsv": file}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def process_nebula_json(
    infile: str, outpath: str, file_name: str, source_file_id: str, log: Logger
):
    """Unpack a Nebula tarball, gzip its VCF and write the manifest YAML.

    Args:
        infile: Path of the input tarball handed to ``unpack``.
        outpath: Directory the tarball is unpacked into; the process working
            directory is changed to it.
        file_name: Prefix passed to ``extract_sv`` and ``create_yaml``.
        source_file_id: Accepted for interface compatibility; not used here.
        log: Logger used to report the tarball contents.

    Returns:
        A tuple ``(result, file_genome_references, data)``:
        ``result`` maps ``"somatic_vcf"`` to the gzipped VCF path when a VCF
        was found (otherwise it is empty), ``file_genome_references`` maps
        ``"<sequenceType>_genome_reference"`` to the reference of each VCF
        entry, and ``data`` is always an empty dict.
    """
    # Unpack tarball and go into the new directory
    unpack(infile, outpath)
    os.chdir(outpath)

    file_list = glob.glob("*")
    files: dict[str, str] = {}

    for file in file_list:
        if file.endswith("vcf"):
            files["somatic.vcf"] = file
        else:
            # There should only be the vcf file
            files[".".join(file.split(".")[1:])] = file

    log.info(f"Files in tarball input: {file_list}")

    data = {}
    metadata = {}

    # Sometimes they don't come in gzipped.
    # Look the VCF up by its exact key: a substring test would also match keys
    # such as "somatic.vcf.gz" and then fail on the missing "somatic.vcf" entry.
    somatic_filename = None
    source_vcf = files.get("somatic.vcf")
    if source_vcf is not None:
        somatic_filename = source_vcf.replace(".vcf", ".somatic.vcf") + ".gz"
        with open(source_vcf, "rb") as f_in, gzip.open(somatic_filename, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # The Nebula extract_sv takes only (prefix, include_somatic); passing the
    # extra germline/pindel flags raised TypeError.
    vcf_results = extract_sv(file_name, bool(somatic_filename))

    # We might not have any of these files but we need an empty json object here.
    file_genome_references = {}
    if vcf_results:
        # metadata starts out empty, so "files" may not exist yet.
        metadata["files"] = metadata.get("files", []) + vcf_results
        for vcf in vcf_results:
            seq_type = vcf.get("sequenceType")
            file_genome_references[f"{seq_type}_genome_reference"] = vcf["reference"]

    create_yaml(metadata, file_name)

    # Return VCF files for immediate processing, and JSON data for adding vendsig
    result = {}
    if somatic_filename is not None:
        result["somatic_vcf"] = f"{outpath}/{somatic_filename}"

    return (result, file_genome_references, data)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import gzip
|
|
3
|
+
|
|
4
|
+
from logging import Logger
|
|
5
|
+
|
|
6
|
+
from ingestion.vcf_standardization.standardize import standardize_vcf
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def create_empty_vcf_zip(prefix):
    """Write a header-only, gzipped VCF to ``<prefix>.modified.somatic.vcf.gz``.

    The file contains the VCF meta-information lines, a ``##filedate`` line
    stamped with the current local time, and the column header line whose
    sample column is named ``prefix``. No variant records are written.

    Args:
        prefix: Used both as the output path prefix and as the sample column
            name in the header line.
    """
    vcf_gzip_path = f"{prefix}.modified.somatic.vcf.gz"

    # The VCF column header must be tab-delimited; spell the tabs out so they
    # cannot be lost to editor or formatter whitespace conversion.
    column_header = "\t".join(
        ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", prefix]
    )

    header_lines = [
        "##fileformat=VCFv4.1",
        "##filedate=" + datetime.datetime.now().isoformat(),
        '##FILTER=<ID=PASS,Description="All filters passed">',
        '##FILTER=<ID=R8,Description="IndelRepeatLength is greater than 8">',
        '##FILTER=<ID=R8.1,Description="IndelRepeatLength of a monomer is greater than 8">',
        '##FILTER=<ID=R8.2,Description="IndelRepeatLength of a dimer is greater than 8">',
        '##FILTER=<ID=sb,Description="Variant strand bias high">',
        '##FILTER=<ID=sb.s,Description="Variant strand bias significantly high (only for SNV)">',
        '##FILTER=<ID=rs,Description="Variant with rs (dbSNP) number in a non-core gene">',
        '##FILTER=<ID=FP,Description="Possibly false positives due to high similarity to off-target regions">',
        '##FILTER=<ID=NC,Description="Noncoding INDELs on non-core genes">',
        '##FILTER=<ID=lowDP,Description="low depth variant">',
        '##FILTER=<ID=Benign,Description="Benign variant">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=AF,Number=1,Type=String,Description="Variant Allele Frequency">',
        column_header,
    ]
    # Every line, including the last, is newline-terminated.
    content = "\n".join(header_lines) + "\n"

    with gzip.open(vcf_gzip_path, "wb") as f:
        f.write(content.encode("utf-8"))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# This is done in next step, we are just adding to yaml
|
|
40
|
+
def extract_sv(prefix, include_somatic: bool):
    """Build the manifest entries describing the Nebula VCF outputs.

    Nothing is read or written here; the entries only name the file that a
    later step is expected to produce.

    Args:
        prefix: Case prefix used to build the ``fileName`` path.
        include_somatic: Whether to emit the somatic short-variant entry.

    Returns:
        A list with one somatic entry when ``include_somatic`` is truthy,
        otherwise an empty list.
    """
    if not include_somatic:
        return []

    somatic_entry = {
        "fileName": f".lifeomic/nebula/{prefix}/{prefix}.modified.somatic.nrm.filtered.vcf.gz",
        "sequenceType": "somatic",
        "type": "shortVariant",  # todo: Is this always going to be shortVariant?
        # Hard-code genome reference for Nebula VCFs
        "reference": "GRCh38",
    }
    return [somatic_entry]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_vendsig_dict(json_data, log: Logger):
    """Return the vendor-significance dictionary for Nebula.

    Currently neither ``json_data`` nor ``log`` is consulted: the result
    carries only the vendor marker, with no per-variant
    ``'chr:start_pos:ref:alt' -> vendsig`` entries.
    """
    return dict(vendor="nebula")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def map_vendsig(ci: str) -> str:
    """Translate a vendor clinical-significance label to its normalised form.

    Args:
        ci: The label as reported by the vendor.

    Returns:
        One of ``"Pathogenic"``, ``"Likely pathogenic"``, ``"Benign"``,
        ``"Likely benign"``, ``"Uncertain significance"``, or ``"Unknown"``
        when the label is not recognised. Matching is exact and
        case-sensitive.
    """
    label_groups = (
        (("Pathogenic Variant", "Pathogenic"), "Pathogenic"),
        (("Likely Pathogenic Variant", "Likely Pathogenic"), "Likely pathogenic"),
        (("Benign Variant", "Benign"), "Benign"),
        (("Likely Benign Variant", "Likely Benign"), "Likely benign"),
        (("Variant of Uncertain Significance", "VUS"), "Uncertain significance"),
    )
    for accepted_labels, normalised in label_groups:
        if ci in accepted_labels:
            return normalised
    return "Unknown"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def process_nebula_vcf(infile, json_data, outpath, file_name, log: Logger):
    """Standardize the Nebula somatic VCF and return its line count.

    Args:
        infile: Path of the VCF passed to ``standardize_vcf``.
        json_data: Forwarded to ``get_vendsig_dict``.
        outpath: Output directory passed to ``standardize_vcf``.
        file_name: Used for the output file name and as the case id.
        log: Logger forwarded to the helpers.

    Returns:
        Whatever ``standardize_vcf`` returns (stored by the caller as the
        somatic VCF line count).
    """
    # Read in a dictionary of variants with VENDSIG from the JSON file for somatic only
    vendor_significance = get_vendsig_dict(json_data, log)

    return standardize_vcf(
        infile=infile,
        outfile=f"{file_name}.modified.somatic.vcf",
        out_path=outpath,
        case_id=file_name,
        log=log,
        vendsig_dict=vendor_significance,
        compression=True,
    )
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/nextgen_specific_genes.py
RENAMED
|
File without changes
|
{phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/nextgen/util/pre_filter_somatic_vcf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{phc-ingestion-0.9.2/ingestion/caris/util → phc-ingestion-0.10.0/ingestion/shared_util}/ga4gh.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{phc-ingestion-0.9.2/ingestion/caris/util → phc-ingestion-0.10.0/ingestion/shared_util}/tar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/af_helpers.py
RENAMED
|
File without changes
|
{phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/dp_helpers.py
RENAMED
|
File without changes
|
{phc-ingestion-0.9.2 → phc-ingestion-0.10.0}/ingestion/vcf_standardization/util/read_write.py
RENAMED
|
File without changes
|