ugbio-featuremap 1.20.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ugbio_featuremap-1.20.0/PKG-INFO +15 -0
- ugbio_featuremap-1.20.0/README.featuremap.md +3 -0
- ugbio_featuremap-1.20.0/pyproject.toml +45 -0
- ugbio_featuremap-1.20.0/setup.cfg +4 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/__init__.py +0 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/aggregate_lists.awk +121 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/annotate_featuremap.py +88 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/create_somatic_featuremap.py +260 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/explode_lists.awk +54 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_consensus_utils.py +441 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_to_dataframe.py +1343 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_utils.py +60 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_xgb_prediction.py +572 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/filter_dataframe.py +762 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/integrate_mpileup_to_sfm.py +327 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/pileup_featuremap.py +143 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_fields_transformation.py +767 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_inference_utils.py +88 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_utils.py +124 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/PKG-INFO +15 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/SOURCES.txt +23 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/dependency_links.txt +1 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/entry_points.txt +8 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/requires.txt +3 -0
- ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ugbio_featuremap
|
|
3
|
+
Version: 1.20.0
|
|
4
|
+
Summary: Ultima Genomics FeatureMap utils
|
|
5
|
+
Author-email: Itai Rusinek <itai.rusinek@ultimagen.com>, Gat Krieger <gat.krieger@ultimagen.com>, Avigail Moldovan <avigail.moldovan@ultimagen.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: ugbio_core[ml,vcfbed]
|
|
10
|
+
Requires-Dist: ugbio_ppmseq
|
|
11
|
+
Requires-Dist: polars>=1.27.1
|
|
12
|
+
|
|
13
|
+
# ugbio_featuremap
|
|
14
|
+
|
|
15
|
+
This module includes featuremap python scripts and utils for bioinformatics pipelines.
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ugbio_featuremap"
|
|
3
|
+
version = "1.20.0"
|
|
4
|
+
requires-python = ">=3.11"
|
|
5
|
+
dependencies = [
|
|
6
|
+
"ugbio_core[vcfbed,ml]",
|
|
7
|
+
"ugbio_ppmseq",
|
|
8
|
+
"polars>=1.27.1",
|
|
9
|
+
]
|
|
10
|
+
description = "Ultima Genomics FeatureMap utils"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Itai Rusinek", email = "itai.rusinek@ultimagen.com" },
|
|
13
|
+
{ name = "Gat Krieger", email = "gat.krieger@ultimagen.com" },
|
|
14
|
+
{ name = "Avigail Moldovan", email = "avigail.moldovan@ultimagen.com" },
|
|
15
|
+
]
|
|
16
|
+
readme = "README.featuremap.md"
|
|
17
|
+
|
|
18
|
+
[project.license]
|
|
19
|
+
text = "Apache-2.0"
|
|
20
|
+
|
|
21
|
+
[project.scripts]
|
|
22
|
+
run_tests = "pytest:main"
|
|
23
|
+
featuremap_to_dataframe = "ugbio_featuremap.featuremap_to_dataframe:main"
|
|
24
|
+
filter_featuremap = "ugbio_featuremap.filter_dataframe:main"
|
|
25
|
+
add_aggregate_params_and_xgb_score_to_pileup_featuremap = "ugbio_featuremap.featuremap_xgb_prediction:main"
|
|
26
|
+
create_somatic_featuremap = "ugbio_featuremap.create_somatic_featuremap:main"
|
|
27
|
+
integrate_mpileup_to_sfm = "ugbio_featuremap.integrate_mpileup_to_sfm:main"
|
|
28
|
+
somatic_featuremap_fields_transformation = "ugbio_featuremap.somatic_featuremap_fields_transformation:main"
|
|
29
|
+
|
|
30
|
+
[tool.uv.sources.ugbio_core]
|
|
31
|
+
workspace = true
|
|
32
|
+
|
|
33
|
+
[tool.uv.sources.ugbio_ppmseq]
|
|
34
|
+
workspace = true
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.package-data]
|
|
37
|
+
ugbio_featuremap = [
|
|
38
|
+
"*.awk",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = [
|
|
43
|
+
"setuptools>=61.0",
|
|
44
|
+
]
|
|
45
|
+
build-backend = "setuptools.build_meta"
|
|
File without changes
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/awk -f
|
|
2
|
+
# AWK script for computing aggregation metrics (mean, min, max, count, count_zero) for list format fields in VCF TSV output
|
|
3
|
+
# Also supports expanding fixed-size columns (e.g., AD -> AD_0, AD_1)
|
|
4
|
+
#
|
|
5
|
+
# Usage:
|
|
6
|
+
# awk -v list_indices="3,4,5" -f aggregate_lists.awk input.tsv
|
|
7
|
+
# awk -v list_indices="3,4" -v expand_indices="5" -v expand_sizes="2" -f aggregate_lists.awk input.tsv
|
|
8
|
+
|
|
9
|
+
BEGIN {
    # Columns to aggregate are passed as 0-based positions in list_indices.
    # AWK fields are 1-based, so every index is shifted by one before it is
    # recorded both as a set member (list_cols) and in input order
    # (ordered_list_cols).
    num_list_cols = split(list_indices, indices, ",")
    i = 0
    while (++i <= num_list_cols) {
        ordered_list_cols[i] = indices[i] + 1
        list_cols[ordered_list_cols[i]] = 1
    }

    # Fixed-size columns to expand: the k-th entry of expand_indices is paired
    # with the k-th entry of expand_sizes. expand_cols maps the 1-based column
    # to the number of output columns it is expanded into.
    if (expand_indices != "") {
        num_expand_cols = split(expand_indices, expand_idx_list, ",")
        split(expand_sizes, expand_size_list, ",")
        i = 0
        while (++i <= num_expand_cols) {
            expand_cols[expand_idx_list[i] + 1] = expand_size_list[i]
        }
    }
}
|
|
28
|
+
|
|
29
|
+
# Compute mean, min, max, count and count_zero over a comma-separated list.
#
# Arguments:
#   col_idx    - 1-based column the list came from (accepted for interface
#                compatibility; not used in the computation)
#   values_str - the raw list, e.g. "1,2,.,0.5"
# Remaining parameters are AWK-style local variables and must not be passed.
#
# Returns a tab-separated string "mean\tmin\tmax\tcount\tcount_zero".
# Empty entries, "." and anything that is not a decimal number are ignored.
# When no numeric entry remains the result is ".\t.\t.\t0\t0".
function compute_aggregations(col_idx, values_str,    values, n, i, val, sum, count, min_val, max_val, mean, count_zero) {
    n = split(values_str, values, ",")

    sum = 0
    count = 0
    count_zero = 0
    min_val = ""
    max_val = ""

    for (i = 1; i <= n; i++) {
        val = values[i]
        gsub(/^[ \t]+|[ \t]+$/, "", val)

        # Accept plain decimals ("12", "-3.5", "4."), leading-dot decimals
        # (".5") and scientific notation ("1e-05", "2.5E3"). The pattern needs
        # at least one digit, so "" and "." are rejected here as well.
        # Previously exponent and leading-dot forms were dropped as
        # "non-numeric", which silently skewed every metric.
        if (val !~ /^-?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/) {
            continue
        }

        val = val + 0  # force numeric context for the comparisons below

        if (val == 0) count_zero++

        if (count == 0) {
            # First numeric value seeds both extremes
            min_val = val
            max_val = val
        } else {
            if (val < min_val) min_val = val
            if (val > max_val) max_val = val
        }

        sum += val
        count++
    }

    if (count == 0) {
        return ".\t.\t.\t0\t0"
    }

    mean = sum / count
    return sprintf("%.6f\t%.6f\t%.6f\t%d\t%d", mean, min_val, max_val, count, count_zero)
}
|
|
74
|
+
|
|
75
|
+
# Turn a comma-separated list into exactly `size` tab-separated cells.
#
# Arguments:
#   values_str - the raw list, e.g. "10,3"
#   size       - number of output cells to produce
# Remaining parameters are AWK-style local variables and must not be passed.
#
# Positions beyond the end of the list, empty entries and "." entries are all
# written as "."; entries past `size` are dropped.
function expand_column(values_str, size,    values, n, i, result) {
    n = split(values_str, values, ",")
    result = ""

    for (i = 1; i <= size; i++) {
        # `i > n` is tested first so that missing positions are never looked up
        result = result (i > 1 ? "\t" : "") \
                 ((i > n || values[i] == "" || values[i] == ".") ? "." : values[i])
    }

    return result
}
|
|
91
|
+
|
|
92
|
+
{
    # Rewrite every input line column by column:
    #   - columns registered in list_cols become their five aggregate metrics
    #   - columns registered in expand_cols become their fixed number of cells
    #   - all other columns pass through unchanged
    # list_cols takes precedence when a column is registered in both sets.
    out_line = ""
    for (j = 1; j <= NF; j++) {
        if (j in list_cols) {
            cell = compute_aggregations(j, $j)
        } else if (j in expand_cols) {
            cell = expand_column($j, expand_cols[j])
        } else {
            cell = $j
        }
        out_line = (j == 1) ? cell : out_line "\t" cell
    }
    printf "%s\n", out_line
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2023 Ultima Genomics Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
# DESCRIPTION
|
|
16
|
+
# Add additional feature annotations to featuremap, to be used from single-read SNV qual recalibration
|
|
17
|
+
# CHANGELOG in reverse chronological order
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
from ugbio_core.consts import DEFAULT_FLOW_ORDER
|
|
22
|
+
|
|
23
|
+
from ugbio_featuremap.featuremap_utils import annotate_featuremap
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Build the command-line parser for the annotation tool and parse ``argv``.

    Parameters
    ----------
    argv : list[str]
        Arguments in ``sys.argv`` form; the first element (program name) is skipped.

    Returns
    -------
    argparse.Namespace
        The parsed options.
    """
    # The CLI description is taken from run()'s docstring.
    cli = argparse.ArgumentParser(prog="annotate featuremap", description=run.__doc__)

    cli.add_argument("-i", "--featuremap_path", type=str, required=True, help="input featuremap file")
    cli.add_argument("--ppmSeq_adapter_version", type=str, default=None, help="ppmSeq adapter version")
    cli.add_argument(
        "-o", "--output_featuremap", type=str, required=True, help="Path of annotated featuremap file"
    )
    cli.add_argument("-r", "--ref_fasta", type=str, required=True, help="Reference genome fasta file")

    # Annotation tuning knobs (all optional)
    cli.add_argument("--flow_order", type=str, default=DEFAULT_FLOW_ORDER)
    cli.add_argument("--motif_length_to_annotate", type=int, default=3)
    cli.add_argument("--max_hmer_length", type=int, default=20)
    cli.add_argument(
        "-@",
        "--process_number",
        type=int,
        default=0,
        help="""Number of processes to use for parallelization.
        If N < 1, use all-available - abs(N) cores. Default 0""",
    )

    return cli.parse_args(argv[1:])
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def run(argv):
    """Add additional feature annotations to featuremap"""
    # NOTE: the docstring above doubles as the CLI description (parse_args reads
    # run.__doc__), so it is kept to a single user-facing sentence.
    opts = parse_args(argv)

    # Map each CLI option onto the keyword expected by annotate_featuremap.
    annotation_kwargs = {
        "input_featuremap": opts.featuremap_path,
        "output_featuremap": opts.output_featuremap,
        "ref_fasta": opts.ref_fasta,
        "ppmseq_adapter_version": opts.ppmSeq_adapter_version,
        "flow_order": opts.flow_order,
        "motif_length_to_annotate": opts.motif_length_to_annotate,
        "max_hmer_length": opts.max_hmer_length,
        "process_number": opts.process_number,
    }
    annotate_featuremap(**annotation_kwargs)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main():
    """Console entry point: hand the process command line to :func:`run`."""
    command_line = sys.argv
    run(command_line)


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
from os.path import join as pjoin
|
|
7
|
+
|
|
8
|
+
import pysam
|
|
9
|
+
from ugbio_core.logger import logger
|
|
10
|
+
from ugbio_core.vcf_utils import VcfUtils
|
|
11
|
+
|
|
12
|
+
# Shared VcfUtils instance used by the functions in this module to filter, view and index VCF files.
vu = VcfUtils()
# Paths of intermediate files (VCFs and their .tbi indexes) registered by the functions in this module;
# run() removes every path listed here after the final merged VCF has been written.
created_files: list[str] = []
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def add_single_read_filter(input_vcf: str, output_directory: str, n_threads: int = 1) -> str:
    """
    Write a copy of ``input_vcf`` annotated with the ``SingleRead`` filter.

    The filter is applied through ``vu.filter_vcf`` with the exclude expression
    ``sum(FMT/FILT)<2``. The result is indexed, and both the VCF and its ``.tbi``
    index are registered in ``created_files`` as intermediate files.

    Parameters
    ----------
    input_vcf : str
        Path to the input VCF file.
    output_directory : str
        Directory in which the annotated VCF is written.
    n_threads : int, optional
        Number of threads passed to the filtering step (default is 1).

    Returns
    -------
    str
        Path of the annotated VCF: ``<input basename without .vcf.gz>.with_sr_filter.vcf.gz``
        inside ``output_directory``.
    """
    logger.info("Adding SingleRead filter to the tumor file")

    vcf_stem = os.path.basename(input_vcf).replace(".vcf.gz", "")
    out_add_filter_vcf = pjoin(output_directory, f"{vcf_stem}.with_sr_filter.vcf.gz")

    vu.filter_vcf(
        input_vcf=input_vcf,
        output_vcf=out_add_filter_vcf,
        filter_name="SingleRead",
        exclude_expression="sum(FMT/FILT)<2",
        n_threads=n_threads,
    )
    vu.index_vcf(out_add_filter_vcf)
    logger.info("Adding SingleRead filter to the tumor file: done")

    # Both the VCF and its index are intermediates to be cleaned up later.
    created_files.extend([out_add_filter_vcf, f"{out_add_filter_vcf}.tbi"])
    return out_add_filter_vcf
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def merge_vcf_files(tumor_vcf, normal_vcf, out_merged_vcf, n_cpu: int | None = None):
    """
    Merge tumor and normal VCF files into a single VCF file.

    The two inputs are first merged with ``bcftools merge -m none`` into an
    intermediate ``<out_merged_vcf without .vcf.gz>.full.vcf.gz`` (registered in
    ``created_files`` together with its index). That file is then filtered with
    ``COUNT(FORMAT/RL[0:0]) > 0`` to keep only records present in the first
    (tumor) sample, and the result is written to ``out_merged_vcf`` and indexed.

    Parameters
    ----------
    tumor_vcf : str
        Path to the tumor VCF file with INFO fields moved to FORMAT.
    normal_vcf : str
        Path to the normal VCF file with INFO fields moved to FORMAT.
    out_merged_vcf : str
        Path to the output merged VCF file.
    n_cpu : int, optional
        Number of CPUs to use in merge and view. Defaults to the machine's CPU
        count, or 1 when that cannot be determined.

    Returns
    -------
    str
        Path to the output merged VCF file (``out_merged_vcf``).
    """
    if n_cpu is None:
        # os.cpu_count() may return None; never pass "None" to --threads
        n_cpu = os.cpu_count() or 1

    # Read the first sample name of each input to detect a name clash
    with pysam.VariantFile(tumor_vcf) as tumor_vcf_in, pysam.VariantFile(normal_vcf) as normal_vcf_in:
        tumor_sample_name = list(tumor_vcf_in.header.samples)[0]
        normal_sample_name = list(normal_vcf_in.header.samples)[0]

    # Merging T-N VCF files - this results in records from both tumor and normal VCF files.
    # Only the trailing extension is stripped, so a ".vcf.gz" elsewhere in the path
    # (e.g. in a directory name) is left untouched.
    out_merged_full_vcf = out_merged_vcf.removesuffix(".vcf.gz") + ".full.vcf.gz"
    cmd_merge = [
        "bcftools",
        "merge",
        "--threads",
        str(n_cpu),
        "-m",
        "none",
        "-Oz",
        "-o",
        out_merged_full_vcf,
        tumor_vcf,
        normal_vcf,
    ]
    if tumor_sample_name == normal_sample_name:
        logger.warning(
            f"Tumor and normal VCFs have the same sample name ({tumor_sample_name}). "
            "Using --force-samples to allow merging."
        )
        # Placed right after the "merge" sub-command
        cmd_merge.insert(2, "--force-samples")
    logger.debug(" ".join(cmd_merge))
    subprocess.check_call(cmd_merge)
    vu.index_vcf(out_merged_full_vcf)

    created_files.append(out_merged_full_vcf)
    created_files.append(out_merged_full_vcf + ".tbi")

    # Keep only records that are present in the tumor VCF file
    vu.view_vcf(
        input_vcf=out_merged_full_vcf,
        output_vcf=out_merged_vcf,
        n_threads=n_cpu,
        extra_args="-i 'COUNT(FORMAT/RL[0:0]) > 0'",
    )
    vu.index_vcf(out_merged_vcf)
    return out_merged_vcf
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def __parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Build the command-line parser for the somatic featuremap tool and parse ``argv``.

    Parameters
    ----------
    argv : list of str
        Arguments in ``sys.argv`` form; the first element (program name) is skipped.

    Returns
    -------
    argparse.Namespace
        Parsed arguments.
    """
    # The CLI description is taken from run()'s docstring.
    cli = argparse.ArgumentParser(prog="create_somatic_pileup_featuremap.py", description=run.__doc__)

    # Mandatory string options share the same shape, so register them in one pass.
    mandatory_options = (
        ("--tumor_vcf", "tumor vcf file"),
        ("--normal_vcf", "normal vcf file"),
        ("--sample_name", "sample_name"),
    )
    for flag, help_text in mandatory_options:
        cli.add_argument(flag, help=help_text, required=True, type=str)

    cli.add_argument("--cpu", help="number of CPU to use", required=False, type=int, default=8)
    cli.add_argument(
        "--out_directory",
        help=(
            "out directory where intermediate and output files will be saved."
            " if not supplied all files will be written to current directory"
        ),
        required=False,
        type=str,
        default=".",
    )
    cli.add_argument(
        "--keep-non-pass-tumor-candidates",
        help="If set, the output VCF will also contain non-PASS variants.",
        action="store_true",
        default=False,
    )
    return cli.parse_args(argv[1:])
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def run(argv):
    """
    Merge two VCF files (tumor and normal) into a single VCF file.

    The output VCF file will have tumor records (filtered for SingleRead variants)
    merged with corresponding normal records.
    If the `--keep-non-pass-tumor-candidates` flag is set, non-PASS variants will also be included in the output.

    Parameters
    ----------
    argv : list of str
        Command line arguments.

    Returns
    -------
    None
    """
    # NOTE: the docstring above is also shown as the CLI description (see __parse_args),
    # so implementation notes live in comments rather than in the docstring.
    args = __parse_args(argv)
    logger.setLevel(logging.DEBUG)
    for handler in logger.handlers:
        handler.setLevel(logging.DEBUG)

    logger.info(f"Output directory: {args.out_directory}")

    # Create output directory if it doesn't exist. exist_ok guards against another
    # process creating it between the check and the call.
    if not os.path.exists(args.out_directory):
        os.makedirs(args.out_directory, exist_ok=True)
        logger.info(f"Created output directory: {args.out_directory}")

    # Add SingleRead filter to the tumor VCF file
    out_add_filter_vcf = add_single_read_filter(args.tumor_vcf, args.out_directory, args.cpu)

    # Select which tumor records take part in the merge. Both modes drop records
    # tagged SingleRead; the default mode additionally keeps PASS records only.
    if args.keep_non_pass_tumor_candidates:
        logger.info("including non-PASS tumor variants.")
        tumor_vcf = out_add_filter_vcf.replace(".with_sr_filter.vcf.gz", ".no_sr.vcf.gz")
        view_args = "-i 'FILTER!~\"SingleRead\"'"
        done_message = "No filtering for tumor-PASS variants. Filtering only for SingleRead variants."
    else:
        logger.info("Filtering for tumor-PASS variants only.")
        tumor_vcf = out_add_filter_vcf.replace(".vcf.gz", ".tumor_PASS.vcf.gz")
        view_args = "-f PASS -i 'FILTER!~\"SingleRead\"'"
        done_message = "Filtering for tumor-PASS variants only: done"
    vu.view_vcf(
        input_vcf=out_add_filter_vcf,
        output_vcf=tumor_vcf,
        n_threads=args.cpu,
        extra_args=view_args,
    )
    vu.index_vcf(tumor_vcf)
    logger.info(done_message)
    created_files.append(tumor_vcf)
    created_files.append(tumor_vcf + ".tbi")

    # Strip FILTER annotations from the normal VCF before merging
    unfiltered_normal_vcf = pjoin(
        args.out_directory, os.path.basename(args.normal_vcf).replace(".vcf.gz", ".unfiltered.vcf.gz")
    )
    vu.remove_filter_annotations(args.normal_vcf, unfiltered_normal_vcf, args.cpu)
    vu.index_vcf(unfiltered_normal_vcf)
    created_files.append(unfiltered_normal_vcf)
    created_files.append(unfiltered_normal_vcf + ".tbi")

    # Merge into a temporary VCF. --cpu is forwarded so the merge step honours the
    # requested CPU budget instead of defaulting to every core on the machine.
    out_merged_tmp_vcf = pjoin(args.out_directory, f"{args.sample_name}.tumor_normal.merged.tmp.vcf.gz")
    merge_vcf_files(tumor_vcf, unfiltered_normal_vcf, out_merged_tmp_vcf, n_cpu=args.cpu)
    created_files.append(out_merged_tmp_vcf)
    created_files.append(out_merged_tmp_vcf + ".tbi")

    # Rewrite the merged VCF with an extra ##tumor_sample header line naming the
    # first sample of the merged file.
    out_merged_vcf = out_merged_tmp_vcf.replace(".tmp.vcf.gz", ".vcf.gz")
    with pysam.VariantFile(out_merged_tmp_vcf) as vcf_in:
        tumor_sample_name = list(vcf_in.header.samples)[0]
        header = vcf_in.header.copy()
        header.add_line(f"##tumor_sample={tumor_sample_name}")
        with pysam.VariantFile(out_merged_vcf, "wz", header=header) as vcf_out:
            for rec in vcf_in:
                vcf_out.write(rec)
    vu.index_vcf(out_merged_vcf)
    logger.info(f"Added tumor sample header: ##tumor_sample={tumor_sample_name}")
    logger.info(f"Output merged VCF file: {out_merged_vcf}")

    # Clean up intermediate files (best effort: a failed removal is only logged)
    for f in created_files:
        try:
            os.remove(f)
            logger.debug(f"Removed temporary file: {f}")
        except OSError as e:
            logger.warning(f"Could not remove temporary file {f}: {e}")
    # The registry is module-level state; reset it so a later run() in the same
    # process does not try to delete this run's files again.
    created_files.clear()
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def main():
    """Console entry point: hand the process command line to :func:`run`."""
    command_line = sys.argv
    run(command_line)


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/awk -f
|
|
2
|
+
# AWK script for exploding list format fields in VCF TSV output
|
|
3
|
+
# Usage: awk -v list_indices="3,4,5" -f explode_lists.awk input.tsv
|
|
4
|
+
|
|
5
|
+
BEGIN {
    # list_indices carries 0-based column positions; AWK fields are 1-based,
    # so each position is shifted by one before being added to the list_cols set.
    num_indices = split(list_indices, indices, ",")
    for (idx_pos = 1; idx_pos <= num_indices; idx_pos++) {
        list_cols[indices[idx_pos] + 1] = 1
    }
}
|
|
12
|
+
|
|
13
|
+
{
    # Explode one input line into max_len output lines, where max_len is the
    # length of the longest list among the list columns (at least 1).
    # Output line i carries the i-th element of every list column; non-list
    # columns are repeated unchanged. Lists shorter than max_len are padded
    # with ".".

    # Pass 1: split every list column and find the longest list.
    # Columns are walked by field number, so the result does not depend on the
    # unspecified order of `for (x in array)` nor on comparing a string
    # subscript with NF. Elements are stored under the portable "j, k"
    # (SUBSEP) key instead of gawk-only nested arrays.
    max_len = 1
    for (j = 1; j <= NF; j++) {
        if (j in list_cols) {
            n = split($j, values, ",")
            list_len[j] = n
            if (n > max_len) {
                max_len = n
            }
            for (k = 1; k <= n; k++) {
                lists[j, k] = values[k]
            }
        }
    }

    # Pass 2: emit the exploded rows. Padding is decided here, once the final
    # max_len is known, so a short list is padded even when a longer list
    # appears in a later column.
    for (i = 1; i <= max_len; i++) {
        for (j = 1; j <= NF; j++) {
            if (j in list_cols) {
                printf "%s", (i <= list_len[j]) ? lists[j, i] : "."
            } else {
                printf "%s", $j
            }
            if (j < NF) printf "\t"
        }
        printf "\n"
    }

    # Clear per-row state before the next input line
    delete lists
    delete list_len
}
|