ugbio-featuremap 1.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. ugbio_featuremap-1.20.0/PKG-INFO +15 -0
  2. ugbio_featuremap-1.20.0/README.featuremap.md +3 -0
  3. ugbio_featuremap-1.20.0/pyproject.toml +45 -0
  4. ugbio_featuremap-1.20.0/setup.cfg +4 -0
  5. ugbio_featuremap-1.20.0/ugbio_featuremap/__init__.py +0 -0
  6. ugbio_featuremap-1.20.0/ugbio_featuremap/aggregate_lists.awk +121 -0
  7. ugbio_featuremap-1.20.0/ugbio_featuremap/annotate_featuremap.py +88 -0
  8. ugbio_featuremap-1.20.0/ugbio_featuremap/create_somatic_featuremap.py +260 -0
  9. ugbio_featuremap-1.20.0/ugbio_featuremap/explode_lists.awk +54 -0
  10. ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_consensus_utils.py +441 -0
  11. ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_to_dataframe.py +1343 -0
  12. ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_utils.py +60 -0
  13. ugbio_featuremap-1.20.0/ugbio_featuremap/featuremap_xgb_prediction.py +572 -0
  14. ugbio_featuremap-1.20.0/ugbio_featuremap/filter_dataframe.py +762 -0
  15. ugbio_featuremap-1.20.0/ugbio_featuremap/integrate_mpileup_to_sfm.py +327 -0
  16. ugbio_featuremap-1.20.0/ugbio_featuremap/pileup_featuremap.py +143 -0
  17. ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_fields_transformation.py +767 -0
  18. ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_inference_utils.py +88 -0
  19. ugbio_featuremap-1.20.0/ugbio_featuremap/somatic_featuremap_utils.py +124 -0
  20. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/PKG-INFO +15 -0
  21. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/SOURCES.txt +23 -0
  22. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/dependency_links.txt +1 -0
  23. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/entry_points.txt +8 -0
  24. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/requires.txt +3 -0
  25. ugbio_featuremap-1.20.0/ugbio_featuremap.egg-info/top_level.txt +1 -0
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: ugbio_featuremap
3
+ Version: 1.20.0
4
+ Summary: Ultima Genomics FeatureMap utils
5
+ Author-email: Itai Rusinek <itai.rusinek@ultimagen.com>, Gat Krieger <gat.krieger@ultimagen.com>, Avigail Moldovan <avigail.moldovan@ultimagen.com>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: ugbio_core[ml,vcfbed]
10
+ Requires-Dist: ugbio_ppmseq
11
+ Requires-Dist: polars>=1.27.1
12
+
13
+ # ugbio_featuremap
14
+
15
+ This module includes featuremap python scripts and utils for bioinformatics pipelines.
@@ -0,0 +1,3 @@
1
+ # ugbio_featuremap
2
+
3
+ This module includes featuremap python scripts and utils for bioinformatics pipelines.
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "ugbio_featuremap"
3
+ version = "1.20.0"
4
+ requires-python = ">=3.11"
5
+ dependencies = [
6
+ "ugbio_core[vcfbed,ml]",
7
+ "ugbio_ppmseq",
8
+ "polars>=1.27.1",
9
+ ]
10
+ description = "Ultima Genomics FeatureMap utils"
11
+ authors = [
12
+ { name = "Itai Rusinek", email = "itai.rusinek@ultimagen.com" },
13
+ { name = "Gat Krieger", email = "gat.krieger@ultimagen.com" },
14
+ { name = "Avigail Moldovan", email = "avigail.moldovan@ultimagen.com" },
15
+ ]
16
+ readme = "README.featuremap.md"
17
+
18
+ [project.license]
19
+ text = "Apache-2.0"
20
+
21
+ [project.scripts]
22
+ run_tests = "pytest:main"
23
+ featuremap_to_dataframe = "ugbio_featuremap.featuremap_to_dataframe:main"
24
+ filter_featuremap = "ugbio_featuremap.filter_dataframe:main"
25
+ add_aggregate_params_and_xgb_score_to_pileup_featuremap = "ugbio_featuremap.featuremap_xgb_prediction:main"
26
+ create_somatic_featuremap = "ugbio_featuremap.create_somatic_featuremap:main"
27
+ integrate_mpileup_to_sfm = "ugbio_featuremap.integrate_mpileup_to_sfm:main"
28
+ somatic_featuremap_fields_transformation = "ugbio_featuremap.somatic_featuremap_fields_transformation:main"
29
+
30
+ [tool.uv.sources.ugbio_core]
31
+ workspace = true
32
+
33
+ [tool.uv.sources.ugbio_ppmseq]
34
+ workspace = true
35
+
36
+ [tool.setuptools.package-data]
37
+ ugbio_featuremap = [
38
+ "*.awk",
39
+ ]
40
+
41
+ [build-system]
42
+ requires = [
43
+ "setuptools>=61.0",
44
+ ]
45
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/awk -f
2
+ # AWK script for computing aggregation metrics (mean, min, max, count, count_zero) for list format fields in VCF TSV output
3
+ # Also supports expanding fixed-size columns (e.g., AD -> AD_0, AD_1)
4
+ #
5
+ # Usage:
6
+ # awk -v list_indices="3,4,5" -f aggregate_lists.awk input.tsv
7
+ # awk -v list_indices="3,4" -v expand_indices="5" -v expand_sizes="2" -f aggregate_lists.awk input.tsv
8
+
9
BEGIN {
    # list_indices: comma-separated 0-based column numbers whose values are
    # comma-separated lists to be replaced by aggregate metrics.
    # AWK fields are 1-based, hence the "+ 1" shift.
    num_list_cols = split(list_indices, indices, ",")
    i = 1
    while (i <= num_list_cols) {
        col_idx = indices[i] + 1
        list_cols[col_idx] = 1           # membership set used by the record rule
        ordered_list_cols[i] = col_idx   # same columns, in the order given
        i++
    }

    # expand_indices / expand_sizes: parallel comma-separated lists giving the
    # 0-based columns to expand and how many output columns each one becomes.
    if (expand_indices != "") {
        num_expand_cols = split(expand_indices, expand_idx_list, ",")
        split(expand_sizes, expand_size_list, ",")
        i = 1
        while (i <= num_expand_cols) {
            col_idx = expand_idx_list[i] + 1
            expand_cols[col_idx] = expand_size_list[i]
            i++
        }
    }
}
28
+
29
# Aggregate a comma-separated list of numbers into summary metrics.
#
# Parameters:
#   col_idx    - 1-based column number of the list (unused; kept so existing
#                callers keep working).
#   values_str - the raw field text, e.g. "1,2,.,3".
# Remaining parameters are AWK locals.
#
# Returns a tab-separated string "mean\tmin\tmax\tcount\tcount_zero".
# Elements that are empty, "." or not a decimal number are ignored; when no
# numeric element remains the result is ".\t.\t.\t0\t0".
function compute_aggregations(col_idx, values_str,    values, n, i, val, sum, count, min_val, max_val, mean, count_zero) {
    n = split(values_str, values, ",")

    sum = 0
    count = 0
    count_zero = 0
    min_val = ""
    max_val = ""

    for (i = 1; i <= n; i++) {
        val = values[i]
        gsub(/^[ \t]+|[ \t]+$/, "", val)   # trim surrounding blanks

        # Accept plain decimals, leading-dot decimals (".5") and scientific
        # notation ("1e-05"); the latter two used to be dropped silently.
        # "" and "." never match because at least one digit is required.
        if (val !~ /^-?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/) {
            continue
        }

        val = val + 0   # force numeric so the comparisons below are numeric

        if (val == 0) count_zero++

        if (count == 0) {
            min_val = val
            max_val = val
        } else {
            if (val < min_val) min_val = val
            if (val > max_val) max_val = val
        }

        sum += val
        count++
    }

    if (count == 0) {
        return ".\t.\t.\t0\t0"
    }

    mean = sum / count
    return sprintf("%.6f\t%.6f\t%.6f\t%d\t%d", mean, min_val, max_val, count, count_zero)
}
74
+
75
# Turn a comma-separated field into exactly `size` tab-separated cells.
# Missing, empty or "." elements are emitted as "."; elements beyond `size`
# are dropped. Parameters after `size` are AWK locals.
function expand_column(values_str, size,    parts, n_parts, k, out, cell) {
    n_parts = split(values_str, parts, ",")
    out = ""

    for (k = 1; k <= size; k++) {
        cell = "."
        if (k <= n_parts && parts[k] != "" && parts[k] != ".") {
            cell = parts[k]
        }
        out = out (k > 1 ? "\t" : "") cell
    }

    return out
}
91
+
92
{
    # Emit every input column in order, tab-separated:
    #   - list columns become "mean, min, max, count, count_zero"
    #   - expand columns become their fixed number of individual cells
    #   - all other columns are copied through unchanged
    for (j = 1; j <= NF; j++) {
        if (j > 1) {
            printf "\t"
        }
        if (j in list_cols) {
            printf "%s", compute_aggregations(j, $j)
        } else if (j in expand_cols) {
            printf "%s", expand_column($j, expand_cols[j])
        } else {
            printf "%s", $j
        }
    }
    printf "\n"
}
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2023 Ultima Genomics Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # DESCRIPTION
16
+ # Add additional feature annotations to featuremap, to be used from single-read SNV qual recalibration
17
+ # CHANGELOG in reverse chronological order
18
+ import argparse
19
+ import sys
20
+
21
+ from ugbio_core.consts import DEFAULT_FLOW_ORDER
22
+
23
+ from ugbio_featuremap.featuremap_utils import annotate_featuremap
24
+
25
+
26
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the annotate-featuremap command line.

    Parameters
    ----------
    argv : list[str]
        Full argument vector; the first element (program name) is skipped.

    Returns
    -------
    argparse.Namespace
        Parsed options: ``featuremap_path``, ``ppmSeq_adapter_version``,
        ``output_featuremap``, ``ref_fasta``, ``flow_order``,
        ``motif_length_to_annotate``, ``max_hmer_length`` and ``process_number``.
    """
    # The description shown by --help is taken from run()'s docstring.
    cli = argparse.ArgumentParser(prog="annotate featuremap", description=run.__doc__)
    add = cli.add_argument

    # Input / output paths.
    add("-i", "--featuremap_path", type=str, required=True, help="input featuremap file")
    add("--ppmSeq_adapter_version", type=str, default=None, help="ppmSeq adapter version")
    add("-o", "--output_featuremap", type=str, required=True, help="Path of annotated featuremap file")
    add("-r", "--ref_fasta", type=str, required=True, help="Reference genome fasta file")

    # Annotation parameters.
    add("--flow_order", type=str, default=DEFAULT_FLOW_ORDER)
    add("--motif_length_to_annotate", type=int, default=3)
    add("--max_hmer_length", type=int, default=20)

    # Parallelism.
    add(
        "-@",
        "--process_number",
        type=int,
        default=0,
        help="""Number of processes to use for parallelization.
             If N < 1, use all-available - abs(N) cores. Default 0""",
    )

    user_args = argv[1:]
    return cli.parse_args(user_args)
65
+
66
+
67
def run(argv):
    """Add additional feature annotations to featuremap"""
    # NOTE: the docstring above doubles as the argparse description, so it is
    # kept as a single short sentence.
    options = parse_args(argv)

    # Map each CLI option onto the keyword expected by annotate_featuremap.
    annotate_kwargs = {
        "input_featuremap": options.featuremap_path,
        "output_featuremap": options.output_featuremap,
        "ref_fasta": options.ref_fasta,
        "ppmseq_adapter_version": options.ppmSeq_adapter_version,
        "flow_order": options.flow_order,
        "motif_length_to_annotate": options.motif_length_to_annotate,
        "max_hmer_length": options.max_hmer_length,
        "process_number": options.process_number,
    }
    annotate_featuremap(**annotate_kwargs)
81
+
82
+
83
def main():
    """Entry point: run the annotation with the process's own arguments."""
    process_argv = sys.argv
    run(process_argv)


if __name__ == "__main__":
    main()
@@ -0,0 +1,260 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ from os.path import join as pjoin
7
+
8
+ import pysam
9
+ from ugbio_core.logger import logger
10
+ from ugbio_core.vcf_utils import VcfUtils
11
+
12
+ vu = VcfUtils()
13
+ created_files = []
14
+
15
+
16
def add_single_read_filter(input_vcf: str, output_directory: str, n_threads: int = 1) -> str:
    """
    Write a copy of a VCF annotated with the ``SingleRead`` filter and index it.

    The output is named after the input (``.vcf.gz`` removed) with the suffix
    ``.with_sr_filter.vcf.gz``. The new file and its ``.tbi`` index are
    registered in the module-level ``created_files`` list.

    Parameters
    ----------
    input_vcf : str
        Path to the input VCF file.
    output_directory : str
        Directory in which the filtered VCF is written.
    n_threads : int, optional
        Number of threads passed to the filtering step (default is 1).

    Returns
    -------
    str
        Path to the output VCF file with the SingleRead filter applied.
    """
    logger.info("Adding SingleRead filter to the tumor file")

    base_name = os.path.basename(input_vcf).replace(".vcf.gz", "")
    filtered_vcf = pjoin(output_directory, base_name + ".with_sr_filter.vcf.gz")

    # The filter is applied via the expression "sum(FMT/FILT)<2".
    filter_options = {
        "input_vcf": input_vcf,
        "output_vcf": filtered_vcf,
        "filter_name": "SingleRead",
        "exclude_expression": "sum(FMT/FILT)<2",
        "n_threads": n_threads,
    }
    vu.filter_vcf(**filter_options)
    vu.index_vcf(filtered_vcf)
    logger.info("Adding SingleRead filter to the tumor file: done")

    # Remember both outputs so the caller's cleanup pass can remove them.
    created_files.extend([filtered_vcf, f"{filtered_vcf}.tbi"])
    return filtered_vcf
50
+
51
+
52
def merge_vcf_files(tumor_vcf, normal_vcf, out_merged_vcf, n_cpu: int | None = None):
    """
    Merge tumor and normal VCF files into a single VCF file.

    Runs ``bcftools merge -m none`` on the two inputs into an intermediate
    ``*.full.vcf.gz`` file, then keeps only the records matching
    ``COUNT(FORMAT/RL[0:0]) > 0`` (i.e. records with RL values for the first
    sample) and indexes the result. The intermediate file and its index are
    registered in the module-level ``created_files`` list.

    Parameters
    ----------
    tumor_vcf : str
        Path to the tumor VCF file; listed first in the merge.
    normal_vcf : str
        Path to the normal VCF file.
    out_merged_vcf : str
        Path to the output merged VCF file.
    n_cpu : int, optional
        Number of threads for the merge and view steps. Defaults to the number
        of CPUs reported by the OS (1 if that cannot be determined).

    Returns
    -------
    str
        Path to the output merged VCF file (``out_merged_vcf``).

    Raises
    ------
    subprocess.CalledProcessError
        If ``bcftools merge`` exits with a non-zero status.
    """
    if n_cpu is None:
        # os.cpu_count() may return None; never pass "--threads None" to bcftools.
        n_cpu = os.cpu_count() or 1

    # Read the first sample name of each input to detect a name clash.
    with pysam.VariantFile(tumor_vcf) as tumor_vcf_in, pysam.VariantFile(normal_vcf) as normal_vcf_in:
        tumor_sample_name = list(tumor_vcf_in.header.samples)[0]
        normal_sample_name = list(normal_vcf_in.header.samples)[0]

    # Merging T-N VCF files - this results with records from both tumor and normal VCF files
    out_merged_full_vcf = out_merged_vcf.replace(".vcf.gz", "") + ".full.vcf.gz"
    cmd_merge = ["bcftools", "merge"]
    if tumor_sample_name == normal_sample_name:
        logger.warning(
            f"Tumor and normal VCFs have the same sample name ({tumor_sample_name}). "
            "Using --force-samples to allow merging."
        )
        cmd_merge.append("--force-samples")
    cmd_merge += [
        "--threads",
        str(n_cpu),
        "-m",
        "none",
        "-Oz",
        "-o",
        out_merged_full_vcf,
        tumor_vcf,
        normal_vcf,
    ]
    logger.debug(" ".join(cmd_merge))
    subprocess.check_call(cmd_merge)
    vu.index_vcf(out_merged_full_vcf)

    created_files.append(out_merged_full_vcf)
    created_files.append(out_merged_full_vcf + ".tbi")

    # Keep only records that are present in the tumor VCF file
    vu.view_vcf(
        input_vcf=out_merged_full_vcf,
        output_vcf=out_merged_vcf,
        n_threads=n_cpu,
        extra_args="-i 'COUNT(FORMAT/RL[0:0]) > 0'",
    )
    vu.index_vcf(out_merged_vcf)
    return out_merged_vcf
116
+
117
+
118
def __parse_args(argv: list[str]) -> argparse.Namespace:
    """
    Parse the command line of the somatic featuremap creation tool.

    Parameters
    ----------
    argv : list of str
        Full argument vector; the first element (program name) is skipped.

    Returns
    -------
    argparse.Namespace
        Parsed arguments: ``tumor_vcf``, ``normal_vcf``, ``sample_name``,
        ``cpu``, ``out_directory`` and ``keep_non_pass_tumor_candidates``.
    """
    # The --help description is taken from run()'s docstring.
    cli = argparse.ArgumentParser(
        prog="create_somatic_pileup_featuremap.py",
        description=run.__doc__,
    )
    add = cli.add_argument

    # Required inputs.
    add("--tumor_vcf", help="tumor vcf file", required=True, type=str)
    add("--normal_vcf", help="normal vcf file", required=True, type=str)
    add("--sample_name", help="sample_name", required=True, type=str)

    # Optional settings.
    add("--cpu", help="number of CPU to use", required=False, type=int, default=8)
    add(
        "--out_directory",
        help="out directory where intermediate and output files will be saved."
        " if not supplied all files will be written to current directory",
        required=False,
        type=str,
        default=".",
    )
    add(
        "--keep-non-pass-tumor-candidates",
        help="If set, the output VCF will also contain non-PASS variants.",
        action="store_true",
        default=False,
    )

    user_args = argv[1:]
    return cli.parse_args(user_args)
155
+
156
+
157
def run(argv):
    """
    Merge two VCF files (tumor and normal) into a single VCF file.

    The output VCF file will have tumor records (filtered for SingleRead variants)
    merged with corresponding normal records.
    If the `--keep-non-pass-tumor-candidates` flag is set, non-PASS variants will also be included in the output.

    Parameters
    ----------
    argv : list of str
        Command line arguments.

    Returns
    -------
    None
    """
    args = __parse_args(argv)
    logger.setLevel(logging.DEBUG)
    for handler in logger.handlers:
        handler.setLevel(logging.DEBUG)

    logger.info(f"Output directory: {args.out_directory}")

    # Create output directory if it doesn't exist. exist_ok avoids failing when
    # another process creates it between the check and the call.
    if not os.path.exists(args.out_directory):
        os.makedirs(args.out_directory, exist_ok=True)
        logger.info(f"Created output directory: {args.out_directory}")

    # Add SingleRead filter to the tumor VCF file
    out_add_filter_vcf = add_single_read_filter(args.tumor_vcf, args.out_directory, args.cpu)

    # Select tumor candidates: always drop SingleRead records, and unless the
    # user asked otherwise keep PASS records only.
    if args.keep_non_pass_tumor_candidates:
        logger.info("including non-PASS tumor variants.")
        tumor_vcf = out_add_filter_vcf.replace(".with_sr_filter.vcf.gz", ".no_sr.vcf.gz")
        view_args = "-i 'FILTER!~\"SingleRead\"'"
        done_message = "No filtering for tumor-PASS variants. Filtering only for SingleRead variants."
    else:
        logger.info("Filtering for tumor-PASS variants only.")
        tumor_vcf = out_add_filter_vcf.replace(".vcf.gz", ".tumor_PASS.vcf.gz")
        view_args = "-f PASS -i 'FILTER!~\"SingleRead\"'"
        done_message = "Filtering for tumor-PASS variants only: done"
    vu.view_vcf(
        input_vcf=out_add_filter_vcf,
        output_vcf=tumor_vcf,
        n_threads=args.cpu,
        extra_args=view_args,
    )
    vu.index_vcf(tumor_vcf)
    logger.info(done_message)
    created_files.append(tumor_vcf)
    created_files.append(tumor_vcf + ".tbi")

    # Strip filter annotations from the normal VCF before merging.
    unfiltered_normal_vcf = pjoin(
        args.out_directory, os.path.basename(args.normal_vcf).replace(".vcf.gz", ".unfiltered.vcf.gz")
    )
    vu.remove_filter_annotations(args.normal_vcf, unfiltered_normal_vcf, args.cpu)
    vu.index_vcf(unfiltered_normal_vcf)
    created_files.append(unfiltered_normal_vcf)
    created_files.append(unfiltered_normal_vcf + ".tbi")

    # Merge into a temporary file; honour --cpu here too (previously the merge
    # silently used every CPU on the machine).
    out_merged_tmp_vcf = pjoin(args.out_directory, f"{args.sample_name}.tumor_normal.merged.tmp.vcf.gz")
    merge_vcf_files(tumor_vcf, unfiltered_normal_vcf, out_merged_tmp_vcf, n_cpu=args.cpu)
    created_files.append(out_merged_tmp_vcf)
    created_files.append(out_merged_tmp_vcf + ".tbi")

    # Rewrite the merged VCF with an extra "##tumor_sample=<name>" header line,
    # where <name> is the first sample of the merged file.
    out_merged_vcf = out_merged_tmp_vcf.replace(".tmp.vcf.gz", ".vcf.gz")
    with pysam.VariantFile(out_merged_tmp_vcf) as vcf_in:
        tumor_sample_name = list(vcf_in.header.samples)[0]
        header = vcf_in.header.copy()
        header.add_line(f"##tumor_sample={tumor_sample_name}")
        with pysam.VariantFile(out_merged_vcf, "wz", header=header) as vcf_out:
            for rec in vcf_in:
                vcf_out.write(rec)
    vu.index_vcf(out_merged_vcf)
    logger.info(f"Added tumor sample header: ##tumor_sample={tumor_sample_name}")
    logger.info(f"Output merged VCF file: {out_merged_vcf}")

    # Clean up intermediate files (best effort: a failed removal is only logged).
    for f in created_files:
        try:
            os.remove(f)
            logger.debug(f"Removed temporary file: {f}")
        except OSError as e:
            logger.warning(f"Could not remove temporary file {f}: {e}")
253
+
254
+
255
def main():
    """Entry point: run the tumor/normal merge with the process's own arguments."""
    process_argv = sys.argv
    run(process_argv)


if __name__ == "__main__":
    main()
@@ -0,0 +1,54 @@
1
+ #!/usr/bin/awk -f
2
+ # AWK script for exploding list format fields in VCF TSV output
3
+ # Usage: awk -v list_indices="3,4,5" -f explode_lists.awk input.tsv
4
+
5
BEGIN {
    # list_indices holds comma-separated 0-based column numbers; AWK fields
    # are 1-based, so each one is shifted by one before being recorded.
    n_idx = split(list_indices, indices, ",")
    for (k = 1; k <= n_idx; k++) {
        list_cols[indices[k] + 1] = 1
    }
}
12
+
13
{
    # Explode one input row into max_len output rows, where max_len is the
    # longest list among the list columns (at least 1). Non-list columns are
    # repeated on every output row; list columns shorter than max_len are
    # padded with ".".

    # Pass 1: split every list column and record its length. Padding is
    # decided only after ALL columns are known, so the result no longer depends
    # on the (undefined) for-in iteration order.
    max_len = 1
    for (col_idx in list_cols) {
        # for-in yields string subscripts; "+ 0" makes the comparison with NF
        # numeric instead of lexical (e.g. "5" vs 12).
        col = col_idx + 0
        if (col > NF) {
            continue
        }
        n = split($col, values, ",")
        list_len[col] = n
        if (n > max_len) {
            max_len = n
        }
        for (j = 1; j <= n; j++) {
            lists[col][j] = values[j]
        }
    }

    # Pass 2: output exploded rows.
    for (i = 1; i <= max_len; i++) {
        for (j = 1; j <= NF; j++) {
            if (j in list_cols) {
                printf "%s", (i <= list_len[j] ? lists[j][i] : ".")
            } else {
                printf "%s", $j
            }
            if (j < NF) printf "\t"
        }
        printf "\n"
    }

    # Clear per-row state for the next record
    delete lists
    delete list_len
}