split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import logging
25
+ import re
26
+ import sys
27
+ from typing import List, Tuple
28
+
29
+ from Bio.Restriction import RestrictionBatch
30
+ from Bio.Seq import Seq
31
+
32
+ logging.basicConfig(level=logging.INFO)
33
+
34
+
35
+ ############################### Common part #################################
36
+
37
+
38
def case_adaptation(List_Enzyme):
    """
    Map enzyme names to their canonical capitalisation.

    Every entry is converted to ``str`` and stripped of surrounding
    whitespace, then looked up case-insensitively in a table of known
    aliases. The ``arima`` alias expands to two enzymes. Names that are not
    in the table are returned as typed, minus the surrounding whitespace.

    Examples
    --------
    >>> case_adaptation(["hindiii"])
    ['HindIII']
    >>> case_adaptation(["dpnii", "bglii", "mboi"])
    ['DpnII', 'BglII', 'MboI']
    >>> case_adaptation(["arima"])
    ['DpnII', 'HinfI']
    >>> case_adaptation([" Foo ", "arima", "DpnII"])
    ['Foo', 'DpnII', 'HinfI', 'DpnII']
    """
    canonical = {
        "hindiii": ("HindIII",),
        "dpnii": ("DpnII",),
        "bglii": ("BglII",),
        "mboi": ("MboI",),
        # Double enzyme
        "arima": ("DpnII", "HinfI"),
    }

    adapted = []
    for raw in List_Enzyme:
        name = str(raw).strip()
        adapted.extend(canonical.get(name.lower(), (name,)))
    return adapted
74
+
75
+
76
def find_liga_sites(
    List_Enzyme: List[str], borderless: bool = False
) -> List[Tuple[re.Pattern, int]]:
    """
    Build the ligation-site patterns for a list of restriction enzymes.

    For every enzyme the "give" site (everything before the reverse cut
    marker ``_``) and the "accept" site (everything after the forward cut
    marker ``^``) are extracted from ``enz.elucidate()``. Every give/accept
    combination is then joined into a ligation site, with ``N`` turned into
    the regex wildcard ``.``. When a ligation site differs from its reverse
    complement, the reverse complement is added as a second pattern.

    Parameters:
    List_Enzyme (List[str]): A list of enzymes for which to find the ligation
    sites.

    borderless (bool, optional): If True, the total length of the give and
    accept sites is used. If False, the length of the give site is used for
    the forward pattern and the length of the accept site for the reverse
    complement pattern. Default is False.

    Returns:
    List[Tuple[re.Pattern, int]]: A list of tuples, where each tuple contains
    a compiled regular expression pattern for the ligation site and the
    length associated with it.

    Raises:
    ValueError: If the elucidated site of an enzyme does not contain both cut
    markers (``^`` and ``_``), so no give/accept site can be derived.

    Examples
    --------
    >>> out = find_liga_sites(["DpnII"])
    >>> isinstance(out, list) and len(out) >= 1
    True
    >>> any(p.pattern == "GATCGATC" and off == 4 for p, off in out)
    True

    >>> out_b = find_liga_sites(["DpnII"], borderless=True)
    >>> any(p.pattern == "GATCGATC" and off == 8 for p, off in out_b)
    True

    # This function is inspired by and adapted from the Cutsite function in Hicstuff
    # (https://github.com/koszullab/hicstuff), originally under BSD license.
    # See https://github.com/koszullab/hicstuff/blob/main/LICENSE for the full license.
    """
    give_list = []
    accept_list = []

    for enz in RestrictionBatch(List_Enzyme):
        site = enz.elucidate()
        fw_cut = site.find("^")
        rev_cut = site.find("_")

        # str.find returns -1 when a marker is missing; slicing with -1 would
        # silently drop or keep the wrong characters, so fail loudly instead.
        if fw_cut == -1 or rev_cut == -1:
            raise ValueError(
                f"Cannot locate both cut positions for enzyme "
                f"{enz.__name__}: {site!r}"
            )

        # Purify give site: drop the forward marker and leading N padding.
        give_list.append(site[:rev_cut].replace("^", "").lstrip("N"))

        # Purify accept site: drop the reverse marker and trailing N padding.
        accept_list.append(site[fw_cut + 1 :].replace("_", "").rstrip("N"))

    ligation_site_list = []

    # Find ligation site
    for give_site in give_list:
        for accept_site in accept_list:
            ligation_site = (give_site + accept_site).replace("N", ".")
            total_length = len(give_site) + len(accept_site)

            # Use total length for borderless, give-site length otherwise.
            length = total_length if borderless else len(give_site)
            ligation_site_list.append((re.compile(ligation_site), length))

            # If ligation site is not palindromic, also search its reverse
            # complement.
            reverse_complement_site = str(Seq(ligation_site).reverse_complement())
            if ligation_site != reverse_complement_site:
                # Use length of accept site for the reverse complement site.
                length = total_length if borderless else len(accept_site)
                ligation_site_list.append(
                    (re.compile(reverse_complement_site), length)
                )

    return ligation_site_list
162
+
163
+
164
def search_in_database(enzymes, borderless=False):
    """
    Look up the requested enzymes and return their ligation sites.

    ``enzymes`` is a comma-separated string of enzyme names. The special
    value ``"No restriction enzyme found"`` is printed and the process exits
    with status 0. Otherwise the names are normalised with
    ``case_adaptation``, resolved with ``find_liga_sites``, printed, and
    returned. Any failure during the lookup is re-raised as ``RuntimeError``.

    Examples
    --------
    >>> import io, contextlib
    >>> buf = io.StringIO()
    >>> with contextlib.redirect_stdout(buf):
    ...     out = search_in_database("DpnII")
    >>> isinstance(out, list) and len(out) >= 1
    True
    >>> any(p.pattern == "GATCGATC" and off == 4 for p, off in out)
    True

    >>> buf = io.StringIO()
    >>> with contextlib.redirect_stdout(buf):
    ...     out = search_in_database("DpnII", borderless=True)
    >>> any(p.pattern == "GATCGATC" and off == 8 for p, off in out)
    True
    >>> "Mode Borderless" in buf.getvalue()
    True

    >>> buf = io.StringIO()
    >>> try:
    ...     with contextlib.redirect_stdout(buf):
    ...         search_in_database("No restriction enzyme found")
    ... except SystemExit as e:
    ...     code = e.code
    >>> code
    0
    >>> "No restriction enzyme found" in buf.getvalue()
    True
    """
    # Sentinel value: report it and stop the program cleanly.
    if enzymes == "No restriction enzyme found":
        print(enzymes)
        sys.exit(0)

    if borderless:
        print("Mode Borderless")

    requested = enzymes.split(",")
    try:
        sites = find_liga_sites(case_adaptation(requested), borderless)
        # Several sites: show each compiled pattern; otherwise show the only
        # (pattern, length) entry as a whole.
        if len(sites) > 1:
            shown = [entry[0] for entry in sites]
        else:
            shown = [sites[0]]
        for item in shown:
            print(f"Ligation sites: {item}", flush=True)
        return sites
    except Exception as e:
        raise RuntimeError(
            f"Error in enzyme identification for input={enzymes!r}"
        ) from e
217
+
218
+
219
+ def _split_two(total: int) -> tuple[int, int]:
220
+ """
221
+ Partage total en deux parts entières, différence ≤ 1, chacune ≥ 1.
222
+ Examples
223
+ --------
224
+ >>> _split_two(2)
225
+ (1, 1)
226
+ >>> _split_two(5)
227
+ (2, 3)
228
+ >>> _split_two(8)
229
+ (4, 4)
230
+ >>> _split_two(1)
231
+ Traceback (most recent call last):
232
+ ...
233
+ ValueError: total doit être ≥ 2, ici 1
234
+ """
235
+ if total < 2:
236
+ raise ValueError(f"total doit être ≥ 2, ici {total}")
237
+ a = total // 2
238
+ b = total - a
239
+ return max(1, a), max(1, b)
240
+
241
+
242
def partition_threads(
    num_threads: int, oversubscribe_factor: float = 1.35
) -> tuple[int, int, int]:
    """
    Return (TRead_total, TFrag, TWrite_total), with TRead/TWrite always EVEN.

    Minimum tier of 5 threads: 1 reader per stream, 1 fragmenter, 1 writer
    per stream. Extra threads are handed out in pairs, first to writing, then
    to reading, alternating. An odd leftover goes to fragmentation.

    Overallocation added (the program doesn't use 1 CPU for 1 thread): the
    same distribution is applied again to the difference between
    ``floor(num_threads * oversubscribe_factor)`` and the nominal total.

    Raises ``ValueError`` when ``num_threads`` is below 5.
    """
    import math

    if num_threads < 5:
        raise ValueError(f"num_threads doit être ≥ 5, ici {num_threads}")

    def _deal_pairs(budget):
        # Hand out `budget` two at a time, alternating write/read and
        # starting with write; return both counts and what is left over.
        to_write = 0
        to_read = 0
        write_next = True
        while budget >= 2:
            if write_next:
                to_write += 1
            else:
                to_read += 1
            write_next = not write_next
            budget -= 2
        return to_write, to_read, budget

    # Base: 1 per stream for reading/writing, 1 for fragmentation.
    extra_write, extra_read, leftover = _deal_pairs(num_threads - 5)
    tread = 2 * (1 + extra_read)
    twrite = 2 * (1 + extra_write)
    tfrag = 2 if leftover == 1 else 1

    nominal_total = tread + twrite + tfrag
    target_total = math.floor(num_threads * oversubscribe_factor)

    # Overallocation goes preferentially to writing, then reading, in pairs.
    over_write, over_read, leftover = _deal_pairs(target_total - nominal_total)
    twrite += 2 * over_write
    tread += 2 * over_read

    # Odd leftover goes to fragmentation.
    if leftover == 1:
        tfrag += 1

    return tread, tfrag, twrite
split3c/resite/read.py ADDED
@@ -0,0 +1,91 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import logging
25
+ import subprocess
26
+
27
+ logging.basicConfig(level=logging.INFO)
28
+
29
+
30
def stop_signal(Queue, NumThreadFragmentation):
    """
    Put one ``None`` stop marker on ``Queue`` per fragmentation thread.
    """
    for marker in [None] * NumThreadFragmentation:
        Queue.put(marker)
36
+
37
+
38
def read_fastq_gzip_simultaneously(
    fileA: str, fileB: str, Queue, num_threads, NumThreadFragmentation
):
    """
    Read two gzipped FASTQ files in lock step and queue the read pairs.

    Both files are decompressed with ``pigz``; ``num_threads`` is split
    between the two processes. Records are read four lines at a time from
    each stream and grouped as ``[[nameA, nameB], [seqA, seqB],
    [qualA, qualB]]``. Groups are put on ``Queue`` in blocks, and reading
    stops as soon as either stream yields an empty sequence line.

    Errors raised while reading are logged, not propagated. Whatever happens
    (including a failure to start pigz), one stop marker per fragmentation
    thread is queued, the pigz pipes are closed and the processes are reaped.
    """
    from .pretreatment import _split_two

    def _next_record(stream):
        # One FASTQ record: name, sequence, '+' separator (ignored), quality.
        name = stream.readline().rstrip()
        seq = stream.readline().rstrip()
        stream.readline()  # Skip +
        qual = stream.readline().rstrip()
        return name, seq, qual

    procs = []
    try:
        tA, tB = _split_two(num_threads)
        # Use pigz to decompress the input files
        for threads, path in ((tA, fileA), (tB, fileB)):
            procs.append(
                subprocess.Popen(
                    ["pigz", "-dc", "-p", str(threads), path],
                    stdout=subprocess.PIPE,
                    text=True,
                )
            )
        procA, procB = procs

        Stacker = []
        try:
            while True:
                NomA, seqA, qualA = _next_record(procA.stdout)
                NomB, seqB, qualB = _next_record(procB.stdout)

                if not seqA or not seqB:
                    break

                Stacker.append([[NomA, NomB], [seqA, seqB], [qualA, qualB]])

                if len(Stacker) > 256:
                    Queue.put(Stacker)
                    Stacker = []

            if len(Stacker) > 0:
                Queue.put(Stacker)

        except Exception as e:
            logging.error(f"Error in TakeOneItem: {e}")

    finally:
        # Always release the consumers, even if pigz could not be started.
        stop_signal(Queue, NumThreadFragmentation)
        for proc in procs:
            # Close our end of the pipe before waiting: a pigz process that
            # still has output pending (inputs of different length, or an
            # error above) would otherwise block on a full pipe and wait()
            # would never return.
            if proc.stdout is not None:
                proc.stdout.close()
            proc.wait()
@@ -0,0 +1,111 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import logging
25
+ import signal
26
+ import subprocess
27
+ import sys
28
+
29
+ # Setup logging
30
+ logging.basicConfig(level=logging.INFO)
31
+
32
+
33
def signal_handler(sig, frame, outF, outR):
    """
    Terminate both pigz writer processes and exit with status 0.

    ``sig`` is echoed in the printed message; ``frame`` is unused.
    """
    print(f"\nReceived signal {sig}. Terminating gracefully...")
    # Terminate the pigz processes, forward first.
    for writer in (outF, outR):
        writer.terminate()
    logging.info("\nProcess termination requested by signal")
    sys.exit(0)
39
+
40
+
41
def open_output(TWrite, output_forward, output_reverse):
    """
    Start one pigz compressor per output file and register signal handlers.

    ``TWrite`` is split between the two pigz processes. Each process reads
    from a pipe (``stdin``) and writes its compressed output to the given
    path. SIGINT and SIGTSTP are bound to ``signal_handler`` so both
    processes are terminated when either signal is received.

    Returns the forward and reverse ``subprocess.Popen`` objects.
    """
    from .pretreatment import _split_two

    tF, tR = _split_two(TWrite)

    def _spawn(threads, path):
        # The child inherits its own copy of the descriptor, so the parent's
        # file object can be closed as soon as pigz has been started.
        with open(path, "wb") as sink:
            return subprocess.Popen(
                ["pigz", "-c", "-p", str(threads)],
                stdin=subprocess.PIPE,
                stdout=sink,
            )

    # Open output files for writing
    outF = _spawn(tF, output_forward)
    outR = _spawn(tR, output_reverse)

    # Register signal handlers
    signal.signal(
        signal.SIGINT,
        lambda sig, frame: signal_handler(sig, frame, outF, outR),
    )  # Ctrl+C
    signal.signal(
        signal.SIGTSTP,
        lambda sig, frame: signal_handler(sig, frame, outF, outR),
    )  # Ctrl+Z

    return outF, outR
70
+
71
+
72
def manage_pigz_problems(outF, outR, output_forward, output_reverse):
    """
    Close both pigz inputs, wait for the processes and report failures.

    A failure is reported on standard output when a process produced
    captured stderr text or, when stderr is not captured, when it exited
    with a non-zero status. Nothing is raised for a failed compressor.
    """
    jobs = ((outF, output_forward), (outR, output_reverse))

    for proc, _path in jobs:
        try:
            proc.stdin.close()
        except OSError:
            # pigz already exited and the pipe is broken; keep going so the
            # other process is still closed, and report via the exit status.
            pass

    for proc, _path in jobs:
        proc.wait()

    for proc, path in jobs:
        _stdout, stderr = proc.communicate()
        if stderr:
            print(
                f"Error in pigz command for file {path}: {stderr}",
                flush=True,
            )
        elif proc.returncode:
            # stderr is only available when it was piped at spawn time; the
            # exit status is always available.
            print(
                f"Error in pigz command for file {path}: "
                f"exit code {proc.returncode}",
                flush=True,
            )
92
+
93
+
94
def write_pairs(
    Output_buffer,
    outF: subprocess.Popen,
    outR: subprocess.Popen,
    TFrag,
) -> None:
    """
    Drain ``Output_buffer`` into the two compressor processes.

    Each item is either ``None`` (one producer has finished) or an indexable
    whose element 0 holds the forward strings and element 1 the reverse
    strings. The strings are concatenated, UTF-8 encoded and written to the
    matching process ``stdin``. The loop ends once ``TFrag`` ``None`` markers
    have been received. Errors are logged and the loop continues.
    """
    pending = TFrag
    while pending > 0:
        try:
            item = Output_buffer.get()
            if item is None:
                pending -= 1
                continue
            # Forward is written before the reverse payload is even built.
            for slot, writer in enumerate((outF, outR)):
                writer.stdin.write("".join(item[slot]).encode("utf-8"))
        except Exception as e:
            logging.error(f"Error in write_pairs: {e}")
File without changes
split3c/resolve/bam.py ADDED
@@ -0,0 +1,129 @@
1
+ from typing import Iterator
2
+
3
+ import pysam
4
+
5
+
6
def get_bam_headers(bam_for_path: str, bam_rev_path: str) -> tuple[dict, dict]:
    """
    Return the headers of the forward and reverse BAM files as dictionaries.

    The files are opened one after the other, forward first, and closed
    again before returning.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    headers = []
    for bam_path in (bam_for_path, bam_rev_path):
        with pysam.AlignmentFile(bam_path, "rb") as bam:
            headers.append(bam.header.to_dict())
    return headers[0], headers[1]
19
+
20
+
21
def get_bam_header_single(bam_path: str) -> dict:
    """
    Return the header of one BAM file as a dictionary.

    Examples
    --------
    No doctest here because it requires a real BAM file.
    """
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        header = bam.header.to_dict()
    return header
31
+
32
+
33
def iter_bam_pairs(
    bam_for_path: str,
    bam_rev_path: str,
    bam_threads: int = 1,
) -> Iterator[tuple[pysam.AlignedSegment, pysam.AlignedSegment]]:
    """
    Iterate over synchronized forward/reverse BAM records.

    The two BAMs must contain the same qname-sorted records in the same order.
    Each BAM is opened with ``bam_threads`` threads.

    Raises
    ------
    ValueError
        If one BAM ends before the other, or if the records at the same
        position do not share the same query name.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    from itertools import zip_longest

    with (
        pysam.AlignmentFile(bam_for_path, "rb", threads=bam_threads) as bam_for,
        pysam.AlignmentFile(bam_rev_path, "rb", threads=bam_threads) as bam_rev,
    ):
        # zip_longest pads the shorter file with None; plain zip would stop
        # silently at the shorter one and the length check could never fire.
        paired = zip_longest(bam_for, bam_rev)
        for idx, (read_for, read_rev) in enumerate(paired, start=1):
            if read_for is None or read_rev is None:
                raise ValueError(
                    "Forward and reverse BAMs do not have the same number of records "
                    f"(first mismatch at record {idx})."
                )
            if read_for.query_name != read_rev.query_name:
                raise ValueError(
                    "Forward and reverse BAMs are not synchronized by qname "
                    f"at record {idx}: {read_for.query_name!r} != {read_rev.query_name!r}."
                )
            yield read_for, read_rev
63
+
64
+
65
def iter_bam_pairs_single(
    bam_path: str,
    bam_threads: int = 1,
) -> Iterator[tuple[pysam.AlignedSegment, pysam.AlignedSegment]]:
    """
    Yield consecutive record pairs from one interleaved BAM.

    Assumptions
    -----------
    - records are written as consecutive pairs
    - the two mates of one logical pair have the same query_name
    - the BAM contains an even number of records

    A ``ValueError`` is raised when the file ends on an unpaired record or
    when two consecutive records have different query names. The BAM is
    opened with ``bam_threads * 2`` threads.

    Examples
    --------
    No doctest here because it requires real BAM files.
    """
    with pysam.AlignmentFile(bam_path, "rb", threads=int(bam_threads * 2)) as bam:
        records = iter(bam)

        # Each loop turn takes the first mate; the second is pulled by hand
        # so a missing mate can be reported with its pair index.
        for pair_idx, read1 in enumerate(records, start=1):
            try:
                read2 = next(records)
            except StopIteration as exc:
                raise ValueError(
                    "Single BAM contains an odd number of records; "
                    f"dangling read at pair index {pair_idx}: "
                    f"{read1.query_name!r}."
                ) from exc

            if read1.query_name != read2.query_name:
                raise ValueError(
                    "Single BAM is not properly interleaved by qname "
                    f"at pair index {pair_idx}: "
                    f"{read1.query_name!r} != {read2.query_name!r}."
                )

            yield read1, read2
111
+
112
+
113
def chromsizes_from_header(header_dict: dict) -> list[tuple[str, int]]:
    """
    List ``(name, length)`` for every ``SQ`` entry of a BAM header dictionary.

    Entries missing ``SN`` or ``LN`` (or holding ``None``) are skipped. A
    header without an ``SQ`` key gives an empty list.

    Examples
    --------
    >>> chromsizes_from_header({"SQ": [{"SN": "chr1", "LN": 100}, {"SN": "chr2", "LN": 50}]})
    [('chr1', 100), ('chr2', 50)]
    """
    return [
        (str(entry["SN"]), int(entry["LN"]))
        for entry in header_dict.get("SQ", [])
        if entry.get("SN") is not None and entry.get("LN") is not None
    ]
@@ -0,0 +1,77 @@
1
+ import io
2
+ import os
3
+ import shlex
4
+ import shutil
5
+ import subprocess
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import TextIO
9
+
10
+
11
+ @dataclass
12
+ class TextWriter:
13
+ handle: TextIO
14
+ process: subprocess.Popen | None = None
15
+ outfile: None = None
16
+
17
+ def write(self, text: str) -> int:
18
+ return self.handle.write(text)
19
+
20
+ def flush(self) -> None:
21
+ self.handle.flush()
22
+ if self.outfile is not None:
23
+ self.outfile.flush()
24
+
25
+ def close(self) -> None:
26
+ try:
27
+ self.handle.close()
28
+ finally:
29
+ if self.process is not None:
30
+ ret = self.process.wait()
31
+ if ret != 0:
32
+ raise RuntimeError(
33
+ f"Compression command failed with exit code {ret}."
34
+ )
35
+ if self.outfile is not None and not self.outfile.closed:
36
+ self.outfile.close()
37
+
38
+
39
+ def _pick_gzip_command(nproc: int) -> list[str] | None:
40
+ candidates = []
41
+ if shutil.which("pbgzip"):
42
+ candidates.append(["pbgzip", "-c", "-n", str(max(1, nproc))])
43
+ if shutil.which("bgzip"):
44
+ candidates.append(["bgzip", "-c", "-@", str(max(1, nproc))])
45
+ if shutil.which("pigz"):
46
+ candidates.append(["pigz", "-c", "-p", str(max(1, nproc))])
47
+ if shutil.which("gzip"):
48
+ candidates.append(["gzip", "-c"])
49
+ return candidates[0] if candidates else None
50
+
51
+
52
def open_text_output(path: str | Path, nproc: int = 1) -> TextWriter:
    """
    Open ``path`` for UTF-8 text output and return a ``TextWriter``.

    - ``"-"`` writes to a duplicate of file descriptor 1 (standard output).
    - A path ending in ``.gz`` is written through the compressor chosen by
      ``_pick_gzip_command(nproc)``.
    - Any other path is opened as a plain text file.

    Raises ``RuntimeError`` when a ``.gz`` path is requested and no
    compressor is available, or when the compressor's stdin cannot be opened.
    """
    path = str(path)
    if path == "-":
        return TextWriter(
            handle=io.TextIOWrapper(os.fdopen(os.dup(1), "wb"), encoding="utf-8")
        )

    if not path.endswith(".gz"):
        return TextWriter(handle=open(path, "w", encoding="utf-8"))

    cmd = _pick_gzip_command(nproc)
    if cmd is None:
        raise RuntimeError(
            "No gzip-compatible compressor found (tried pbgzip, bgzip, pigz, gzip)."
        )

    outfile = open(path, "wb")
    try:
        proc = subprocess.Popen(
            cmd, stdin=subprocess.PIPE, stdout=outfile, stderr=subprocess.PIPE
        )
    except BaseException:
        # Do not leak the output file when the compressor cannot be started.
        outfile.close()
        raise

    if proc.stdin is None:
        outfile.close()
        raise RuntimeError(
            f"Failed to open compressor stdin for: {shlex.join(cmd)}"
        )

    handle = io.TextIOWrapper(proc.stdin, encoding="utf-8")
    return TextWriter(handle=handle, process=proc, outfile=outfile)