split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
split3c/nssite/main.py ADDED
@@ -0,0 +1,368 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import argparse
25
+ import logging
26
+ import os
27
+ import sys
28
+ from typing import Any, Optional
29
+
30
+ from .split import cut
31
+
32
+ __version__ = "1.1.0"
33
+
34
+
35
def _try_rich() -> Optional[dict[str, Any]]:
    """Load the optional rich / rich-argparse toolkit.

    Returns:
        ``None`` when any of the optional imports fails. Otherwise a dict
        holding a themed ``console`` (width 100) together with the
        ``Panel``, ``Table``, ``box`` and ``RichHelpFormatter`` objects.
    """
    try:
        from rich import box
        from rich.console import Console
        from rich.panel import Panel
        from rich.table import Table
        from rich.theme import Theme
        from rich.traceback import install
        from rich_argparse import RichHelpFormatter
    except Exception:
        # Pretty output is optional: any import problem means plain text.
        return None

    palette = Theme({"info": "dim cyan", "error": "bold red", "warning": "magenta"})
    console = Console(theme=palette, width=100)
    # Hook rich's traceback handler onto the same console.
    install(console=console)

    return dict(
        console=console,
        Panel=Panel,
        Table=Table,
        box=box,
        RichHelpFormatter=RichHelpFormatter,
    )


# Resolved once at import time; ``None`` selects the plain-text code paths.
_R = _try_rich()
62
+
63
+
64
class MyArgumentParser(argparse.ArgumentParser):
    """Argument parser that reports usage errors through rich when available."""

    def error(self, message: str) -> None:
        """Report a command-line usage error and exit with status 2.

        When the module-level rich toolkit ``_R`` is loaded, the message is
        shown in a panel followed by the full help text. Otherwise the usage
        line and the standard ``prog: error: ...`` message go to stderr.

        Args:
            message: Error description supplied by argparse.

        Raises:
            SystemExit: Always, via ``self.exit(2, ...)``.
        """
        if _R is not None:
            # rich is importable on this path, so a local import is safe.
            from rich.markup import escape

            console = _R["console"]
            Panel = _R["Panel"]
            console.print(
                Panel(
                    # The message can echo user input (e.g. an unrecognised
                    # argument such as "[/x]"); escape it so rich does not
                    # parse it as markup and drop text or raise MarkupError.
                    f"[bold red]Error:[/bold red] {escape(message)}",
                    title="Incorrect arguments",
                    expand=True,
                    width=100,
                )
            )
            self.print_help()
            self.exit(2)
        self.print_usage(sys.stderr)
        self.exit(2, f"{self.prog}: error: {message}\n")
81
+
82
+
83
def validate_args(args: argparse.Namespace) -> None:
    """Check parsed CLI arguments before any processing starts.

    Checks run in this order: input BAM(s) exist and are regular files,
    output parent directories exist and are writable, numeric options are in
    range, the pairing mode is known, the two outputs are distinct files,
    no output overwrites an input, and (without ``force``) no output exists.

    Args:
        args: Namespace providing ``bam_1``, ``bam_2``, ``single_bam``,
            ``output_forward``, ``output_reverse``, ``num_threads``,
            ``seed_size``, ``lenght_added``, ``pairing_mode`` and ``force``.

    Raises:
        ValueError: On the first failed check, with a user-facing message.
    """

    def _file_exists(path: str, what: str) -> None:
        if not os.path.exists(path):
            raise ValueError(f"{what}: file not found: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"{what}: not a file: {path}")

    def _parent_writable(path: str, what: str) -> None:
        # abspath() always yields a non-empty dirname, so no cwd fallback.
        parent = os.path.dirname(os.path.abspath(path))
        if not os.path.exists(parent):
            raise ValueError(f"{what}: parent directory does not exist: {parent}")
        if not os.path.isdir(parent):
            raise ValueError(f"{what}: parent is not a directory: {parent}")
        if not os.access(parent, os.W_OK):
            raise ValueError(f"{what}: parent directory not writable: {parent}")

    def _canonical(path: str) -> str:
        # Collapse "./", "..", symlinks and case (where the OS ignores it) so
        # that two spellings of the same file compare equal.
        return os.path.normcase(os.path.realpath(path))

    _file_exists(args.bam_1, "Input BAM")
    inputs = [args.bam_1]

    if not args.single_bam:
        if args.bam_2 is None:
            # Option names below match the flags the parser actually defines.
            raise ValueError("--bam_2 is required unless --single-bam is used")
        _file_exists(args.bam_2, "Reverse BAM")
        inputs.append(args.bam_2)

    _parent_writable(args.output_forward, "Output R1")
    _parent_writable(args.output_reverse, "Output R2")

    if args.num_threads < 7:
        raise ValueError("--num_threads must be >= 7 (recommended >= 12)")

    if args.seed_size < 0:
        raise ValueError("--seed_size must be >= 0")

    if args.lenght_added < 0:
        raise ValueError("--length-added must be >= 0")

    if args.pairing_mode not in ("all", "cover"):
        raise ValueError("--pairing-mode must be one of: all, cover")

    if _canonical(args.output_forward) == _canonical(args.output_reverse):
        raise ValueError("Output R1 and R2 must be different files")

    # Checked before the --force hint below: suggesting --force for a path
    # that is also an input would invite destroying the input.
    input_paths = {_canonical(p) for p in inputs}
    for label, path in (
        ("Output R1", args.output_forward),
        ("Output R2", args.output_reverse),
    ):
        if _canonical(path) in input_paths:
            raise ValueError(f"{label} would overwrite an input BAM: {path}")

    if not args.force:
        for p in (args.output_forward, args.output_reverse):
            if os.path.exists(p):
                raise ValueError(
                    f"Output already exists: {p} (use --force to overwrite)"
                )
128
+
129
+
130
def _formatter_class():
    """Choose the argparse help formatter for the current environment.

    Returns:
        ``argparse.RawTextHelpFormatter`` when the module-level ``_R`` is
        ``None``; otherwise a one-argument factory that builds a
        ``RawTextRichHelpFormatter`` with ``max_help_position=42`` and
        ``width=110``.
    """
    if _R is not None:
        from rich_argparse import RawTextRichHelpFormatter

        def _rich_formatter(prog):
            # argparse invokes the formatter class with ``prog`` only; the
            # layout settings are pinned here.
            return RawTextRichHelpFormatter(
                prog,
                max_help_position=42,
                width=110,
            )

        return _rich_formatter

    return argparse.RawTextHelpFormatter
141
+
142
+
143
def _print_banner() -> None:
    """Print the start-up banner panel; a no-op when ``_R`` is ``None``."""
    if _R is not None:
        out = _R["console"]
        make_panel = _R["Panel"]

        body = (
            "[bold blue]Microsplit[/bold blue]\n"
            "Process paired BAM (Micro-C) into paired FASTQ.\n\n"
            "Use --help to see detailed options."
        )
        banner = make_panel(
            body,
            title="[bold green]microsplit-cut[/bold green]",
            subtitle=f"Version: {__version__}",
            expand=True,
            width=100,
        )

        out.print(banner)
        # Blank line separating the banner from whatever follows.
        out.print("")
160
+
161
+
162
def _print_summary(args: argparse.Namespace) -> None:
    """Print a two-column table of the effective option values.

    Does nothing when the module-level rich toolkit ``_R`` is ``None``.

    Args:
        args: Parsed namespace; every attribute named in the row list below
            is read and rendered with ``str()``.
    """
    if _R is None:
        return
    console = _R["console"]
    Table = _R["Table"]
    box = _R["box"]

    t = Table(
        show_edge=True,
        title="[bold green]Summary[/bold green]",
        box=box.HEAVY,
        width=100,
    )
    t.add_column("Key", style="cyan", no_wrap=True)
    t.add_column("Value", style="magenta")

    # Labels use the option strings the parser really defines (underscore
    # spellings), so a user can copy them back onto the command line.
    rows = (
        ("--single-bam", "single_bam"),
        ("--bam_1", "bam_1"),
        ("--bam_2", "bam_2"),
        ("--output_forward", "output_forward"),
        ("--output_reverse", "output_reverse"),
        ("--num_threads", "num_threads"),
        ("--seed_size", "seed_size"),
        ("--length-added", "lenght_added"),
        ("--pairing-mode", "pairing_mode"),
        ("--tags", "tags"),
        ("--force", "force"),
        ("--verbose", "verbose"),
    )
    for label, attr in rows:
        t.add_row(label, str(getattr(args, attr)))

    console.print(t)
    console.print("")
193
+
194
+
195
def _build_parser() -> "MyArgumentParser":
    """Create the argument parser with every CLI option registered."""
    parser = MyArgumentParser(
        description=(
            "Process Micro-C BAM input to paired FASTQ.\n"
            "Supports either two BAM files (forward/reverse) or one interleaved BAM."
        ),
        epilog=(
            "Examples:\n"
            " \tmicrosplit -1 fwd.bam -2 rev.bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 -l 0 --pairing-mode cover \n"
            " \tmicrosplit -1 merged.bam --single-bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 --pairing-mode all\n"
        ),
        formatter_class=_formatter_class(),
    )

    req = parser.add_argument_group("Required inputs")
    out = parser.add_argument_group("Outputs")
    perf = parser.add_argument_group("Performance")
    split_grp = parser.add_argument_group("Split parameters")
    misc = parser.add_argument_group("Misc")

    req.add_argument(
        "-1",
        "--bam_1",
        type=str,
        required=True,
        help="Path to forward BAM file, or to the single interleaved BAM if --single-bam is used.",
    )
    req.add_argument(
        "-2",
        "--bam_2",
        type=str,
        required=False,
        default=None,
        help="Path to reverse BAM file. Not required if --single-bam is used.",
    )
    req.add_argument(
        "--single-bam",
        action="store_true",
        help="Use a single interleaved BAM as input (R1 line followed by R2 line with same read name).",
    )

    out.add_argument(
        "-o1",
        "--output_forward",
        type=str,
        required=True,
        help="Path to output forward FASTQ (R1).",
    )
    out.add_argument(
        "-o2",
        "--output_reverse",
        type=str,
        required=True,
        help="Path to output reverse FASTQ (R2).",
    )
    out.add_argument(
        "--force",
        action="store_true",
        help="Overwrite output files if they already exist.",
    )

    perf.add_argument(
        "-t",
        "--num_threads",
        type=int,
        default=12,
        help="Total CPU threads budget (>= 7).",
    )

    split_grp.add_argument(
        "-s",
        "--seed_size",
        type=int,
        default=20,
        help="Minimum fragment length to keep after splitting (0 disables filtering). We strongly advise against using a seed size of less than 10bp for the rest of the treatment (long computing time).",
    )

    # The historical misspelling "lenght_added" is kept as the destination
    # attribute; both option spellings write to it.
    split_grp.add_argument(
        "-l",
        "--lenght_added",
        "--length-added",
        dest="lenght_added",
        type=int,
        default=0,
        help="Bases added around split boundary (soft-clip extension).",
    )

    split_grp.add_argument(
        "--pairing-mode",
        choices=["all", "cover"],
        default="cover",
        help=(
            "Pair generation strategy.\n"
            " cover: minimal/quasi-minimal number of pairs so each fragment appears at least once (Doesn't lose data and is optimal for computation time).\n"
            " all: all pairwise combinations."
        ),
    )

    split_grp.add_argument(
        "--tags",
        choices=["origin", "no_annot", "o", "na"],
        default="o",
        help=(
            "Header tagging mode for split reads.\n"
            " origin/o : include fragment origin tags (F0,R1,...)\n"
            " no_annot/na: keep base name only / No annotation (Not recommended if you wish to filter the pairs after the process.)"
        ),
    )

    misc.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    misc.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (-v, -vv).",
    )
    return parser


def _report_failure(title: str, error: BaseException) -> None:
    """Show *error* in a rich panel titled *title*, or on stderr without rich."""
    if _R is None:
        print(f"ERROR: {error}", file=sys.stderr)
        return

    # rich is importable on this path, so a local import is safe.
    from rich.markup import escape

    Panel = _R["Panel"]
    _R["console"].print(
        Panel(
            # Error text can contain file paths or other user input; escape
            # it so "[...]" sequences are shown literally instead of being
            # parsed as rich markup (which drops text or raises MarkupError).
            f"[bold red]{escape(str(error))}[/bold red]",
            title=title,
            expand=True,
            width=100,
        )
    )


def main_cli(argv: Optional[list[str]] = None) -> int:
    """Run the command-line interface.

    Prints the banner, parses and validates the arguments, prints the
    summary table and hands the namespace to ``cut``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``2`` when validation fails, ``130`` when
        interrupted with Ctrl-C, ``1`` when ``cut`` raises any other
        exception.
    """
    _print_banner()

    parser = _build_parser()
    args = parser.parse_args(argv)

    level = logging.WARNING
    if args.verbose == 1:
        level = logging.INFO
    elif args.verbose >= 2:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")

    try:
        validate_args(args)
    except ValueError as e:
        _report_failure("Validation", e)
        if _R is not None:
            parser.print_help()
        else:
            parser.print_help(sys.stderr)
        return 2

    _print_summary(args)

    try:
        cut(args)
    except KeyboardInterrupt:
        return 130
    except Exception as e:
        # Keep the traceback reachable (shown with -vv) instead of dropping
        # it; the user-facing report stays a one-line message.
        logging.getLogger(__name__).debug("cut() raised", exc_info=True)
        _report_failure("Runtime error", e)
        return 1

    return 0
365
+
366
+
367
if __name__ == "__main__":
    # main_cli() returns the intended exit status (0, 1, 2 or 130); pass it
    # to the interpreter so failures are visible to the calling shell.
    sys.exit(main_cli())
@@ -0,0 +1,51 @@
1
+ from multiprocessing import Process, Queue
2
+ import sys
3
+ import traceback
4
+
5
class WorkerProcess(Process):
    """Process whose uncaught target exceptions are pushed onto a queue."""

    def __init__(self, target, args, error_queue):
        """Store the target call and the queue used to report failures.

        Args:
            target: Callable executed by ``run``.
            args: Positional arguments passed to ``target``.
            error_queue: Object with a ``put`` method; receives
                ``(message, traceback_text)`` tuples.
        """
        super().__init__(target=target, args=args)
        self.error_queue = error_queue

    def run(self):
        """Run the target; on ``Exception`` print it, queue it, exit with 1."""
        try:
            Process.run(self)
        except Exception as exc:
            trace_text = traceback.format_exc()
            print(f"Worker error: {exc}")
            self.error_queue.put((str(exc), trace_text))
            raise SystemExit(1)
17
+
18
class ProcessManager:
    """Start worker processes and surface their failures to the parent."""

    def __init__(self):
        """Create an empty worker list and the shared error queue."""
        self.processes = []
        self.error_queue = Queue()

    def start_worker(self, target, args):
        """Start ``target(*args)`` in a new ``WorkerProcess`` and track it."""
        worker = WorkerProcess(target, args, self.error_queue)
        worker.start()
        # Track the worker only once start() has succeeded, so a failed
        # start never leaves an unstarted process in the list.
        self.processes.append(worker)

    def running(self):
        """Return True while at least one tracked worker is alive."""
        return any(p.is_alive() for p in self.processes)

    def check_processes(self):
        """Report whether all workers are healthy.

        Returns:
            ``False`` when a tracked worker has exited with a non-zero exit
            code, or when an error tuple is waiting on the error queue (in
            which case the error and its traceback are printed and all
            workers are shut down). ``True`` otherwise.
        """
        for p in self.processes:
            if not p.is_alive() and p.exitcode != 0:
                print("Worker process crashed!")
                return False

        if not self.error_queue.empty():
            error, tb = self.error_queue.get()
            print(f"Worker error: {error}")
            # The traceback was previously dropped; keep it for diagnosis.
            print(tb, file=sys.stderr)
            self.shutdown()
            # A reported worker error is a failure, same as a crash above.
            return False
        return True

    def shutdown(self):
        """Terminate every worker that is still alive and wait for it."""
        for p in self.processes:
            if p.is_alive():
                p.terminate()
                p.join()

    def handle_signal(self, signum, frame):
        """Signal-handler entry point: shut workers down, then exit with 1."""
        print("Received shutdown signal")
        self.shutdown()
        sys.exit(1)