split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
split3c/resite/main.py ADDED
@@ -0,0 +1,506 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+ import argparse
25
+ import logging
26
+ import os
27
+ import sys
28
+ from multiprocessing import Process, Queue
29
+ from typing import Any, List, Optional
30
+
31
+ from .frag import process_items
32
+ from .pretreatment import partition_threads, search_in_database
33
+ from .read import read_fastq_gzip_simultaneously
34
+ from .write_control import manage_pigz_problems, open_output, write_pairs
35
+
36
+ # Setup logging
+ # NOTE: this configures the root logger as soon as the module is imported.
+ # Per the logging documentation, a later logging.basicConfig() call does
+ # nothing once the root logger has handlers, unless force=True is passed.
37
+ logging.basicConfig(level=logging.INFO)
38
+
39
+ from importlib.metadata import PackageNotFoundError
40
+ from importlib.metadata import version as _version
41
+
42
+ # Read the installed distribution's version from package metadata; when no
+ # metadata is found, fall back to the placeholder "0+unknown".
+ # NOTE(review): the lookup key is "parasplit" - confirm it matches the name
+ # of the distribution this module is actually shipped in.
+ try:
43
+ __version__ = _version("parasplit")
44
+ except PackageNotFoundError:
45
+ __version__ = "0+unknown"
46
+
47
+
48
def _try_rich() -> Optional[dict[str, Any]]:
    """Load the optional ``rich`` / ``rich_argparse`` toolkit.

    Returns:
        ``None`` when any of the imports fails. Otherwise a dict holding a
        themed, 100-column ``console`` together with the ``Panel``, ``Table``,
        ``box`` and ``RichHelpFormatter`` objects, so callers can use rich
        without importing it themselves.

    Side effects:
        On success, rich's traceback handler is installed on the console.
    """
    try:
        from rich import box
        from rich.console import Console
        from rich.panel import Panel
        from rich.table import Table
        from rich.theme import Theme
        from rich.traceback import install
        from rich_argparse import RichHelpFormatter
    except Exception:
        # Best effort: any import problem simply disables the rich output.
        return None

    palette = Theme({"info": "dim cyan", "error": "bold red", "warning": "magenta"})
    console = Console(theme=palette, width=100)
    install(console=console)

    return dict(
        console=console,
        Panel=Panel,
        Table=Table,
        box=box,
        RichHelpFormatter=RichHelpFormatter,
    )
72
+
73
+
74
+ # Rich toolkit shared by the helpers below; None when rich could not be imported.
+ _R = _try_rich()
75
+
76
+
77
class MyArgumentParser(argparse.ArgumentParser):
    """Argument parser that reports usage errors through rich when it is available."""

    def error(self, message: str) -> None:
        """Report a command-line error and exit with status 2.

        With rich: print an "Incorrect arguments" panel followed by the full
        help text. Without rich: print the usage line and the error message
        to stderr.

        Parameters:
            message (str): Description of the problem, as supplied by argparse.
        """
        rich_kit = _R
        if rich_kit is not None:
            report = rich_kit["Panel"](
                f"[bold red]Error:[/bold red] {message}",
                title="Incorrect arguments",
                expand=True,
                width=100,
            )
            rich_kit["console"].print(report)
            self.print_help()
            self.exit(2)
        self.print_usage(sys.stderr)
        self.exit(2, f"{self.prog}: error: {message}\n")
94
+
95
+
96
def _formatter_class():
    """Choose the help formatter handed to the argument parser.

    Returns:
        ``argparse.RawTextHelpFormatter`` when rich is unavailable; otherwise a
        factory taking ``prog`` and building a ``RawTextRichHelpFormatter``
        with a help column limit of 42 and a width of 110.
    """
    if _R is not None:
        from rich_argparse import RawTextRichHelpFormatter

        def _build(prog):
            return RawTextRichHelpFormatter(prog, max_help_position=42, width=110)

        return _build

    return argparse.RawTextHelpFormatter
107
+
108
+
109
+ def _print_banner() -> None:
+ # Print the start-up banner panel followed by an empty line.
+ # Does nothing when rich is unavailable (_R is None).
110
+ if _R is None:
111
+ return
112
+ console = _R["console"]
113
+ Panel = _R["Panel"]
114
+ console.print(
115
+ Panel(
+ # The adjacent string literals below are concatenated by Python into
+ # a single panel body (headline, feature list, pointer to --help).
116
+ "[bold blue]Parasplit[/bold blue]\n"
117
+ "Split paired FASTQ at restriction enzyme ligation sites.\n\n"
118
+ """Features \n
119
+ Find and Utilize Restriction Enzyme Sites: Automatically identify ligation sites from provided enzyme names and generate regex patterns to locate these sites in sequences.
120
+ Fragmentation: Split sequences at restriction enzyme sites, creating smaller fragments.
121
+ Multi-threading: Efficiently handle large datasets by utilizing multiple threads for decompression, fragmentation, and compression.
122
+ Custom Modes: Supports different pairing modes for sequence fragments.\n\n
123
+ """
124
+ "Use --help to see detailed options.",
125
+ title="[bold green]parasplit[/bold green]",
+ # The subtitle shows the module-level __version__ string.
126
+ subtitle=f"Version: {__version__}",
127
+ expand=True,
128
+ width=100,
129
+ )
130
+ )
131
+ console.print("")
132
+
133
+
134
def _print_summary(args: argparse.Namespace) -> None:
    """Render the effective run settings as a rich table (no-op without rich).

    Each row is labelled with the long option spelling that ``build_parser``
    registers, so a label can be copied straight back onto the command line.
    The enzyme row shows the normalized list stored in ``args.enzymes``.

    Parameters:
        args (argparse.Namespace): Parsed arguments; must also carry the
            ``enzymes`` attribute.
    """
    if _R is None:
        return

    console = _R["console"]
    table = _R["Table"](
        show_edge=True,
        title="[bold green]Summary[/bold green]",
        box=_R["box"].HEAVY,
        width=100,
    )
    table.add_column("Key", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")

    # (label shown to the user, attribute read from the namespace)
    rows = (
        ("--source_forward", "source_forward"),
        ("--source_reverse", "source_reverse"),
        ("--output_forward", "output_forward"),
        ("--output_reverse", "output_reverse"),
        ("--list_enzyme", "enzymes"),
        ("--mode", "mode"),
        ("--seed_size", "seed_size"),
        ("--buffer-size", "buffer_size"),
        ("--num_threads", "num_threads"),
        ("--borderless", "borderless"),
        ("--tags", "tags"),
        ("--force", "force"),
        ("--verbose", "verbose"),
    )
    for label, attr in rows:
        table.add_row(label, str(getattr(args, attr)))

    console.print(table)
    console.print("")
166
+
167
+
168
def _parse_enzymes(raw: str) -> list[str]:
    """Turn a comma-separated enzyme string into a clean list of names.

    Parameters:
        raw (str): Value of the enzyme option; a falsy value is treated as "".

    Returns:
        list[str]: Names with surrounding whitespace removed and empty items
        dropped. Empty when the input is blank or equals the placeholder
        "No restriction enzyme found".
    """
    text = (raw or "").strip()
    if text in ("", "No restriction enzyme found"):
        return []
    return [name for name in map(str.strip, text.split(",")) if name]
173
+
174
+
175
def validate_args(args: argparse.Namespace) -> None:
    """Validate parsed command-line arguments before any work starts.

    Checks, in order:
      1. both input FASTQ paths exist and are regular files;
      2. the parent directory of each output exists, is a directory and is
         writable;
      3. the two outputs resolve to different files, and neither output
         resolves to one of the inputs;
      4. ``num_threads`` >= 5, ``seed_size`` >= 0, ``buffer_size`` >= 1 and
         ``mode`` is one of ``fr``, ``all``, ``cover``;
      5. unless ``force`` is set, neither output already exists.

    Parameters:
        args (argparse.Namespace): Must provide ``source_forward``,
            ``source_reverse``, ``output_forward``, ``output_reverse``,
            ``num_threads``, ``seed_size``, ``buffer_size``, ``mode`` and
            ``force``.

    Raises:
        ValueError: On the first check that fails.
    """

    def _file_exists(path: str, what: str) -> None:
        if not os.path.exists(path):
            raise ValueError(f"{what}: file not found: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"{what}: not a file: {path}")

    def _parent_writable(path: str, what: str) -> None:
        parent = os.path.dirname(os.path.abspath(path)) or os.getcwd()
        if not os.path.exists(parent):
            raise ValueError(f"{what}: parent directory does not exist: {parent}")
        if not os.path.isdir(parent):
            raise ValueError(f"{what}: parent is not a directory: {parent}")
        if not os.access(parent, os.W_OK):
            raise ValueError(f"{what}: parent directory not writable: {parent}")

    def _same_file(first: str, second: str) -> bool:
        # Compare resolved paths so that "out.gz", "./out.gz" and symlinks to
        # the same target are recognised as one file.
        return os.path.realpath(first) == os.path.realpath(second)

    sources = (args.source_forward, args.source_reverse)
    outputs = (args.output_forward, args.output_reverse)

    _file_exists(args.source_forward, "Forward FASTQ")
    _file_exists(args.source_reverse, "Reverse FASTQ")
    _parent_writable(args.output_forward, "Output R1")
    _parent_writable(args.output_reverse, "Output R2")

    if _same_file(args.output_forward, args.output_reverse):
        raise ValueError("Output R1 and R2 must be different files")

    # Writing onto an input would destroy the data while it is being read.
    for out_path in outputs:
        if any(_same_file(out_path, src) for src in sources):
            raise ValueError(f"Output would overwrite an input file: {out_path}")

    if args.num_threads < 5:
        raise ValueError("--num_threads must be >= 5 (recommended >= 8)")

    if args.seed_size < 0:
        raise ValueError("--seed_size must be >= 0")

    if args.buffer_size <= 0:
        raise ValueError("--buffer-size must be >= 1")

    if args.mode not in ("fr", "all", "cover"):
        raise ValueError("--mode must be one of: fr, all, cover")

    if not args.force:
        for out_path in outputs:
            if os.path.exists(out_path):
                raise ValueError(
                    f"Output already exists: {out_path} (use --force to overwrite)"
                )
215
+
216
+
217
def _write_worker(
    write_threads, output_forward, output_reverse, output_queue, n_producers
) -> None:
    """Entry point of the writer process: open the outputs, drain the queue, clean up."""
    # IMPORTANT: open/close pigz INSIDE the writer process.
    out_f, out_r = open_output(write_threads, output_forward, output_reverse)
    try:
        write_pairs(output_queue, out_f, out_r, n_producers)
    finally:
        manage_pigz_problems(out_f, out_r, output_forward, output_reverse)


def cut(
    source_forward: str,
    source_reverse: str,
    output_forward: str,
    output_reverse: str,
    list_enzyme: List[str],
    mode,
    seed_size,
    buffer_size: int = 100,
    num_threads: int = 8,
    borderless: bool = False,
    tags=None,
) -> None:
    """
    Main function to process sequences based on enzyme restriction sites.

    Runs one reader process, several fragmentation processes and one writer
    process, connected by two bounded queues, and waits for all of them.

    Parameters:
        source_forward (str): Input file path for forward reads.
        source_reverse (str): Input file path for reverse reads.
        output_forward (str): Output file path for processed forward reads.
        output_reverse (str): Output file path for processed reverse reads.
        list_enzyme (List[str]): List of restriction enzymes.
        mode (str): Mode of pairing fragments: "fr", "all" or "cover".
        seed_size (int): Minimum length of fragments to keep.
        buffer_size (int, optional): Size of buffer. Defaults to 100.
        num_threads (int, optional): Number of threads to use for processing. Defaults to 8.
        borderless (bool, optional): Whether to discard ligation sites (borders). Defaults to False.
        tags (optional): Header tagging option, forwarded unchanged to ``process_items``.

    Returns:
        None

    Raises:
        RuntimeError: If any child process ends with a non-zero exit code.
        KeyboardInterrupt: Re-raised after the child processes are terminated.
        SystemExit: With status 1 when ``mode`` is not a known mode.
    """
    # Threads allocations :
    t_read, t_frag, t_write = partition_threads(num_threads)

    # Take the enzyme list and make the ligation site list
    ligation_site_list = search_in_database(list_enzyme, borderless)

    started = []
    try:
        # Input and Output Queues
        input_buffer = Queue(maxsize=2048)
        output_buffer = Queue(maxsize=512)

        # Every mode runs the same worker; the mode value is passed through
        # to process_items.
        mode_labels = {"all": "ALL", "fr": "FR", "cover": "COVER"}
        if mode not in mode_labels:
            print(f"Unknown mode: {mode}")
            sys.exit(1)
        print(f"Mode {mode_labels[mode]} selected")

        # Targets are module-level callables with explicit args rather than
        # nested closures: closures cannot be pickled, which the "spawn" and
        # "forkserver" start methods require.
        # NOTE(review): under those start methods ligation_site_list must be
        # picklable as well - confirm against search_in_database.
        read_p = Process(
            target=read_fastq_gzip_simultaneously,
            args=(source_forward, source_reverse, input_buffer, t_read, t_frag),
        )
        process_p_list = [
            Process(
                target=process_items,
                args=(
                    input_buffer,
                    output_buffer,
                    ligation_site_list,
                    seed_size,
                    buffer_size,
                    mode,
                    borderless,
                    tags,
                ),
            )
            for _ in range(t_frag)
        ]
        write_p = Process(
            target=_write_worker,
            args=(t_write, output_forward, output_reverse, output_buffer, t_frag),
        )

        procs = [
            ("read", read_p),
            *[(f"proc{i}", p) for i, p in enumerate(process_p_list)],
            ("write", write_p),
        ]

        # Start processes (reader, workers, writer), then wait in that order.
        for _, proc in procs:
            proc.start()
            started.append(proc)
        for _, proc in procs:
            proc.join()

        bad = [(name, p.exitcode) for name, p in procs if p.exitcode not in (0, None)]
        if bad:
            raise RuntimeError(f"Subprocess failure(s): {bad}")

    except KeyboardInterrupt:
        print("Keyboard interrupt detected. Terminating...")
        # Stop the children explicitly, then let the interrupt propagate so
        # the caller can report the abort instead of a success exit status.
        for proc in started:
            if proc.is_alive():
                proc.terminate()
        for proc in started:
            proc.join()
        raise
328
+
329
+
330
def build_parser() -> argparse.ArgumentParser:
    """Assemble the command-line parser.

    Options are registered in five help groups, in this order: Inputs,
    Outputs, Performance, Split parameters and Misc.

    Returns:
        argparse.ArgumentParser: A ``MyArgumentParser`` using the formatter
        selected by ``_formatter_class()``.
    """
    summary = (
        "Split paired-end FASTQ at restriction enzyme ligation sites.\n"
        "Example: parasplit -sf R1.fq.gz -sr R2.fq.gz -of out_R1.fq.gz -or out_R2.fq.gz -le DpnII,MboI -m fr -nt 12"
    )
    examples = (
        "Examples:\n"
        " parasplit -sf R1.fq.gz -sr R2.fq.gz -of out_R1.fq.gz -or out_R2.fq.gz -le DpnII -m fr -nt 12 -sz 20\n"
        " parasplit -sf R1.fq.gz -sr R2.fq.gz -of out_R1.fq.gz -or out_R2.fq.gz -le DpnII,MboI -m all -nt 24 --tags o\n"
    )
    mode_help = (
        "Pairing mode:\n"
        " fr : one forward fragment + one reverse fragment\n"
        " all : all pairwise fragment combinations\n"
        " cover : minimal / near-minimal pairs so every fragment appears at least once (Sufficient to recover all post-processing multiplexe)"
    )
    tags_help = (
        "Header tagging mode for split reads.\n"
        " origin/o : include fragment origin tags (F1,R1,...)\n"
        " no_annot/na: keep base name only / No annotation (Not recommended if you wish to filter the pairs after the process.)"
    )

    cli = MyArgumentParser(
        description=summary,
        epilog=examples,
        formatter_class=_formatter_class(),
    )

    # Group creation order determines the section order in --help.
    inputs = cli.add_argument_group("Inputs")
    outputs = cli.add_argument_group("Outputs")
    performance = cli.add_argument_group("Performance")
    splitting = cli.add_argument_group("Split parameters")
    miscellaneous = cli.add_argument_group("Misc")

    # --- Inputs ---
    inputs.add_argument(
        "-sf", "--source_forward", required=True, help="Input FASTQ R1 (gz)."
    )
    inputs.add_argument(
        "-sr", "--source_reverse", required=True, help="Input FASTQ R2 (gz)."
    )
    inputs.add_argument(
        "-le",
        "--list_enzyme",
        default="",
        help="Comma-separated restriction enzyme names (e.g. DpnII,MboI). Empty means 'no enzyme'.",
    )

    # --- Outputs ---
    outputs.add_argument(
        "-of", "--output_forward", required=True, help="Output FASTQ R1 (gz)."
    )
    outputs.add_argument(
        "-or", "--output_reverse", required=True, help="Output FASTQ R2 (gz)."
    )
    outputs.add_argument(
        "--force",
        action="store_true",
        help="Overwrite output files if they already exist.",
    )

    # --- Performance ---
    performance.add_argument(
        "-nt", "--num_threads", type=int, default=8, help="Total CPU threads budget."
    )

    # --- Split parameters ---
    splitting.add_argument(
        "-m",
        "--mode",
        choices=["fr", "all", "cover"],
        default="cover",
        help=mode_help,
    )
    splitting.add_argument(
        "-sz",
        "--seed_size",
        type=int,
        default=20,
        help="Minimum fragment length to keep after splitting (0 disables filtering).",
    )
    splitting.add_argument(
        "--buffer-size",
        type=int,
        default=100,
        help="Chunk size flushed to writer.",
    )
    splitting.add_argument(
        "-b",
        "--borderless",
        action="store_true",
        help="Discard ligation site borders.",
    )
    splitting.add_argument(
        "--tags",
        choices=["origin", "no_annot", "o", "na"],
        default="o",
        help=tags_help,
    )

    # --- Misc ---
    miscellaneous.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (-v: INFO, -vv: DEBUG).",
    )
    miscellaneous.add_argument(
        "--version", action="version", version=f"%(prog)s {__version__}"
    )

    return cli
431
+
432
+
433
def _report_failure(title: str, error: Exception) -> None:
    """Show *error* in a rich panel titled *title*, or as a plain stderr line without rich."""
    if _R is None:
        print(f"ERROR: {error}", file=sys.stderr)
        return
    panel_cls = _R["Panel"]
    _R["console"].print(
        panel_cls(
            f"[bold red]{error}[/bold red]",
            title=title,
            expand=True,
            width=100,
        )
    )


def main_cli(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point.

    Prints the banner, parses and validates the arguments, prints the run
    summary and then calls ``cut``.

    Parameters:
        argv (Optional[list[str]]): Arguments to parse; ``None`` lets argparse
            read ``sys.argv``.

    Returns:
        int: 0 on success, 2 when validation fails, 130 on keyboard
        interrupt, 1 on any other error raised while processing.
    """
    _print_banner()

    parser = build_parser()
    args = parser.parse_args(argv)

    # logging level
    if args.verbose >= 2:
        level = logging.DEBUG
    elif args.verbose == 1:
        level = logging.INFO
    else:
        level = logging.WARNING
    # force=True is required: the root logger is already configured when this
    # module is imported, and basicConfig() is otherwise a no-op in that case,
    # so -v / -vv would have no effect.
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s", force=True)

    # normalize enzymes
    args.enzymes = _parse_enzymes(args.list_enzyme)

    try:
        validate_args(args)
    except ValueError as exc:
        _report_failure("Validation", exc)
        # Help accompanies the rich panel on stdout (print_help's default),
        # and goes to stderr with the plain-text error otherwise.
        if _R is not None:
            parser.print_help()
        else:
            parser.print_help(sys.stderr)
        return 2

    _print_summary(args)

    try:
        cut(
            source_forward=args.source_forward,
            source_reverse=args.source_reverse,
            output_forward=args.output_forward,
            output_reverse=args.output_reverse,
            list_enzyme=args.enzymes,
            mode=args.mode,
            seed_size=args.seed_size,
            tags=args.tags,
            buffer_size=args.buffer_size,
            num_threads=args.num_threads,
            borderless=args.borderless,
        )
    except KeyboardInterrupt:
        return 130
    except Exception as exc:
        # Top-level boundary: report the message, keep the traceback for -vv.
        logging.getLogger(__name__).debug("Processing failed", exc_info=True)
        _report_failure("Runtime error", exc)
        return 1

    return 0
503
+
504
+
505
+ # When executed as a script, run the CLI and use its return value as the exit status.
+ if __name__ == "__main__":
506
+ raise SystemExit(main_cli())