split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
split3c/__init__.py ADDED
File without changes
split3c/cli.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+
23
+
24
+
25
+ split3c root command-line interface.
26
+
27
+ This module dispatches the three user-facing subcommands:
28
+
29
+ - split3c re-site
30
+ - split3c ns-site
31
+ - split3c resolve
32
+ """
33
+
34
+ import difflib
35
+ import sys
36
+ from importlib.metadata import PackageNotFoundError
37
+ from importlib.metadata import version as _pkg_version
38
+ from typing import Any, Callable, Optional
39
+
40
+ from .nssite.main import main_cli as nssite_main
41
+ from .resite.main import main_cli as resite_main
42
+ from .resolve.main import main_cli as resolve_main
43
+
44
+ try:
45
+ __version__ = _pkg_version("split3c")
46
+ except PackageNotFoundError:
47
+ __version__ = "0+unknown"
48
+
49
+ DOC_URL = "https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/blob/master/README.md?ref_type=heads"
50
+
51
+ CommandRunner = Callable[[list[str] | None], int]
52
+
53
# Registry of the user-facing subcommands, in the order they are listed in the
# help output. Each entry holds the callable that runs the subcommand
# ("runner") and the short descriptive texts used by the help screens.
COMMANDS: dict[str, dict[str, object]] = {
    "re-site": dict(
        runner=resite_main,
        summary="Restriction-based preprocessing for Hi-C / HiChIP / 3C-like libraries.",
        when="Use when ligation junctions can be derived from known restriction enzymes.",
        inputs="paired FASTQ",
        outputs="split / multiplex FASTQ for remapping",
    ),
    "ns-site": dict(
        runner=nssite_main,
        summary="Non-specific ligation preprocessing for Micro-C-like libraries.",
        when="Use when ligation junctions must be inferred from mapped BAM structure.",
        inputs="mapped BAM",
        outputs="split FASTQ for remapping",
    ),
    "resolve": dict(
        runner=resolve_main,
        summary="Convert mapped or remapped BAM into standard .pairs.",
        when="Use after mapping or remapping, in either simple or split-aware mode.",
        inputs="mapped / remapped BAM",
        outputs=".pairs",
    ),
}

# Unhyphenated spellings accepted as synonyms of the canonical command names.
ALIASES: dict[str, str] = dict(resite="re-site", nssite="ns-site")
89
+
90
+
91
def _try_rich() -> Optional[dict[str, Any]]:
    """Build the optional rich rendering toolkit.

    Returns:
        A mapping with the themed ``console`` and the rich ``Panel``,
        ``Table`` and ``box`` objects, or ``None`` when importing rich fails
        for any reason.

    Side effect:
        On success, rich's traceback handler is installed for the console.
    """
    try:
        from rich import box as rich_box
        from rich.console import Console
        from rich.panel import Panel
        from rich.table import Table
        from rich.theme import Theme
        from rich.traceback import install as install_traceback
    except Exception:
        # Best effort: any failure while importing rich disables rich output.
        return None

    palette = {
        "info": "dim cyan",
        "error": "bold red",
        "warning": "magenta",
        "ok": "bold green",
        "title": "bold green",
        "cmd": "bold cyan",
    }
    console = Console(theme=Theme(palette), width=110)
    install_traceback(console=console)
    return dict(console=console, Panel=Panel, Table=Table, box=rich_box)


# Shared rich toolkit for this module; None means "use plain print output".
_R = _try_rich()
125
+
126
+
127
def _resolve_command_name(name: str) -> str | None:
    """Map a user-typed name to its canonical command name.

    Canonical names (keys of ``COMMANDS``) are returned unchanged, aliases are
    translated through ``ALIASES``, and anything else yields ``None``.
    """
    return name if name in COMMANDS else ALIASES.get(name)
131
+
132
+
133
def _print_banner() -> None:
    """Print the program banner: name, version and a short description.

    When rich is unavailable (``_R is None``) three plain lines are printed.
    Otherwise a rich panel is rendered on the shared console; it also lists
    the three subcommands and shows the version as the panel subtitle.
    """
    if _R is None:
        print(f"split3c {__version__}")
        print("Preprocess 3C-derived libraries and convert BAM to .pairs.")
        print("")
        return

    console = _R["console"]
    Panel = _R["Panel"]
    # User-facing description; spelling corrected from "retrive multiplexes events".
    body = (
        "[bold blue]split3c[/bold blue]\n"
        "Preprocess 3C-derived sequencing libraries and convert mapped/remapped BAM into `.pairs` to retrieve multiplex events.\n\n"
        "Subcommands:\n"
        " [cmd]re-site[/cmd] restriction-enzyme workflow\n"
        " [cmd]ns-site[/cmd] non-specific ligation workflow\n"
        " [cmd]resolve[/cmd] BAM to `.pairs` conversion"
    )
    console.print(
        Panel(
            body,
            title="[title]split3c[/title]",
            subtitle=f"Version: {__version__}",
            expand=True,
            width=110,
        )
    )
    console.print("")
157
+
158
+
159
def _print_command_table() -> None:
    """Print one row per registered subcommand.

    The rich variant renders a four-column table (command, use case, input,
    output) in which the "Use case" column shows each entry's ``summary``
    text. The plain variant prints the command name followed by its summary.
    """
    if _R is not None:
        console = _R["console"]
        table_cls = _R["Table"]
        box_styles = _R["box"]

        table = table_cls(
            show_edge=True,
            title="[title]Commands[/title]",
            box=box_styles.HEAVY,
            width=110,
        )
        column_specs = (
            ("Command", {"style": "cyan", "no_wrap": True}),
            ("Use case", {"style": "magenta"}),
            ("Input", {"style": "green", "no_wrap": True}),
            ("Output", {"style": "yellow"}),
        )
        for header, options in column_specs:
            table.add_column(header, **options)

        for name, meta in COMMANDS.items():
            cells = [str(meta[field]) for field in ("summary", "inputs", "outputs")]
            table.add_row(name, *cells)

        console.print(table)
        console.print("")
        return

    print("Commands:")
    for name, meta in COMMANDS.items():
        print(f" {name:<10} {meta['summary']}")
    print("")
192
+
193
+
194
def _print_workflows() -> None:
    """Print the typical command sequences for each library type.

    Plain lines are printed when rich is unavailable (``_R is None``);
    otherwise the workflows are rendered in a rich panel on the shared console.
    """
    if _R is not None:
        console = _R["console"]
        panel_cls = _R["Panel"]
        body = (
            "[bold]Restriction-based libraries[/bold]\n"
            " FASTQ -> split3c re-site -> MAPPING -> split3c resolve --split\n\n"
            "[bold]Micro-C-like libraries[/bold]\n"
            " first-pass MAPPING -> split3c ns-site -> REMAPPING -> split3c resolve --split\n\n"
            "[bold]Classic BAM to `.pairs`[/bold]\n"
            " mapped BAM -> split3c resolve --simple"
        )
        console.print(
            panel_cls(
                body,
                title="[title]Typical workflows[/title]",
                expand=True,
                width=110,
            )
        )
        console.print("")
        return

    plain_lines = (
        "Typical workflows: How to catch multiplexes ",
        "",
        " Restriction-based libraries:",
        " FASTQ -> split3c re-site -> MAPPING -> split3c resolve --split",
        "",
        " Micro-C-like libraries:",
        " first-pass MAPPING -> split3c ns-site -> REMAPPING -> split3c resolve --split",
        "",
        " Classic BAM to .pairs: Only duplex informations",
        " mapped BAM -> split3c resolve --simple",
        "",
    )
    for line in plain_lines:
        print(line)
228
+
229
+
230
def _print_footer() -> None:
    """Print example invocations and the documentation link (``DOC_URL``).

    Plain lines are printed when rich is unavailable (``_R is None``);
    otherwise the same information is rendered in a rich panel.
    """
    if _R is not None:
        console = _R["console"]
        panel_cls = _R["Panel"]
        body = (
            "[bold]Examples[/bold]\n"
            " split3c re-site --help\n"
            " split3c ns-site --help\n"
            " split3c resolve --help\n"
            " split3c help resolve\n\n"
            "[bold]Documentation[/bold]\n"
            f" {DOC_URL}"
        )
        console.print(
            panel_cls(
                body,
                title="[title]Getting started[/title]",
                expand=True,
                width=110,
            )
        )
        console.print("")
        return

    plain_lines = (
        "Examples:",
        " split3c re-site --help",
        " split3c ns-site --help",
        " split3c resolve --help",
        " split3c help resolve",
        "",
        f"Documentation:\n {DOC_URL}",
        "",
    )
    for line in plain_lines:
        print(line)
260
+
261
+
262
def _print_root_help(return_code: int = 0) -> int:
    """Print the full root help screen and hand back ``return_code``.

    The sections are emitted in order: banner, command table, workflows,
    footer. ``return_code`` is returned unchanged so callers can write
    ``return _print_root_help(...)``.
    """
    sections = (
        _print_banner,
        _print_command_table,
        _print_workflows,
        _print_footer,
    )
    for render_section in sections:
        render_section()
    return return_code
268
+
269
+
270
def _print_error(message: str, title: str = "Error") -> None:
    """Report an error message to the user.

    Parameters:
        message: Text of the error. It is treated as literal text, never as
            rich markup.
        title: Short heading; used as the line prefix in plain mode and as the
            panel title in rich mode.

    When rich is unavailable (``_R is None``) the message goes to
    ``sys.stderr``; otherwise it is rendered as a red panel on the shared
    console.
    """
    if _R is None:
        print(f"{title}: {message}", file=sys.stderr)
        return

    # rich was imported successfully whenever _R is set, so this import is safe.
    from rich.markup import escape

    console = _R["console"]
    Panel = _R["Panel"]
    # The message may embed user input (e.g. an unknown command name typed as
    # "[/x]"); escape it so square brackets are shown verbatim instead of
    # being parsed as markup tags.
    console.print(
        Panel(
            f"[bold red]{escape(message)}[/bold red]",
            title=title,
            expand=True,
            width=110,
        )
    )
285
+
286
+
287
def _dispatch(command_name: str, argv: list[str]) -> int:
    """Run the subcommand called ``command_name`` with ``argv``.

    Parameters:
        command_name: Canonical command name or alias typed by the user.
        argv: Arguments forwarded to the subcommand runner.

    Returns:
        The subcommand's exit status. An unknown command prints an error
        (with the closest known name as a suggestion, if any) followed by the
        root help, and returns 2.

    A ``SystemExit`` raised by the runner is converted to a status following
    the interpreter's own rules: ``None`` means success (0), an integer is
    used as is, and any other object is printed to stderr with status 1.
    """
    resolved = _resolve_command_name(command_name)
    if resolved is None:
        known = list(COMMANDS) + list(ALIASES)
        suggestions = difflib.get_close_matches(command_name, known, n=1, cutoff=0.55)
        msg = f"Unknown command: {command_name}"
        if suggestions:
            # Always suggest the canonical spelling, even if an alias matched.
            best = _resolve_command_name(suggestions[0]) or suggestions[0]
            msg += f"\nDid you mean: {best} ?"
        _print_error(msg, title="Unknown command")
        _print_root_help(return_code=2)
        return 2

    runner = COMMANDS[resolved]["runner"]
    # COMMANDS values are typed as ``object``; this narrows the type.
    assert callable(runner)

    try:
        # Runners may return None on success; normalise that to 0.
        return int(runner(argv) or 0)
    except SystemExit as exc:
        code = exc.code
        if code is None:
            # A bare ``sys.exit()`` signals success, not failure.
            return 0
        if isinstance(code, int):
            return code
        # Non-integer codes are messages: show them instead of dropping them.
        print(code, file=sys.stderr)
        return 1
309
+
310
+
311
def main(argv: list[str] | None = None) -> int:
    """Entry point of the ``split3c`` root command.

    Parameters:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        Process exit status: 1 when called without arguments (help is shown),
        0 for ``-h``/``--help``/``--version``/``help``, otherwise the status
        of the dispatched subcommand.
    """
    args = sys.argv[1:] if argv is None else argv

    if not args:
        return _print_root_help(return_code=1)

    head, tail = args[0], args[1:]

    if head in {"-h", "--help"}:
        return _print_root_help(return_code=0)

    if head == "--version":
        print(f"split3c {__version__}")
        return 0

    if head != "help":
        return _dispatch(head, tail)

    # "split3c help" alone shows the root help; "split3c help <cmd> ..." is
    # rewritten to "<cmd> --help ...".
    if not tail:
        return _print_root_help(return_code=0)
    return _dispatch(tail[0], ["--help", *tail[1:]])


if __name__ == "__main__":
    raise SystemExit(main())
File without changes
@@ -0,0 +1,190 @@
1
def signal_handler(sig, frame, out_f, out_r=None):
    """
    Terminate the output processes, then exit the interpreter.

    Parameters:
        sig (int): Signal number (unused).
        frame (frame object): Current stack frame (unused).
        out_f: Object with a ``terminate()`` method for the forward output,
            or None.
        out_r: Object with a ``terminate()`` method for the reverse output,
            or None.

    Raises:
        SystemExit: Always, after the non-None processes were terminated.

    Examples
    --------
    >>> class _P:
    ...     def __init__(self):
    ...         self.terminated = False
    ...     def terminate(self):
    ...         self.terminated = True
    ...
    >>> pf, pr = _P(), _P()
    >>> try:
    ...     signal_handler(None, None, pf, pr)
    ... except SystemExit:
    ...     pass
    >>> pf.terminated, pr.terminated
    (True, True)

    >>> pf = _P()
    >>> try:
    ...     signal_handler(None, None, pf, None)
    ... except SystemExit:
    ...     pass
    >>> pf.terminated
    True
    """
    import sys

    # Forward output first, then reverse; missing (None) processes are skipped.
    for process in (out_f, out_r):
        if process is not None:
            process.terminate()
    sys.exit()
42
+
43
+
44
def partitionning(num_threads: int, single_bam: bool = False) -> tuple[int, int, int]:
    """
    Empirical heuristic for partitioning resources for microsplit.

    Returns:
        pigz_threads_per_file : pigz threads per file (F and R)
        compute_processes : number of process_items workers
        bam_threads : pysam/htslib threads per file (reading AND writing)

    Raises:
        ValueError: if ``num_threads`` is below 4.

    IMPORTANT
    ---------
    This function is deliberately empirical (CPU oversubscription accepted).
    `num_threads` is a *hint* of available cores, not a strict budget.

    Calibration points (observed benchmarks)
    ----------------------------------------
    - 4 cores  -> (1, 1, 1)
    - 8 cores  -> (2, 3, 1)
    - 16 cores -> (3, 4, 3)

    With single_bam=True:
    - BAM threads are doubled, because a single BAM stream must feed the
      whole pipeline
    - pigz_threads_per_file and compute_processes are unchanged

    Doctests
    --------
    >>> partitionning(3)
    Traceback (most recent call last):
    ...
    ValueError: Run with --threads >= 4.
    >>> partitionning(4)
    (1, 1, 1)
    >>> partitionning(8)
    (2, 3, 1)
    >>> partitionning(8, single_bam=True)
    (2, 3, 2)
    >>> partitionning(12)
    (2, 3, 2)
    >>> partitionning(12, single_bam=True)
    (2, 3, 4)
    >>> partitionning(16)
    (3, 4, 3)
    >>> partitionning(16, single_bam=True)
    (3, 4, 6)
    """
    if num_threads < 4:
        raise ValueError("Run with --threads >= 4.")

    # (inclusive upper bound on num_threads, (pigz, compute, bam)), ascending.
    tiers = (
        (5, (1, 1, 1)),    # 4 cores
        (7, (1, 2, 1)),    # transition towards 8 cores
        (10, (2, 3, 1)),   # 8 cores
        (12, (2, 3, 2)),   # transition towards 16 cores
        (14, (3, 4, 2)),
    )
    # First tier whose bound covers num_threads; 16 cores and more otherwise.
    allocation = next(
        (counts for ceiling, counts in tiers if num_threads <= ceiling),
        (3, 4, 3),
    )
    pigz_threads_per_file, compute_processes, bam_threads = allocation

    if single_bam:
        bam_threads *= 2

    return pigz_threads_per_file, compute_processes, bam_threads
119
+
120
+
121
def check_data(els):
    """Return True when no element of ``els`` is None (True for empty input)."""
    return all(item is not None for item in els)
126
+
127
+
128
def write_command_txt(args, resolved: dict):
    """
    Write a small execution report into ``command.txt``.

    The file is created in the current working directory (overwriting any
    existing one) and is always encoded as UTF-8, so non-ASCII arguments and
    paths are written identically regardless of the platform locale.

    The report contains:
    - UTC timestamp
    - current working directory
    - reconstructed command line (shell-quoted ``sys.argv``)
    - raw CLI arguments (attributes of ``args``, sorted by name)
    - resolved runtime values (items of ``resolved``, sorted by key)

    Parameters:
        args: Object whose attributes are read with ``vars()``.
        resolved (dict): Extra key/value pairs to record.

    Examples
    --------
    >>> import os, sys, tempfile
    >>> from types import SimpleNamespace
    >>> old_cwd = os.getcwd()
    >>> old_argv = sys.argv[:]
    >>> with tempfile.TemporaryDirectory() as td:
    ...     os.chdir(td)
    ...     sys.argv = ["prog", "--foo", "bar"]
    ...     args = SimpleNamespace(alpha=1, beta="x")
    ...     write_command_txt(args, {"gamma": 3})
    ...     with open("command.txt", "r", encoding="utf-8") as fh:
    ...         txt = fh.read()
    ...     ok = all(x in txt for x in ["command:", "cli_args:", "resolved:", "alpha: 1", "beta: x", "gamma: 3"])
    ...     os.chdir(old_cwd)
    ...     sys.argv = old_argv
    ...     ok
    True
    """
    import os
    import shlex
    import sys
    from datetime import datetime, timezone

    # Quote every argument so the recorded line can be pasted back in a shell.
    cmd = " ".join(shlex.quote(x) for x in sys.argv)

    p = os.path.join("./", "command.txt")
    # Explicit UTF-8: the locale default may be unable to encode some values.
    with open(p, "w", encoding="utf-8") as f:
        f.write(f"timestamp_utc: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"cwd: {os.getcwd()}\n\n")
        f.write("command:\n")
        f.write(cmd + "\n\n")
        f.write("cli_args:\n")
        for k, v in sorted(vars(args).items()):
            f.write(f" {k}: {v}\n")
        f.write("\nresolved:\n")
        for k, v in sorted(resolved.items()):
            f.write(f" {k}: {v}\n")
176
+
177
+
178
def handle_write_cmd(
    bam_1, bam_2, output_fq1, output_fq2, output_bam1, output_bam2, args
):
    """Record the resolved input/output values in ``command.txt``.

    The six positional values are stored under fixed labels, together with
    ``args.single_bam`` (``False`` when the attribute is missing), and passed
    to ``write_command_txt`` along with ``args``.
    """
    labels = (
        "BAM_R1_or_single",
        "BAM_R2",
        "output_Fastq_R1",
        "output_Fastq_R2",
        "output_bam1_or_single_unsplit",
        "output_bam2_unsplit",
    )
    values = (bam_1, bam_2, output_fq1, output_fq2, output_bam1, output_bam2)

    resolved = dict(zip(labels, values))
    resolved["single_bam"] = getattr(args, "single_bam", False)
    write_command_txt(args=args, resolved=resolved)