split3c 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- split3c/__init__.py +0 -0
- split3c/cli.py +336 -0
- split3c/nssite/__init__.py +0 -0
- split3c/nssite/auxiliary.py +190 -0
- split3c/nssite/bam.py +299 -0
- split3c/nssite/fastq.py +148 -0
- split3c/nssite/main.py +368 -0
- split3c/nssite/processmanager.py +51 -0
- split3c/nssite/split.py +849 -0
- split3c/resite/__init__.py +33 -0
- split3c/resite/frag.py +576 -0
- split3c/resite/header.py +91 -0
- split3c/resite/index.py +236 -0
- split3c/resite/main.py +506 -0
- split3c/resite/pretreatment.py +299 -0
- split3c/resite/read.py +91 -0
- split3c/resite/write_control.py +111 -0
- split3c/resolve/__init__.py +0 -0
- split3c/resolve/bam.py +129 -0
- split3c/resolve/io_utils.py +77 -0
- split3c/resolve/main.py +506 -0
- split3c/resolve/pairs.py +56 -0
- split3c/resolve/parse.py +1218 -0
- split3c-0.0.1.dist-info/METADATA +100 -0
- split3c-0.0.1.dist-info/RECORD +29 -0
- split3c-0.0.1.dist-info/WHEEL +5 -0
- split3c-0.0.1.dist-info/entry_points.txt +5 -0
- split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
- split3c-0.0.1.dist-info/top_level.txt +1 -0
split3c/nssite/main.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import logging
|
|
26
|
+
import os
|
|
27
|
+
import sys
|
|
28
|
+
from typing import Any, Optional
|
|
29
|
+
|
|
30
|
+
from .split import cut
|
|
31
|
+
|
|
32
|
+
# Version string shown in the banner subtitle and by the --version option.
__version__ = "1.1.0"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _try_rich() -> Optional[dict[str, Any]]:
|
|
36
|
+
try:
|
|
37
|
+
from rich import box
|
|
38
|
+
from rich.console import Console
|
|
39
|
+
from rich.panel import Panel
|
|
40
|
+
from rich.table import Table
|
|
41
|
+
from rich.theme import Theme
|
|
42
|
+
from rich.traceback import install
|
|
43
|
+
from rich_argparse import RichHelpFormatter
|
|
44
|
+
except Exception:
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
console = Console(
|
|
48
|
+
theme=Theme({"info": "dim cyan", "error": "bold red", "warning": "magenta"}),
|
|
49
|
+
width=100,
|
|
50
|
+
)
|
|
51
|
+
install(console=console)
|
|
52
|
+
return {
|
|
53
|
+
"console": console,
|
|
54
|
+
"Panel": Panel,
|
|
55
|
+
"Table": Table,
|
|
56
|
+
"box": box,
|
|
57
|
+
"RichHelpFormatter": RichHelpFormatter,
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Optional rich toolkit, resolved once at import time. It is None when rich or
# rich_argparse cannot be imported; the helpers below then use plain output.
_R = _try_rich()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class MyArgumentParser(argparse.ArgumentParser):
    """ArgumentParser whose usage errors are rendered with rich when available."""

    def error(self, message: str) -> None:
        """Report a command-line usage error and terminate with status 2.

        When the rich toolkit is loaded, the message is shown in a panel
        followed by the full help text. Otherwise the usage line is written to
        stderr and the message is passed to ``exit``.

        Args:
            message: Description of the problem.

        Raises:
            SystemExit: Always, with exit status 2.
        """
        if _R is not None:
            # Imported lazily: rich is only known to be importable when _R is set.
            from rich.markup import escape

            console = _R["console"]
            Panel = _R["Panel"]
            console.print(
                Panel(
                    # Escape the message so text such as "[foo]" coming from
                    # the command line is shown literally instead of being
                    # parsed as console markup.
                    f"[bold red]Error:[/bold red] {escape(message)}",
                    title="Incorrect arguments",
                    expand=True,
                    width=100,
                )
            )
            self.print_help()
            self.exit(2)
        self.print_usage(sys.stderr)
        self.exit(2, f"{self.prog}: error: {message}\n")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def validate_args(args: argparse.Namespace) -> None:
    """Validate parsed command-line arguments before any processing starts.

    Checks are performed in a fixed order and the first failure is reported.

    Args:
        args: Namespace providing ``bam_1``, ``bam_2``, ``single_bam``,
            ``output_forward``, ``output_reverse``, ``num_threads``,
            ``seed_size``, ``lenght_added``, ``pairing_mode`` and ``force``.

    Raises:
        ValueError: If an input is missing, an output location is unusable,
            a numeric option is out of range, both outputs designate the same
            file, or an output already exists while ``force`` is false.
    """

    def _file_exists(path: str, what: str) -> None:
        """Require *path* to be an existing regular file."""
        if not os.path.exists(path):
            raise ValueError(f"{what}: file not found: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"{what}: not a file: {path}")

    def _parent_writable(path: str, what: str) -> None:
        """Require the directory that will contain *path* to exist and be writable."""
        parent = os.path.dirname(os.path.abspath(path)) or os.getcwd()
        if not os.path.exists(parent):
            raise ValueError(f"{what}: parent directory does not exist: {parent}")
        if not os.access(parent, os.W_OK):
            raise ValueError(f"{what}: parent directory not writable: {parent}")

    _file_exists(args.bam_1, "Input BAM")

    if not args.single_bam:
        if args.bam_2 is None:
            raise ValueError("--bam_2 is required unless --single-bam is used")
        _file_exists(args.bam_2, "Reverse BAM")

    _parent_writable(args.output_forward, "Output R1")
    _parent_writable(args.output_reverse, "Output R2")

    if args.num_threads < 7:
        raise ValueError("--num_threads must be >= 7 (recommended >= 12)")

    if args.seed_size < 0:
        raise ValueError("--seed_size must be >= 0")

    if args.lenght_added < 0:
        raise ValueError("--length-added must be >= 0")

    if args.pairing_mode not in ("all", "cover"):
        raise ValueError("--pairing-mode must be one of: all, cover")

    # Compare resolved paths, not raw strings: "out.fq" and "./out.fq" (or a
    # symlink to it) designate the same file and must be rejected as well.
    if os.path.realpath(args.output_forward) == os.path.realpath(args.output_reverse):
        raise ValueError("Output R1 and R2 must be different files")

    if not args.force:
        for p in (args.output_forward, args.output_reverse):
            if os.path.exists(p):
                raise ValueError(
                    f"Output already exists: {p} (use --force to overwrite)"
                )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _formatter_class():
    """Pick the help formatter factory handed to the argument parser.

    Returns:
        ``argparse.RawTextHelpFormatter`` when the rich toolkit is not loaded,
        otherwise a callable taking ``prog`` that builds a
        ``RawTextRichHelpFormatter`` with a fixed help position and width.
    """
    if _R is not None:
        from rich_argparse import RawTextRichHelpFormatter

        def _rich_formatter(prog):
            # The parameter must stay named "prog": it mirrors the original
            # factory's signature so keyword calls keep working.
            return RawTextRichHelpFormatter(prog, max_help_position=42, width=110)

        return _rich_formatter

    return argparse.RawTextHelpFormatter
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _print_banner() -> None:
    """Print the start-up banner panel; a no-op when rich is not loaded."""
    if _R is not None:
        out = _R["console"]
        panel_cls = _R["Panel"]

        body = (
            "[bold blue]Microsplit[/bold blue]\n"
            "Process paired BAM (Micro-C) into paired FASTQ.\n\n"
            "Use --help to see detailed options."
        )
        banner = panel_cls(
            body,
            title="[bold green]microsplit-cut[/bold green]",
            subtitle=f"Version: {__version__}",
            expand=True,
            width=100,
        )
        out.print(banner)
        # Blank line separating the banner from whatever is printed next.
        out.print("")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _print_summary(args: argparse.Namespace) -> None:
    """Print a two-column table of the effective options.

    Does nothing when the rich toolkit is not loaded.

    Args:
        args: Parsed arguments; each listed attribute is shown via ``str``.
    """
    if _R is None:
        return

    out = _R["console"]
    table_cls = _R["Table"]
    box_styles = _R["box"]

    summary = table_cls(
        show_edge=True,
        title="[bold green]Summary[/bold green]",
        box=box_styles.HEAVY,
        width=100,
    )
    summary.add_column("Key", style="cyan", no_wrap=True)
    summary.add_column("Value", style="magenta")

    # (label shown in the table, attribute read from args), in display order.
    displayed = (
        ("--single-bam", "single_bam"),
        ("--bam-1", "bam_1"),
        ("--bam-2", "bam_2"),
        ("--output-forward", "output_forward"),
        ("--output-reverse", "output_reverse"),
        ("--num-threads", "num_threads"),
        ("--seed-size", "seed_size"),
        ("--length-added", "lenght_added"),
        ("--pairing-mode", "pairing_mode"),
        ("--tags", "tags"),
        ("--force", "force"),
        ("--verbose", "verbose"),
    )
    for label, attribute in displayed:
        summary.add_row(label, str(getattr(args, attribute)))

    out.print(summary)
    out.print("")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with every option group registered."""
    parser = MyArgumentParser(
        description=(
            "Process Micro-C BAM input to paired FASTQ.\n"
            "Supports either two BAM files (forward/reverse) or one interleaved BAM."
        ),
        epilog=(
            "Examples:\n"
            " \tmicrosplit -1 fwd.bam -2 rev.bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 -l 0 --pairing-mode cover \n"
            " \tmicrosplit -1 merged.bam --single-bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 --pairing-mode all\n"
        ),
        formatter_class=_formatter_class(),
    )

    req = parser.add_argument_group("Required inputs")
    out = parser.add_argument_group("Outputs")
    perf = parser.add_argument_group("Performance")
    split = parser.add_argument_group("Split parameters")
    misc = parser.add_argument_group("Misc")

    # Every underscore option also accepts its hyphenated spelling, which is
    # the form used in the summary table. ``dest`` is set explicitly so the
    # attribute names on the parsed namespace stay unchanged.
    req.add_argument(
        "-1",
        "--bam_1",
        "--bam-1",
        dest="bam_1",
        type=str,
        required=True,
        help="Path to forward BAM file, or to the single interleaved BAM if --single-bam is used.",
    )
    req.add_argument(
        "-2",
        "--bam_2",
        "--bam-2",
        dest="bam_2",
        type=str,
        required=False,
        default=None,
        help="Path to reverse BAM file. Not required if --single-bam is used.",
    )
    req.add_argument(
        "--single-bam",
        action="store_true",
        help="Use a single interleaved BAM as input (R1 line followed by R2 line with same read name).",
    )

    out.add_argument(
        "-o1",
        "--output_forward",
        "--output-forward",
        dest="output_forward",
        type=str,
        required=True,
        help="Path to output forward FASTQ (R1).",
    )
    out.add_argument(
        "-o2",
        "--output_reverse",
        "--output-reverse",
        dest="output_reverse",
        type=str,
        required=True,
        help="Path to output reverse FASTQ (R2).",
    )
    out.add_argument(
        "--force",
        action="store_true",
        help="Overwrite output files if they already exist.",
    )

    perf.add_argument(
        "-t",
        "--num_threads",
        "--num-threads",
        dest="num_threads",
        type=int,
        default=12,
        help="Total CPU threads budget (>= 7).",
    )

    split.add_argument(
        "-s",
        "--seed_size",
        "--seed-size",
        dest="seed_size",
        type=int,
        default=20,
        help="Minimum fragment length to keep after splitting (0 disables filtering). We strongly advise against using a seed size of less than 10bp for the rest of the treatment (long computing time).",
    )

    split.add_argument(
        "-l",
        "--lenght_added",
        "--length-added",
        dest="lenght_added",
        type=int,
        default=0,
        help="Bases added around split boundary (soft-clip extension).",
    )

    split.add_argument(
        "--pairing-mode",
        choices=["all", "cover"],
        default="cover",
        help=(
            "Pair generation strategy.\n"
            " cover: minimal/quasi-minimal number of pairs so each fragment appears at least once (Doesn't lose data and is optimal for computation time).\n"
            " all: all pairwise combinations."
        ),
    )

    split.add_argument(
        "--tags",
        choices=["origin", "no_annot", "o", "na"],
        default="o",
        help=(
            "Header tagging mode for split reads.\n"
            " origin/o : include fragment origin tags (F0,R1,...)\n"
            " no_annot/na: keep base name only / No annotation (Not recommended if you wish to filter the pairs after the process.)"
        ),
    )

    misc.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    misc.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (-v, -vv).",
    )
    return parser


def _report_error(title: str, exc: BaseException) -> None:
    """Show *exc* in a rich panel titled *title*, or on stderr without rich."""
    if _R is not None:
        # Imported lazily: rich is only known to be importable when _R is set.
        from rich.markup import escape

        Panel = _R["Panel"]
        _R["console"].print(
            Panel(
                # Escape the text so brackets in paths or messages are shown
                # literally instead of being parsed as console markup.
                f"[bold red]{escape(str(exc))}[/bold red]",
                title=title,
                expand=True,
                width=100,
            )
        )
    else:
        print(f"ERROR: {exc}", file=sys.stderr)


def main_cli(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: parse, validate, then run ``cut``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success, 2 when validation fails, 130 on ``KeyboardInterrupt``
        and 1 on any other exception raised by ``cut``.
    """
    _print_banner()

    parser = _build_parser()
    args = parser.parse_args(argv)

    # -v selects INFO, -vv (or more) selects DEBUG; the default is WARNING.
    level = logging.WARNING
    if args.verbose == 1:
        level = logging.INFO
    elif args.verbose >= 2:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(levelname)s: %(message)s")

    try:
        validate_args(args)
    except ValueError as e:
        _report_error("Validation", e)
        # Help goes to the same stream as the error: stdout with rich,
        # stderr in plain mode.
        if _R is not None:
            parser.print_help()
        else:
            parser.print_help(sys.stderr)
        return 2

    _print_summary(args)

    try:
        cut(args)
    except KeyboardInterrupt:
        return 130
    except Exception as e:
        # Keep the traceback reachable (-vv) instead of discarding it.
        logging.getLogger(__name__).debug("Unhandled exception", exc_info=True)
        _report_error("Runtime error", e)
        return 1

    return 0
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
if __name__ == "__main__":
    # Propagate main_cli's status code (0, 1, 2 or 130) to the calling shell;
    # calling it bare would make the script always exit with status 0.
    sys.exit(main_cli())
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from multiprocessing import Process, Queue
|
|
2
|
+
import sys
|
|
3
|
+
import traceback
|
|
4
|
+
|
|
5
|
+
class WorkerProcess(Process):
    """Process that forwards an exception raised by its target to a queue."""

    def __init__(self, target, args, error_queue):
        """Set up the process.

        Args:
            target: Callable executed by ``run``.
            args: Positional arguments passed to ``target``.
            error_queue: Object with a ``put`` method receiving
                ``(message, traceback_text)`` tuples on failure.
        """
        Process.__init__(self, target=target, args=args)
        self.error_queue = error_queue

    def run(self):
        """Execute the target; on failure print, enqueue the error and exit 1."""
        try:
            Process.run(self)
        except Exception as failure:
            print(f"Worker error: {failure}")
            report = (str(failure), traceback.format_exc())
            self.error_queue.put(report)
            # Same effect as sys.exit(1): terminate with a non-zero status.
            raise SystemExit(1)
|
|
17
|
+
|
|
18
|
+
class ProcessManager:
    """Start worker processes and supervise them through a shared error queue."""

    def __init__(self):
        """Create an empty manager with a fresh error queue."""
        # Every WorkerProcess started through start_worker, in start order.
        self.processes = []
        # Receives (message, traceback_text) tuples from failing workers.
        self.error_queue = Queue()

    def start_worker(self, target, args):
        """Create a WorkerProcess for ``target(*args)``, register and start it."""
        self.processes.append(WorkerProcess(target, args, self.error_queue))
        self.processes[-1].start()

    def running(self):
        """Return True while at least one registered process is alive."""
        return any(p.is_alive() for p in self.processes)

    def check_processes(self):
        """Check worker health.

        Returns:
            False when a finished process has a non-zero exit code, or when a
            worker reported an error through the queue (all workers are then
            shut down). True when no problem was found.
        """
        for p in self.processes:
            if not p.is_alive() and p.exitcode != 0:
                print("Worker process crashed!")
                return False

        if not self.error_queue.empty():
            error, tb = self.error_queue.get()
            print(f"Worker error: {error}")
            # The traceback was collected for diagnosis; do not discard it.
            print(tb, file=sys.stderr)
            self.shutdown()
            # A reported error is a failure: returning True here would let
            # the caller treat an aborted run as a healthy one.
            return False
        return True

    def shutdown(self):
        """Terminate and join every process that is still alive."""
        for p in self.processes:
            if p.is_alive():
                p.terminate()
                p.join()

    def handle_signal(self, signum, frame):
        """Signal-handler style callback: stop all workers and exit with 1."""
        print("Received shutdown signal")
        self.shutdown()
        sys.exit(1)
|