split3c 0.0.1__tar.gz → 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {split3c-0.0.1/src/split3c.egg-info → split3c-0.0.2}/PKG-INFO +6 -7
- {split3c-0.0.1 → split3c-0.0.2}/README.md +5 -6
- {split3c-0.0.1 → split3c-0.0.2}/pyproject.toml +1 -1
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/cli.py +2 -2
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/auxiliary.py +49 -15
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/bam.py +76 -43
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/fastq.py +24 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/main.py +4 -4
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/processmanager.py +26 -1
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/split.py +48 -26
- split3c-0.0.2/src/split3c/resite/__init__.py +4 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/frag.py +2 -2
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/main.py +5 -5
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/pretreatment.py +27 -15
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/bam.py +26 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/io_utils.py +26 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/main.py +239 -122
- split3c-0.0.2/src/split3c/resolve/pairs.py +146 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/parse.py +682 -302
- {split3c-0.0.1 → split3c-0.0.2/src/split3c.egg-info}/PKG-INFO +6 -7
- split3c-0.0.1/src/split3c/resite/__init__.py +0 -33
- split3c-0.0.1/src/split3c/resolve/pairs.py +0 -56
- {split3c-0.0.1 → split3c-0.0.2}/LICENSE +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/setup.cfg +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/__init__.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/__init__.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/header.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/index.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/read.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/write_control.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/__init__.py +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/SOURCES.txt +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/dependency_links.txt +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/entry_points.txt +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/requires.txt +0 -0
- {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: split3c
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.2
|
|
4
4
|
Summary: Toolkit to split and resolve chimeric 3C/Hi-C/Micro-C reads
|
|
5
5
|
Author-email: Samir Bertache <samir.bertache.djenadi@gmail.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -23,8 +23,8 @@ Requires-Dist: build>=1.2.0; extra == "dev"
|
|
|
23
23
|
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
24
24
|
Dynamic: license-file
|
|
25
25
|
|
|
26
|
-
[](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
|
|
27
|
+
[](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
|
|
28
28
|
|
|
29
29
|
# `split3c`
|
|
30
30
|
|
|
@@ -76,7 +76,7 @@ split3c resolve --help
|
|
|
76
76
|
|
|
77
77
|
Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
|
|
78
78
|
|
|
79
|
-

|
|
80
80
|
|
|
81
81
|
---
|
|
82
82
|
|
|
@@ -84,17 +84,16 @@ Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
|
|
|
84
84
|
|
|
85
85
|
Non-specific ligation workflow for Micro-C-like libraries.
|
|
86
86
|
|
|
87
|
-

|
|
88
88
|
|
|
89
89
|
---
|
|
90
90
|
|
|
91
91
|
## Benchmark
|
|
92
92
|
|
|
93
|
-

|
|
94
94
|
|
|
95
95
|
---
|
|
96
96
|
|
|
97
97
|
## License
|
|
98
98
|
|
|
99
99
|
split3c is released under the AGPLv3 license.
|
|
100
|
-
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
[](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
|
|
2
|
+
[](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
|
|
3
3
|
|
|
4
4
|
# `split3c`
|
|
5
5
|
|
|
@@ -51,7 +51,7 @@ split3c resolve --help
|
|
|
51
51
|
|
|
52
52
|
Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
|
|
53
53
|
|
|
54
|
-

|
|
55
55
|
|
|
56
56
|
---
|
|
57
57
|
|
|
@@ -59,17 +59,16 @@ Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
|
|
|
59
59
|
|
|
60
60
|
Non-specific ligation workflow for Micro-C-like libraries.
|
|
61
61
|
|
|
62
|
-

|
|
63
63
|
|
|
64
64
|
---
|
|
65
65
|
|
|
66
66
|
## Benchmark
|
|
67
67
|
|
|
68
|
-

|
|
69
69
|
|
|
70
70
|
---
|
|
71
71
|
|
|
72
72
|
## License
|
|
73
73
|
|
|
74
74
|
split3c is released under the AGPLv3 license.
|
|
75
|
-
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
|
-
This script is a the split3c project
|
|
2
|
+
This script is part of the split3c project; split3c is a toolkit for preprocessing 3C-type sequencing libraries and converting BAM alignments into .pairs files for chromatin contact analysis.
|
|
3
3
|
|
|
4
|
-
Copyright ©
|
|
4
|
+
Copyright © 2026 Samir Bertache
|
|
5
5
|
|
|
6
6
|
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
7
|
|
|
@@ -1,3 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
1
25
|
def signal_handler(sig, frame, out_f, out_r=None):
|
|
2
26
|
"""
|
|
3
27
|
Handle termination signals to gracefully terminate processes.
|
|
@@ -43,27 +67,37 @@ def signal_handler(sig, frame, out_f, out_r=None):
|
|
|
43
67
|
|
|
44
68
|
def partitionning(num_threads: int, single_bam: bool = False) -> tuple[int, int, int]:
|
|
45
69
|
"""
|
|
46
|
-
|
|
70
|
+
Empirical resource partitioning heuristic for microsplit.
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
|
|
74
|
+
pigz_threads_per_file: pigz threads per file (F and R)
|
|
47
75
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
bam_threads : threads pysam/htslib par fichier (lecture ET écriture)
|
|
76
|
+
compute_processes: number of process_items
|
|
77
|
+
|
|
78
|
+
bam_threads: pysam/htslib threads per file (read AND write)
|
|
52
79
|
|
|
53
80
|
IMPORTANT
|
|
54
|
-
---------
|
|
55
|
-
Cette fonction est volontairement empirique (surallocation CPU acceptée).
|
|
56
|
-
`num_threads` est un *hint* de cœurs disponibles, pas un budget strict.
|
|
57
81
|
|
|
58
|
-
|
|
82
|
+
---------
This function is intentionally empirical (CPU overallocation is accepted).
|
|
83
|
+
|
|
84
|
+
`num_threads` is a *hint* of available cores, not a strict budget.
|
|
85
|
+
|
|
86
|
+
Calibration points (observed benchmarks)
|
|
87
|
+
|
|
59
88
|
--------------------------------------
|
|
60
|
-
- 4 cœurs -> (1, 1, 1)
|
|
61
|
-
- 8 cœurs -> (2, 3, 1)
|
|
62
|
-
- 16 cœurs -> (3, 4, 3)
|
|
63
89
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
-
|
|
90
|
+
- 4 cores -> (1, 1, 1)
|
|
91
|
+
|
|
92
|
+
- 8 cores -> (2, 3, 1)
|
|
93
|
+
|
|
94
|
+
- 16 cores -> (3, 4, 3)
|
|
95
|
+
|
|
96
|
+
In single_bam=True mode:
|
|
97
|
+
|
|
98
|
+
- The number of BAM threads is doubled, as a single BAM stream must feed the entire pipeline.
|
|
99
|
+
|
|
100
|
+
- pigz_per_file and compute_processes remain unchanged.
|
|
67
101
|
|
|
68
102
|
Doctests
|
|
69
103
|
--------
|
|
@@ -1,7 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
1
25
|
def get_bam_headers(bam_for_path, bam_rev_path):
|
|
2
26
|
"""
|
|
3
|
-
|
|
4
|
-
|
|
27
|
+
Open the BAM files, extract their headers as a dictionary,
|
|
28
|
+
and close the files immediately.
|
|
5
29
|
|
|
6
30
|
Returns:
|
|
7
31
|
tuple: (header_dict_forward, header_dict_reverse)
|
|
@@ -76,7 +100,7 @@ def write_bam_pair_from_sam(
|
|
|
76
100
|
bam_threads=1,
|
|
77
101
|
):
|
|
78
102
|
"""
|
|
79
|
-
|
|
103
|
+
Writes the BAM pairs using the provided header dictionaries.
|
|
80
104
|
"""
|
|
81
105
|
import sys
|
|
82
106
|
|
|
@@ -122,8 +146,9 @@ def write_bam_pair_from_sam(
|
|
|
122
146
|
|
|
123
147
|
def get_bam_header_single(bam_path):
|
|
124
148
|
"""
|
|
125
|
-
|
|
126
|
-
|
|
149
|
+
Opens a single BAM file, extracts its header as a dictionary,
|
|
150
|
+
then closes the file.
|
|
151
|
+
|
|
127
152
|
|
|
128
153
|
Returns
|
|
129
154
|
-------
|
|
@@ -138,23 +163,25 @@ def get_bam_header_single(bam_path):
|
|
|
138
163
|
|
|
139
164
|
def _pair_reads_from_single_bam(read_a, read_b, strict=True):
|
|
140
165
|
"""
|
|
141
|
-
|
|
166
|
+
Orders two reads from a BAM interleaved as (forward/read1, reverse/read2).
|
|
142
167
|
|
|
143
|
-
|
|
168
|
+
Parameters
|
|
144
169
|
----------
|
|
145
|
-
read_a, read_b
|
|
146
|
-
strict
|
|
147
|
-
|
|
148
|
-
|
|
170
|
+
read_a, read_b: pysam.AlignedSegment
|
|
171
|
+
strict: bool
|
|
172
|
+
If True, raises an error in case of a strong inconsistency.
|
|
173
|
+
|
|
174
|
+
If False, attempts a fallback in order of appearance.
|
|
149
175
|
|
|
150
176
|
Returns
|
|
151
|
-
|
|
177
|
+
------
|
|
152
178
|
tuple
|
|
153
|
-
|
|
179
|
+
|
|
180
|
+
(read_for, read_rev)
|
|
154
181
|
|
|
155
182
|
Notes
|
|
156
183
|
-----
|
|
157
|
-
|
|
184
|
+
Flags 0x40 / 0x80 (is_read1 / is_read2) are used as the primary method.
|
|
158
185
|
"""
|
|
159
186
|
if read_a is None or read_b is None:
|
|
160
187
|
raise ValueError("Pairing failure: one of the reads is None.")
|
|
@@ -178,28 +205,30 @@ def read_bam_interleaved(
|
|
|
178
205
|
strict=True,
|
|
179
206
|
):
|
|
180
207
|
"""
|
|
181
|
-
|
|
208
|
+
Reads a single interleaved BAM (a forward/read1 line followed by a reverse/read2 line) and sends batches of SAM pairs to input_queue.
|
|
209
|
+
|
|
210
|
+
Output contract identical to read_bam_pair:
|
|
182
211
|
|
|
183
|
-
|
|
184
|
-
batch = list[(sam_f, sam_r)]
|
|
212
|
+
batch = list[(sam_f, sam_r)]
|
|
185
213
|
|
|
186
|
-
|
|
214
|
+
Parameters
|
|
187
215
|
----------
|
|
188
|
-
bam_file
|
|
189
|
-
|
|
190
|
-
input_queue
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
216
|
+
bam_file: str
|
|
217
|
+
Path to a single interleaved BAM.
|
|
218
|
+
input_queue: multiprocessing.Queue
|
|
219
|
+
|
|
220
|
+
num_processes: int
|
|
221
|
+
Number of compute workers, used to send one `None` sentinel per worker (presumably; confirm against the function body).
|
|
222
|
+
bam_threads: int
|
|
223
|
+
Pysam/htslib threads.
|
|
224
|
+
batch_size: int
|
|
225
|
+
Batch size.
|
|
226
|
+
strict: bool
|
|
227
|
+
If True, fails if the pairs are not perfectly matched.
|
|
228
|
+
|
|
229
|
+
Requirements
|
|
201
230
|
---------
|
|
202
|
-
|
|
231
|
+
The BAM must be sorted by read name or, at a minimum, have the two reads of each pair on consecutive lines.
|
|
203
232
|
"""
|
|
204
233
|
import sys
|
|
205
234
|
|
|
@@ -250,21 +279,25 @@ def write_bam_interleaved_from_sam(
|
|
|
250
279
|
bam_threads=1,
|
|
251
280
|
):
|
|
252
281
|
"""
|
|
253
|
-
|
|
282
|
+
Writes non-splittable BAM pairs into a single interleaved BAM.
|
|
254
283
|
|
|
255
|
-
|
|
256
|
-
queue contient des batchs list[(sam_f, sam_r)]
|
|
284
|
+
Input contract:
|
|
257
285
|
|
|
258
|
-
|
|
259
|
-
un seul BAM avec read1 puis read2 à la suite.
|
|
286
|
+
queue contains batches `list[(sam_f, sam_r)]`
|
|
260
287
|
|
|
261
|
-
|
|
288
|
+
Output:
|
|
289
|
+
|
|
290
|
+
a single BAM with `read1` followed by `read2`.
|
|
291
|
+
|
|
292
|
+
Parameters
|
|
262
293
|
----------
|
|
263
|
-
queue
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
294
|
+
queue: multiprocessing.Queue
|
|
295
|
+
|
|
296
|
+
out_bam_path: str
|
|
297
|
+
|
|
298
|
+
header_dict: dict
|
|
299
|
+
num_procs_finished_signal: int
|
|
300
|
+
bam_threads: int
|
|
268
301
|
"""
|
|
269
302
|
import sys
|
|
270
303
|
|
|
@@ -1,3 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
1
25
|
def open_output(output_forward, output_reverse, write_processes):
|
|
2
26
|
"""
|
|
3
27
|
Open output files for writing with pigz compression.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
This script is a the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
3
|
|
|
4
4
|
Copyright © 2024 Samir Bertache
|
|
5
5
|
|
|
@@ -150,7 +150,7 @@ def _print_banner() -> None:
|
|
|
150
150
|
"[bold blue]Microsplit[/bold blue]\n"
|
|
151
151
|
"Process paired BAM (Micro-C) into paired FASTQ.\n\n"
|
|
152
152
|
"Use --help to see detailed options.",
|
|
153
|
-
title="[bold green]
|
|
153
|
+
title="[bold green]split3c nssite-cut[/bold green]",
|
|
154
154
|
subtitle=f"Version: {__version__}",
|
|
155
155
|
expand=True,
|
|
156
156
|
width=100,
|
|
@@ -202,8 +202,8 @@ def main_cli(argv: Optional[list[str]] = None) -> int:
|
|
|
202
202
|
),
|
|
203
203
|
epilog=(
|
|
204
204
|
"Examples:\n"
|
|
205
|
-
" \
|
|
206
|
-
" \
|
|
205
|
+
" \tsplit3c nssite -1 fwd.bam -2 rev.bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 -l 0 --pairing-mode cover \n"
|
|
206
|
+
" \tsplit3c nssite -1 merged.bam --single-bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 --pairing-mode all\n"
|
|
207
207
|
),
|
|
208
208
|
formatter_class=_formatter_class(),
|
|
209
209
|
)
|
|
@@ -1,6 +1,30 @@
|
|
|
1
|
-
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
2
24
|
import sys
|
|
3
25
|
import traceback
|
|
26
|
+
from multiprocessing import Process, Queue
|
|
27
|
+
|
|
4
28
|
|
|
5
29
|
class WorkerProcess(Process):
|
|
6
30
|
def __init__(self, target, args, error_queue):
|
|
@@ -15,6 +39,7 @@ class WorkerProcess(Process):
|
|
|
15
39
|
self.error_queue.put((str(e), traceback.format_exc()))
|
|
16
40
|
sys.exit(1)
|
|
17
41
|
|
|
42
|
+
|
|
18
43
|
class ProcessManager:
|
|
19
44
|
def __init__(self):
|
|
20
45
|
self.processes = []
|
|
@@ -1,3 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. It is intended for analysing Micro-C/CAD-C data.
|
|
3
|
+
|
|
4
|
+
Copyright © 2024 Samir Bertache
|
|
5
|
+
|
|
6
|
+
SPDX-License-Identifier: AGPL-3.0-or-later
|
|
7
|
+
|
|
8
|
+
===============================================================================
|
|
9
|
+
|
|
10
|
+
This program is free software: you can redistribute it and/or modify it under
|
|
11
|
+
the terms of the GNU Affero General Public License as published by the
|
|
12
|
+
Free Software Foundation, either version 3 of the License, or (at your option)
|
|
13
|
+
any later version.
|
|
14
|
+
|
|
15
|
+
This program is distributed in the hope that it will be useful,
|
|
16
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
17
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
18
|
+
See the GNU Affero General Public License for more details.
|
|
19
|
+
|
|
20
|
+
You should have received a copy of the GNU Affero General Public License
|
|
21
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
22
|
+
"""
|
|
23
|
+
|
|
1
24
|
import logging
|
|
2
25
|
import os
|
|
3
26
|
import signal
|
|
@@ -256,14 +279,15 @@ def process_cigard(name, sequence, quality, cigar, seed_size, len_add):
|
|
|
256
279
|
|
|
257
280
|
def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
|
|
258
281
|
"""
|
|
259
|
-
|
|
282
|
+
Constructs a pair header from a read name and two tags.
|
|
283
|
+
|
|
284
|
+
`base_name`: logical name of the read (e.g., '@READ')
|
|
285
|
+
`tag_i`, `tag_j`: fragment identifiers, typically 'F1', 'R1', etc. (1-based)
|
|
286
|
+
`tot_for`, `tot_rev`: total number of forward/reverse fragments
|
|
260
287
|
|
|
261
|
-
|
|
262
|
-
tag_i, tag_j : identifiants de fragments, typiquement 'F1', 'R1', etc. (1-based)
|
|
263
|
-
tot_for, tot_rev : nombres totaux de fragments forward / reverse
|
|
288
|
+
Return (origin/reverse mode):
|
|
264
289
|
|
|
265
|
-
|
|
266
|
-
'<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]'
|
|
290
|
+
'<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]'
|
|
267
291
|
|
|
268
292
|
Examples
|
|
269
293
|
--------
|
|
@@ -288,7 +312,7 @@ def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
|
|
|
288
312
|
|
|
289
313
|
def _fraglist_to_entries(frag_list, origin):
|
|
290
314
|
"""
|
|
291
|
-
|
|
315
|
+
Transforms a list of FASTQ fragments into tuples (origin, idx, seq, qual).
|
|
292
316
|
|
|
293
317
|
Examples
|
|
294
318
|
--------
|
|
@@ -308,7 +332,7 @@ def _fraglist_to_entries(frag_list, origin):
|
|
|
308
332
|
|
|
309
333
|
def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
|
|
310
334
|
"""
|
|
311
|
-
|
|
335
|
+
Constructs a textual FASTQ pair from two inputs.
|
|
312
336
|
|
|
313
337
|
e = (origin, idx, seq, qual)
|
|
314
338
|
"""
|
|
@@ -326,15 +350,15 @@ def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
|
|
|
326
350
|
|
|
327
351
|
def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
|
|
328
352
|
"""
|
|
329
|
-
|
|
330
|
-
|
|
353
|
+
Generates a minimal (or near-minimal) number of pairs so that
|
|
354
|
+
each fragment appears at least once.
|
|
331
355
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
356
|
+
Strategy:
|
|
357
|
+
1. Match F-R whenever possible
|
|
358
|
+
2. Match remainders within the same side
|
|
359
|
+
3. If a fragment remains alone, reattach it to an already used anchor
|
|
336
360
|
|
|
337
|
-
|
|
361
|
+
Complexity: O(F + R)
|
|
338
362
|
|
|
339
363
|
Examples
|
|
340
364
|
--------
|
|
@@ -425,14 +449,14 @@ def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
|
|
|
425
449
|
|
|
426
450
|
def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
|
|
427
451
|
"""
|
|
428
|
-
|
|
452
|
+
Generates two FASTQ strings (forward/reverse) from already split fragments.
|
|
429
453
|
|
|
430
|
-
`frags_f` et `frags_r`
|
|
454
|
+
`frags_f` and `frags_r` are lists of complete FASTQ fragments:
|
|
431
455
|
'@name\\nSEQ\\n+\\nQUAL\\n'
|
|
432
456
|
|
|
433
457
|
Examples
|
|
434
458
|
--------
|
|
435
|
-
|
|
459
|
+
Simple case: 1 forward fragment, 1 reverse fragment (only one combination).
|
|
436
460
|
>>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
|
|
437
461
|
>>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
|
|
438
462
|
>>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
|
|
@@ -441,14 +465,14 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
|
|
|
441
465
|
>>> R
|
|
442
466
|
'@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'
|
|
443
467
|
|
|
444
|
-
|
|
468
|
+
Same case, without tags (`tags="nt"`).
|
|
445
469
|
>>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="nt")
|
|
446
470
|
>>> F
|
|
447
471
|
'@READ\\nAC\\n+\\n??\\n'
|
|
448
472
|
>>> R
|
|
449
473
|
'@READ\\nTG\\n+\\n!!\\n'
|
|
450
474
|
|
|
451
|
-
|
|
475
|
+
Combinatorial case :
|
|
452
476
|
- forward: 2 fragments (F0, F1)
|
|
453
477
|
- reverse: 1 fragment (R0)
|
|
454
478
|
-> combinations: (F0,F1), (F0,R0), (F1,R0)
|
|
@@ -470,7 +494,7 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
|
|
|
470
494
|
>>> R
|
|
471
495
|
'@READ:[F1,F2:FT2,RT1]\\nBCD\\n+\\n===\\n@READ:[F1,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n@READ:[F2,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n'
|
|
472
496
|
|
|
473
|
-
|
|
497
|
+
Symmetric combinatorial case (2 fragments forward, 2 fragments reverse).
|
|
474
498
|
>>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nBC\\n+\\n!!\\n"]
|
|
475
499
|
>>> frags_r = ["@x\\nD\\n+\\n#\\n", "@x\\nEF\\n+\\n##\\n"]
|
|
476
500
|
>>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
|
|
@@ -480,17 +504,16 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
|
|
|
480
504
|
"""
|
|
481
505
|
from itertools import combinations
|
|
482
506
|
|
|
483
|
-
# Cas combinatoire: on annote chaque fragment avec son origine (F/R) et un index local
|
|
484
507
|
def _to_entries(frag_list, origin):
|
|
485
508
|
"""
|
|
486
|
-
|
|
509
|
+
Transforms a list of FASTQ fragments into tuples (origin, idx, seq, qual).
|
|
487
510
|
frag = '@smth\\nSEQ\\n+\\nQUAL\\n'
|
|
488
511
|
"""
|
|
489
512
|
entries = []
|
|
490
513
|
for idx, frag in enumerate(frag_list):
|
|
491
514
|
lines = frag.strip().split("\n")
|
|
492
515
|
if len(lines) != 4 or lines[2] != "+":
|
|
493
|
-
raise ValueError("Fragment
|
|
516
|
+
raise ValueError("FastQ Fragment invalid in process_cigard.")
|
|
494
517
|
seq = lines[1]
|
|
495
518
|
qual = lines[3]
|
|
496
519
|
entries.append((origin, idx, seq, qual))
|
|
@@ -520,7 +543,7 @@ def gen_read_pairs_from_frags(
|
|
|
520
543
|
base_name, frags_f, frags_r, tags=None, pairing_mode="all"
|
|
521
544
|
):
|
|
522
545
|
"""
|
|
523
|
-
|
|
546
|
+
Dispatch between several pair generation strategies.
|
|
524
547
|
"""
|
|
525
548
|
if pairing_mode == "all":
|
|
526
549
|
return gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=tags)
|
|
@@ -530,7 +553,6 @@ def gen_read_pairs_from_frags(
|
|
|
530
553
|
raise ValueError(f"Unknown pairing_mode: {pairing_mode}")
|
|
531
554
|
|
|
532
555
|
|
|
533
|
-
# Pensez à la gestion de seq = "*"
|
|
534
556
|
def sam_fields(sam_line: str):
|
|
535
557
|
"""
|
|
536
558
|
Return minimal information
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
This script is a the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified
|
|
2
|
+
This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites.
|
|
3
3
|
|
|
4
4
|
Copyright © 2024 Samir Bertache
|
|
5
5
|
|
|
@@ -286,7 +286,7 @@ def processing_fr(
|
|
|
286
286
|
"""
|
|
287
287
|
Process the sequences to generate buffers for forward and reverse reads
|
|
288
288
|
according to the FR mode (one forward fragment + one reverse fragment).
|
|
289
|
-
|
|
289
|
+
Do not add the suffix :ij if there is only one pair.
|
|
290
290
|
|
|
291
291
|
Doctests:
|
|
292
292
|
>>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]
|