split3c 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {split3c-0.0.1/src/split3c.egg-info → split3c-0.0.2}/PKG-INFO +6 -7
  2. {split3c-0.0.1 → split3c-0.0.2}/README.md +5 -6
  3. {split3c-0.0.1 → split3c-0.0.2}/pyproject.toml +1 -1
  4. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/cli.py +2 -2
  5. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/auxiliary.py +49 -15
  6. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/bam.py +76 -43
  7. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/fastq.py +24 -0
  8. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/main.py +4 -4
  9. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/processmanager.py +26 -1
  10. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/split.py +48 -26
  11. split3c-0.0.2/src/split3c/resite/__init__.py +4 -0
  12. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/frag.py +2 -2
  13. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/main.py +5 -5
  14. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/pretreatment.py +27 -15
  15. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/bam.py +26 -0
  16. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/io_utils.py +26 -0
  17. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/main.py +239 -122
  18. split3c-0.0.2/src/split3c/resolve/pairs.py +146 -0
  19. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/parse.py +682 -302
  20. {split3c-0.0.1 → split3c-0.0.2/src/split3c.egg-info}/PKG-INFO +6 -7
  21. split3c-0.0.1/src/split3c/resite/__init__.py +0 -33
  22. split3c-0.0.1/src/split3c/resolve/pairs.py +0 -56
  23. {split3c-0.0.1 → split3c-0.0.2}/LICENSE +0 -0
  24. {split3c-0.0.1 → split3c-0.0.2}/setup.cfg +0 -0
  25. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/__init__.py +0 -0
  26. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/nssite/__init__.py +0 -0
  27. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/header.py +0 -0
  28. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/index.py +0 -0
  29. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/read.py +0 -0
  30. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resite/write_control.py +0 -0
  31. {split3c-0.0.1 → split3c-0.0.2}/src/split3c/resolve/__init__.py +0 -0
  32. {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/SOURCES.txt +0 -0
  33. {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/dependency_links.txt +0 -0
  34. {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/entry_points.txt +0 -0
  35. {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/requires.txt +0 -0
  36. {split3c-0.0.1 → split3c-0.0.2}/src/split3c.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: split3c
3
- Version: 0.0.1
3
+ Version: 0.0.2
4
4
  Summary: Toolkit to split and resolve chimeric 3C/Hi-C/Micro-C reads
5
5
  Author-email: Samir Bertache <samir.bertache.djenadi@gmail.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -23,8 +23,8 @@ Requires-Dist: build>=1.2.0; extra == "dev"
23
23
  Requires-Dist: twine>=5.0.0; extra == "dev"
24
24
  Dynamic: license-file
25
25
 
26
- [![pipeline status](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/main/pipeline.svg)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
27
- [![coverage report](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/main/coverage.svg?job=tests)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
26
+ [![pipeline status](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/master/pipeline.svg)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
27
+ [![coverage report](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/master/coverage.svg?job=tests)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
28
28
 
29
29
  # `split3c`
30
30
 
@@ -76,7 +76,7 @@ split3c resolve --help
76
76
 
77
77
  Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
78
78
 
79
- ![split3c re-site workflow](docs/images/resite-workflow.png)
79
+ ![split3c re-site workflow](doc/img/resite-workflow.png)
80
80
 
81
81
  ---
82
82
 
@@ -84,17 +84,16 @@ Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
84
84
 
85
85
  Non-specific ligation workflow for Micro-C-like libraries.
86
86
 
87
- ![split3c ns-site workflow](docs/images/nssite-workflow.png)
87
+ ![split3c ns-site workflow](doc/img/nssite-workflow.png)
88
88
 
89
89
  ---
90
90
 
91
91
  ## Benchmark
92
92
 
93
- ![split3c benchmark](docs/images/benchmark.png)
93
+ ![split3c benchmark](doc/img/benchmark.png)
94
94
 
95
95
  ---
96
96
 
97
97
  ## License
98
98
 
99
99
  split3c is released under the AGPLv3 license.
100
-
@@ -1,5 +1,5 @@
1
- [![pipeline status](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/main/pipeline.svg)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
2
- [![coverage report](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/main/coverage.svg?job=tests)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
1
+ [![pipeline status](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/master/pipeline.svg)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/pipelines)
2
+ [![coverage report](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/badges/master/coverage.svg?job=tests)](https://gitbio.ens-lyon.fr/LBMC/physbio/split3c/-/commits/main)
3
3
 
4
4
  # `split3c`
5
5
 
@@ -51,7 +51,7 @@ split3c resolve --help
51
51
 
52
52
  Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
53
53
 
54
- ![split3c re-site workflow](docs/images/resite-workflow.png)
54
+ ![split3c re-site workflow](doc/img/resite-workflow.png)
55
55
 
56
56
  ---
57
57
 
@@ -59,17 +59,16 @@ Restriction enzyme-based workflow for Hi-C / HiChIP / 3C-like libraries.
59
59
 
60
60
  Non-specific ligation workflow for Micro-C-like libraries.
61
61
 
62
- ![split3c ns-site workflow](docs/images/nssite-workflow.png)
62
+ ![split3c ns-site workflow](doc/img/nssite-workflow.png)
63
63
 
64
64
  ---
65
65
 
66
66
  ## Benchmark
67
67
 
68
- ![split3c benchmark](docs/images/benchmark.png)
68
+ ![split3c benchmark](doc/img/benchmark.png)
69
69
 
70
70
  ---
71
71
 
72
72
  ## License
73
73
 
74
74
  split3c is released under the AGPLv3 license.
75
-
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "split3c"
7
- version = "0.0.1"
7
+ version = "0.0.2"
8
8
  description = "Toolkit to split and resolve chimeric 3C/Hi-C/Micro-C reads"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -1,7 +1,7 @@
1
1
  """
2
- This script is a the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
2
+ This script is part of the split3c project; split3c is a toolkit for preprocessing 3C-type sequencing libraries and converting BAM alignments into .pairs files for chromatin contact analysis.
3
3
 
4
- Copyright © 2024 Samir Bertache
4
+ Copyright © 2026 Samir Bertache
5
5
 
6
6
  SPDX-License-Identifier: AGPL-3.0-or-later
7
7
 
@@ -1,3 +1,27 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+
1
25
  def signal_handler(sig, frame, out_f, out_r=None):
2
26
  """
3
27
  Handle termination signals to gracefully terminate processes.
@@ -43,27 +67,37 @@ def signal_handler(sig, frame, out_f, out_r=None):
43
67
 
44
68
  def partitionning(num_threads: int, single_bam: bool = False) -> tuple[int, int, int]:
45
69
  """
46
- Heuristique empirique de partition des ressources pour microsplit.
70
+ Empirical resource partitioning heuristic for microsplit.
71
+
72
+ Returns:
73
+
74
+ pigz_threads_per_file: pigz threads per file (F and R)
47
75
 
48
- Retourne:
49
- pigz_threads_per_file : threads pigz par fichier (F et R)
50
- compute_processes : nb de workers process_items
51
- bam_threads : threads pysam/htslib par fichier (lecture ET écriture)
76
+ compute_processes: number of process_items workers
77
+
78
+ bam_threads: pysam/htslib threads per file (read AND write)
52
79
 
53
80
  IMPORTANT
54
- ---------
55
- Cette fonction est volontairement empirique (surallocation CPU acceptée).
56
- `num_threads` est un *hint* de cœurs disponibles, pas un budget strict.
57
81
 
58
- Points de calibration (bench observés)
82
+ --------- This function is intentionally empirical (CPU overallocation is accepted).
83
+
84
+ `num_threads` is a *hint* of available cores, not a strict budget.
85
+
86
+ Calibration points (observed benchmarks)
87
+
59
88
  --------------------------------------
60
- - 4 cœurs -> (1, 1, 1)
61
- - 8 cœurs -> (2, 3, 1)
62
- - 16 cœurs -> (3, 4, 3)
63
89
 
64
- En mode single_bam=True :
65
- - on double les threads BAM, car un seul flux BAM doit alimenter toute la pipeline
66
- - pigz_per_file et compute_processes restent inchangés
90
+ - 4 cores -> (1, 1, 1)
91
+
92
+ - 8 cores -> (2, 3, 1)
93
+
94
+ - 16 cores -> (3, 4, 3)
95
+
96
+ In single_bam=True mode:
97
+
98
+ - The number of BAM threads is doubled, as a single BAM stream must feed the entire pipeline.
99
+
100
+ - pigz_per_file and compute_processes remain unchanged.
67
101
 
68
102
  Doctests
69
103
  --------
@@ -1,7 +1,31 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+
1
25
  def get_bam_headers(bam_for_path, bam_rev_path):
2
26
  """
3
- Ouvre les fichiers BAM, extrait leurs headers sous forme de dictionnaire,
4
- et referme les fichiers immédiatement.
27
+ Open the BAM files, extract their headers as a dictionary,
28
+ and close the files immediately.
5
29
 
6
30
  Returns:
7
31
  tuple: (header_dict_forward, header_dict_reverse)
@@ -76,7 +100,7 @@ def write_bam_pair_from_sam(
76
100
  bam_threads=1,
77
101
  ):
78
102
  """
79
- Écrit les paires BAM en utilisant les dictionnaires de header fournis.
103
+ Writes the BAM pairs using the provided header dictionaries.
80
104
  """
81
105
  import sys
82
106
 
@@ -122,8 +146,9 @@ def write_bam_pair_from_sam(
122
146
 
123
147
  def get_bam_header_single(bam_path):
124
148
  """
125
- Ouvre un BAM unique, extrait son header sous forme de dictionnaire,
126
- puis referme le fichier.
149
+ Opens a single BAM file, extracts its header as a dictionary,
150
+ then closes the file.
151
+
127
152
 
128
153
  Returns
129
154
  -------
@@ -138,23 +163,25 @@ def get_bam_header_single(bam_path):
138
163
 
139
164
  def _pair_reads_from_single_bam(read_a, read_b, strict=True):
140
165
  """
141
- Ordonne deux lectures provenant d'un BAM interleavé en (forward/read1, reverse/read2).
166
+ Orders two reads from an interleaved BAM as (forward/read1, reverse/read2).
142
167
 
143
- Paramètres
168
+ Parameters
144
169
  ----------
145
- read_a, read_b : pysam.AlignedSegment
146
- strict : bool
147
- Si True, lève une erreur en cas d'incohérence forte.
148
- Si False, tente un fallback par ordre d'apparition.
170
+ read_a, read_b: pysam.AlignedSegment
171
+ strict: bool
172
+ If True, raises an error in case of a strong inconsistency.
173
+
174
+ If False, attempts a fallback in order of appearance.
149
175
 
150
176
  Returns
151
- -------
177
+ -------
152
178
  tuple
153
- (read_for, read_rev)
179
+
180
+ (read_for, read_rev)
154
181
 
155
182
  Notes
156
183
  -----
157
- On utilise en priorité les flags 0x40 / 0x80 (is_read1 / is_read2).
184
+ Flags 0x40 / 0x80 (is_read1 / is_read2) are used as the primary method.
158
185
  """
159
186
  if read_a is None or read_b is None:
160
187
  raise ValueError("Pairing failure: one of the reads is None.")
@@ -178,28 +205,30 @@ def read_bam_interleaved(
178
205
  strict=True,
179
206
  ):
180
207
  """
181
- Lit un BAM unique interleavé (une ligne forward/read1 suivie de la ligne reverse/read2) et envoie des batchs de paires SAM dans input_queue.
208
+ Reads a single interleaved BAM (a forward/read1 line followed by a reverse/read2 line) and sends batches of SAM pairs to input_queue.
209
+
210
+ Output contract identical to read_bam_pair:
182
211
 
183
- Contrat de sortie identique à read_bam_pair:
184
- batch = list[(sam_f, sam_r)]
212
+ batch = list[(sam_f, sam_r)]
185
213
 
186
- Paramètres
214
+ Parameters
187
215
  ----------
188
- bam_file : str
189
- Chemin vers un BAM unique interleavé.
190
- input_queue : multiprocessing.Queue
191
- num_processes : int
192
- Nombre de workers compute, pour envoyer les sentinelles None.
193
- bam_threads : int
194
- Threads pysam/htslib.
195
- batch_size : int
196
- Taille des batchs.
197
- strict : bool
198
- Si True, échoue si les paires ne sont pas parfaitement cohérentes.
199
-
200
- Exigences
216
+ bam_file: str
217
+ Path to a single interleaved BAM.
218
+ input_queue: multiprocessing.Queue
219
+
220
+ num_processes: int
221
+ Number of compute workers, used to send the None sentinels.
222
+ bam_threads: int
223
+ Pysam/htslib threads.
224
+ batch_size: int
225
+ Batch size.
226
+ strict: bool
227
+ If True, fails if the pairs are not perfectly matched.
228
+
229
+ Requirements
201
230
  ---------
202
- Le BAM doit être ordonné par nom ou au minimum avoir les deux mates consécutives.
231
+ The BAM must be ordered by name or, at a minimum, have the two mates of each pair as consecutive records.
203
232
  """
204
233
  import sys
205
234
 
@@ -250,21 +279,25 @@ def write_bam_interleaved_from_sam(
250
279
  bam_threads=1,
251
280
  ):
252
281
  """
253
- Écrit les paires BAM non splittables dans un BAM unique interleavé.
282
+ Writes non-splittable BAM pairs into a single interleaved BAM.
254
283
 
255
- Contrat d'entrée:
256
- queue contient des batchs list[(sam_f, sam_r)]
284
+ Input contract:
257
285
 
258
- Sortie:
259
- un seul BAM avec read1 puis read2 à la suite.
286
+ queue contains batches `list[(sam_f, sam_r)]`
260
287
 
261
- Paramètres
288
+ Output:
289
+
290
+ a single BAM with `read1` followed by `read2`.
291
+
292
+ Parameters
262
293
  ----------
263
- queue : multiprocessing.Queue
264
- out_bam_path : str
265
- header_dict : dict
266
- num_procs_finished_signal : int
267
- bam_threads : int
294
+ queue: multiprocessing.Queue
295
+
296
+ out_bam_path: str
297
+
298
+ header_dict: dict
299
+ num_procs_finished_signal: int
300
+ bam_threads: int
268
301
  """
269
302
  import sys
270
303
 
@@ -1,3 +1,27 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
24
+
1
25
  def open_output(output_forward, output_reverse, write_processes):
2
26
  """
3
27
  Open output files for writing with pigz compression.
@@ -1,5 +1,5 @@
1
1
  """
2
- This script is a the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
3
 
4
4
  Copyright © 2024 Samir Bertache
5
5
 
@@ -150,7 +150,7 @@ def _print_banner() -> None:
150
150
  "[bold blue]Microsplit[/bold blue]\n"
151
151
  "Process paired BAM (Micro-C) into paired FASTQ.\n\n"
152
152
  "Use --help to see detailed options.",
153
- title="[bold green]microsplit-cut[/bold green]",
153
+ title="[bold green]split3c nssite-cut[/bold green]",
154
154
  subtitle=f"Version: {__version__}",
155
155
  expand=True,
156
156
  width=100,
@@ -202,8 +202,8 @@ def main_cli(argv: Optional[list[str]] = None) -> int:
202
202
  ),
203
203
  epilog=(
204
204
  "Examples:\n"
205
- " \tmicrosplit -1 fwd.bam -2 rev.bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 -l 0 --pairing-mode cover \n"
206
- " \tmicrosplit -1 merged.bam --single-bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 --pairing-mode all\n"
205
+ " \tsplit3c nssite -1 fwd.bam -2 rev.bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 -l 0 --pairing-mode cover \n"
206
+ " \tsplit3c nssite -1 merged.bam --single-bam -o1 R1.fastq.gz -o2 R2.fastq.gz -t 12 -s 20 --pairing-mode all\n"
207
207
  ),
208
208
  formatter_class=_formatter_class(),
209
209
  )
@@ -1,6 +1,30 @@
1
- from multiprocessing import Process, Queue
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
2
24
  import sys
3
25
  import traceback
26
+ from multiprocessing import Process, Queue
27
+
4
28
 
5
29
  class WorkerProcess(Process):
6
30
  def __init__(self, target, args, error_queue):
@@ -15,6 +39,7 @@ class WorkerProcess(Process):
15
39
  self.error_queue.put((str(e), traceback.format_exc()))
16
40
  sys.exit(1)
17
41
 
42
+
18
43
  class ProcessManager:
19
44
  def __init__(self):
20
45
  self.processes = []
@@ -1,3 +1,26 @@
1
+ """
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified unmapped sites. Designed for the analysis of Micro-C/CAD-C data.
3
+
4
+ Copyright © 2024 Samir Bertache
5
+
6
+ SPDX-License-Identifier: AGPL-3.0-or-later
7
+
8
+ ===============================================================================
9
+
10
+ This program is free software: you can redistribute it and/or modify it under
11
+ the terms of the GNU Affero General Public License as published by the
12
+ Free Software Foundation, either version 3 of the License, or (at your option)
13
+ any later version.
14
+
15
+ This program is distributed in the hope that it will be useful,
16
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
+ See the GNU Affero General Public License for more details.
19
+
20
+ You should have received a copy of the GNU Affero General Public License
21
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
22
+ """
23
+
1
24
  import logging
2
25
  import os
3
26
  import signal
@@ -256,14 +279,15 @@ def process_cigard(name, sequence, quality, cigar, seed_size, len_add):
256
279
 
257
280
  def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
258
281
  """
259
- Construit un header de paire à partir d'un nom de read et de deux tags.
282
+ Constructs a pair header from a read name and two tags.
283
+
284
+ `base_name`: logical name of the read (e.g., '@READ')
285
+ `tag_i`, `tag_j`: fragment identifiers, typically 'F1', 'R1', etc. (1-based)
286
+ `tot_for`, `tot_rev`: total number of forward/reverse fragments
260
287
 
261
- base_name : nom logique du read (ex: '@READ')
262
- tag_i, tag_j : identifiants de fragments, typiquement 'F1', 'R1', etc. (1-based)
263
- tot_for, tot_rev : nombres totaux de fragments forward / reverse
288
+ Returns (origin/o mode):
264
289
 
265
- Retour (mode origin/o):
266
- '<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]'
290
+ '<base_name>:[<tag_i>,<tag_j>:FT<tot_for>,RT<tot_rev>]'
267
291
 
268
292
  Examples
269
293
  --------
@@ -288,7 +312,7 @@ def read_name(base_name, tag_i, tag_j, tot_for, tot_rev, tags=None):
288
312
 
289
313
  def _fraglist_to_entries(frag_list, origin):
290
314
  """
291
- Transforme une liste de FastQ en tuples (origin, idx, seq, qual).
315
+ Transforms a list of FastQ into tuples (origin, idx, seq, qual).
292
316
 
293
317
  Examples
294
318
  --------
@@ -308,7 +332,7 @@ def _fraglist_to_entries(frag_list, origin):
308
332
 
309
333
  def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
310
334
  """
311
- Construit une paire FASTQ textuelle à partir de deux entrées.
335
+ Constructs a textual FASTQ pair from two inputs.
312
336
 
313
337
  e = (origin, idx, seq, qual)
314
338
  """
@@ -326,15 +350,15 @@ def _emit_pair(base_name, e1, e2, tot_for, tot_rev, tags=None):
326
350
 
327
351
  def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
328
352
  """
329
- Génère un nombre minimal (ou quasi minimal) de paires pour que
330
- chaque fragment apparaisse au moins une fois.
353
+ Generates a minimal (or near-minimal) number of pairs so that
354
+ each fragment appears at least once.
331
355
 
332
- Stratégie:
333
- 1. appariement F-R tant que possible
334
- 2. appariement des restes au sein du même côté
335
- 3. si un fragment reste seul, on le rattache à un anchor déjà utilisé
356
+ Strategy:
357
+ 1. Match F-R whenever possible
358
+ 2. Match remainders within the same side
359
+ 3. If a fragment remains alone, reattach it to an already used anchor
336
360
 
337
- Complexité: O(F + R)
361
+ Complexity: O(F + R)
338
362
 
339
363
  Examples
340
364
  --------
@@ -425,14 +449,14 @@ def gen_read_pairs_from_frags_cover(base_name, frags_f, frags_r, tags=None):
425
449
 
426
450
  def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
427
451
  """
428
- Génère deux chaînes FastQ (forward/reverse) à partir de fragments déjà splittés.
452
+ Generates two FASTQ strings (forward/reverse) from already split fragments.
429
453
 
430
- `frags_f` et `frags_r` sont des listes de fragments FASTQ complets:
454
+ `frags_f` and `frags_r` are lists of complete FASTQ fragments:
431
455
  '@name\\nSEQ\\n+\\nQUAL\\n'
432
456
 
433
457
  Examples
434
458
  --------
435
- Cas simple : 1 fragment forward, 1 fragment reverse (une seule combinaison).
459
+ Simple case: 1 forward fragment, 1 reverse fragment (only one combination).
436
460
  >>> frags_f = ["@x\\nAC\\n+\\n??\\n"]
437
461
  >>> frags_r = ["@x\\nTG\\n+\\n!!\\n"]
438
462
  >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
@@ -441,14 +465,14 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
441
465
  >>> R
442
466
  '@READ:[F1,R1:FT1,RT1]\\nTG\\n+\\n!!\\n'
443
467
 
444
- Même cas, sans tag (nt).
468
+ Same case, without tags (nt).
445
469
  >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r, tags="nt")
446
470
  >>> F
447
471
  '@READ\\nAC\\n+\\n??\\n'
448
472
  >>> R
449
473
  '@READ\\nTG\\n+\\n!!\\n'
450
474
 
451
- Cas combinatoire :
475
+ Combinatorial case :
452
476
  - forward: 2 fragments (F0, F1)
453
477
  - reverse: 1 fragment (R0)
454
478
  -> combinations: (F0,F1), (F0,R0), (F1,R0)
@@ -470,7 +494,7 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
470
494
  >>> R
471
495
  '@READ:[F1,F2:FT2,RT1]\\nBCD\\n+\\n===\\n@READ:[F1,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n@READ:[F2,R1:FT2,RT1]\\nWXYZ\\n+\\n>>>>\\n'
472
496
 
473
- Cas combinatoire symétrique (2 fragments forward, 2 fragments reverse).
497
+ Symmetric combinatorial case (2 fragments forward, 2 fragments reverse).
474
498
  >>> frags_f = ["@x\\nA\\n+\\n!\\n", "@x\\nBC\\n+\\n!!\\n"]
475
499
  >>> frags_r = ["@x\\nD\\n+\\n#\\n", "@x\\nEF\\n+\\n##\\n"]
476
500
  >>> F, R = gen_read_pairs_from_frags_all("@READ", frags_f, frags_r)
@@ -480,17 +504,16 @@ def gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=None):
480
504
  """
481
505
  from itertools import combinations
482
506
 
483
- # Cas combinatoire: on annote chaque fragment avec son origine (F/R) et un index local
484
507
  def _to_entries(frag_list, origin):
485
508
  """
486
- Transforme une liste de FastQ en tuples (origin, idx, seq, qual).
509
+ Transforms a list of FastQ into tuples (origin, idx, seq, qual).
487
510
  frag = '@smth\\nSEQ\\n+\\nQUAL\\n'
488
511
  """
489
512
  entries = []
490
513
  for idx, frag in enumerate(frag_list):
491
514
  lines = frag.strip().split("\n")
492
515
  if len(lines) != 4 or lines[2] != "+":
493
- raise ValueError("Fragment FastQ invalide dans process_cigard.")
516
+ raise ValueError("FastQ Fragment invalid in process_cigard.")
494
517
  seq = lines[1]
495
518
  qual = lines[3]
496
519
  entries.append((origin, idx, seq, qual))
@@ -520,7 +543,7 @@ def gen_read_pairs_from_frags(
520
543
  base_name, frags_f, frags_r, tags=None, pairing_mode="all"
521
544
  ):
522
545
  """
523
- Dispatcher entre plusieurs stratégies de génération de paires.
546
+ Dispatch between several pair generation strategies.
524
547
  """
525
548
  if pairing_mode == "all":
526
549
  return gen_read_pairs_from_frags_all(base_name, frags_f, frags_r, tags=tags)
@@ -530,7 +553,6 @@ def gen_read_pairs_from_frags(
530
553
  raise ValueError(f"Unknown pairing_mode: {pairing_mode}")
531
554
 
532
555
 
533
- # Pensez à la gestion de seq = "*"
534
556
  def sam_fields(sam_line: str):
535
557
  """
536
558
  Return minimal information
@@ -0,0 +1,4 @@
1
+ from .frag import process_items
2
+ from .pretreatment import partition_threads, search_in_database
3
+ from .read import read_fastq_gzip_simultaneously
4
+ from .write_control import manage_pigz_problems, open_output, write_pairs
@@ -1,5 +1,5 @@
1
1
  """
2
- This script is a the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
2
+ This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
3
3
 
4
4
  Copyright © 2024 Samir Bertache
5
5
 
@@ -286,7 +286,7 @@ def processing_fr(
286
286
  """
287
287
  Process the sequences to generate buffers for forward and reverse reads
288
288
  according to the FR mode (one forward fragment + one reverse fragment).
289
- N'ajoute pas de suffixe :ij si une seule paire.
289
+ Does not add the :ij suffix if there is only one pair.
290
290
 
291
291
  Doctests:
292
292
  >>> seqs = ["AAAACCCCGGGG", "TTTTGGGGCCCC"]