toulligqc 2.5.7__tar.gz → 2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {toulligqc-2.5.7 → toulligqc-2.7}/PKG-INFO +4 -4
- {toulligqc-2.5.7 → toulligqc-2.7}/README.md +71 -17
- {toulligqc-2.5.7 → toulligqc-2.7}/setup.py +7 -7
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/bam_extractor.py +81 -38
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/common.py +28 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/extractor_common.py +26 -5
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fast5_extractor.py +54 -77
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fastq_extractor.py +67 -19
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/html_report_generator.py +2 -2
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_common.py +8 -3
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_generator.py +12 -8
- toulligqc-2.7/toulligqc/pod5_extractor.py +234 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_summary_extractor.py +66 -34
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_summary_onedsquare_extractor.py +7 -11
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/toulligqc.py +42 -8
- toulligqc-2.7/toulligqc/version.py +1 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/PKG-INFO +4 -4
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/SOURCES.txt +1 -0
- toulligqc-2.7/toulligqc.egg-info/requires.txt +11 -0
- toulligqc-2.5.7/toulligqc/version.py +0 -1
- toulligqc-2.5.7/toulligqc.egg-info/requires.txt +0 -10
- {toulligqc-2.5.7 → toulligqc-2.7}/AUTHORS +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/LICENSE-CeCILL.txt +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/LICENSE.txt +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/MANIFEST.in +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/setup.cfg +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/test/test_sequencing_summary_extractor.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/__init__.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/common_statistics.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/configuration.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fastq_bam_common.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/report_data_file_generator.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/plotly-latest.min.js +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/toulligqc.css +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/toulligqc.png +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_telemetry_extractor.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/toulligqc_info_extractor.py +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/dependency_links.txt +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/entry_points.txt +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/not-zip-safe +0 -0
- {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.7
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/GenomiqueENS/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
7
|
-
Author-email: toulligqc@
|
|
7
|
+
Author-email: toulligqc@bio.ens.psl.eu
|
|
8
8
|
License: GPL V3
|
|
9
9
|
Keywords: Nanopore MinION QC report
|
|
10
10
|
Platform: ALL
|
|
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
@@ -24,6 +24,7 @@ Support is availlable on [GitHub issue page](https://github.com/GenomicParisCent
|
|
|
24
24
|
* 1.3 [Docker](#docker)
|
|
25
25
|
* [Docker image recovery](#docker-image-recovery)
|
|
26
26
|
* [Launching Docker image with docker run](#launching-Docker-image-with-docker-run)
|
|
27
|
+
* 1.4 [nf-core module](#nfcore-module)
|
|
27
28
|
|
|
28
29
|
* 2.[Usage](#usage)
|
|
29
30
|
* 2.1 [Command line](#command-line)
|
|
@@ -93,14 +94,25 @@ $ docker run -ti \
|
|
|
93
94
|
-v /path/to/basecaller/sequencing/summary/file:/path/to/basecaller/sequencing/summary/file \
|
|
94
95
|
-v /path/to/basecaller/sequencing/telemetry/file:/path/to/basecaller/telemetry/summary/file \
|
|
95
96
|
-v /path/to/result/directory:/path/to/result/directory \
|
|
96
|
-
toulligqc:latest
|
|
97
|
+
genomicpariscentre/toulligqc:latest
|
|
97
98
|
```
|
|
99
|
+
|
|
100
|
+
<a name="nfcore-module"></a>
|
|
101
|
+
### 1.4 Using nf-core module
|
|
102
|
+
ToulligQC is also available on nf-core as a module written in nextflow. To install nf-core on your system, please visit their website (<https://nf-co.re/docs/usage/introduction>).
|
|
103
|
+
|
|
104
|
+
The following command line will install the latest version of the ToulligQC module:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
$ nf-core modules install toulligqc
|
|
108
|
+
```
|
|
109
|
+
|
|
98
110
|
<a name="usage"></a>
|
|
99
111
|
## 2. Usage
|
|
100
112
|
<a name="command-line"></a>
|
|
101
113
|
|
|
102
114
|
ToulligQC is adapted to RNA-Seq along with DNA-Seq and it is compatible with 1D² runs.
|
|
103
|
-
This QC tool supports only Guppy basecalling ouput files.
|
|
115
|
+
This QC tool supports only Guppy and Dorado basecalling output files.
|
|
104
116
|
It also needs a single FAST5 file (to catch the flowcell ID and the run date) if a telemetry file is not provided.
|
|
105
117
|
Flow cells and kits version are retrieved using the telemetry file.
|
|
106
118
|
ToulligQC can take barcoding samples by adding the barcode list as a command line option.
|
|
@@ -111,7 +123,7 @@ To do so, ToulligQC deals with different file formats: gz, tar.gz, bz2, tar.bz2
|
|
|
111
123
|
This tool will produce a set of graphs, statistic file in plain text format and a HTML report.
|
|
112
124
|
|
|
113
125
|
|
|
114
|
-
To run ToulligQC you need the Guppy basecaller output files : ```sequencing_summary.txt``` and ```sequencing_telemetry.js```. or ```FASTQ``` or ```BAM```
|
|
126
|
+
To run ToulligQC you need the Guppy/Dorado basecaller output files: ```sequencing_summary.txt``` and ```sequencing_telemetry.js```, or ```FASTQ``` or ```BAM``` files.
|
|
115
127
|
This can be compressed with gzip or bzip2.
|
|
116
128
|
You can use your initial Fast5 ONT file too.
|
|
117
129
|
ToulligQC can perform analyses on your data if the directory is organised as the following:
|
|
@@ -132,7 +144,7 @@ RUN_ID
|
|
|
132
144
|
└── sequencing_1dsq_summary.txt
|
|
133
145
|
```
|
|
134
146
|
|
|
135
|
-
For a barcoded run you can add the barcoding files generated by Guppy ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
147
|
+
For a barcoded run you can add the barcoding files generated by Guppy/Dorado ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
136
148
|
|
|
137
149
|
For the barcode list to use in the command line options, ToulligQC handles the following naming schemes: BCXX, RBXX, NBXX and barcodeXX where XX is the number of the barcode.
|
|
138
150
|
The barcode naming schemes are case insensitive.
|
|
@@ -156,14 +168,16 @@ This is a directory for 1D² analysis with barcoding files:
|
|
|
156
168
|
|
|
157
169
|
General Options:
|
|
158
170
|
```
|
|
159
|
-
usage: ToulligQC V2.
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
171
|
+
usage: ToulligQC V2.6 [-a SEQUENCING_SUMMARY_SOURCE] [-t TELEMETRY_SOURCE]
|
|
172
|
+
[-f FAST5_SOURCE] [-p POD5_SOURCE] [-q FASTQ] [-u BAM]
|
|
173
|
+
[--thread THREAD] [--batch-size BATCH_SIZE] [--qscore-threshold THRESHOLD]
|
|
174
|
+
[-n REPORT_NAME] [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
|
|
175
|
+
[--data-report-path DATA_REPORT_PATH]
|
|
176
|
+
[--images-directory IMAGES_DIRECTORY]
|
|
177
|
+
[-d SEQUENCING_SUMMARY_1DSQR_SOURCE]
|
|
178
|
+
[-s SAMPLESHEET]
|
|
179
|
+
[-b] [-l BARCODES]
|
|
180
|
+
[--quiet] [--force] [-h] [--version]
|
|
167
181
|
|
|
168
182
|
required arguments:
|
|
169
183
|
-a SEQUENCING_SUMMARY_SOURCE, --sequencing-summary-source SEQUENCING_SUMMARY_SOURCE
|
|
@@ -175,6 +189,9 @@ required arguments:
|
|
|
175
189
|
-f FAST5_SOURCE, --fast5-source FAST5_SOURCE
|
|
176
190
|
Fast5 file source (necessary if no telemetry file),
|
|
177
191
|
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
192
|
+
-p POD5_SOURCE, --pod5-source POD5_SOURCE
|
|
193
|
+
pod5 file source (necessary if no telemetry file),
|
|
194
|
+
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
178
195
|
-q FASTQ, --fastq FASTQ
|
|
179
196
|
FASTQ file (necessary if no sequencing summary file),
|
|
180
197
|
can also be in a .gz archive
|
|
@@ -183,6 +200,8 @@ required arguments:
|
|
|
183
200
|
can also be a SAM format
|
|
184
201
|
|
|
185
202
|
optional arguments:
|
|
203
|
+
-s SAMPLESHEET, --samplesheet SAMPLESHEET
|
|
204
|
+
Samplesheet (.csv file) to fill out sample names in MinKNOW.
|
|
186
205
|
-n REPORT_NAME, --report-name REPORT_NAME
|
|
187
206
|
Report name
|
|
188
207
|
--output-directory OUTPUT
|
|
@@ -197,8 +216,9 @@ optional arguments:
|
|
|
197
216
|
Basecaller 1dsq summary source
|
|
198
217
|
-b, --barcoding Option for barcode usage
|
|
199
218
|
-l BARCODES, --barcodes BARCODES
|
|
200
|
-
|
|
201
|
-
BC05,RB09,NB01,barcode10)
|
|
219
|
+
Comma-separated barcode list (e.g.,
|
|
220
|
+
BC05,RB09,NB01,barcode10) or a range separated with ':' (e.g.,
|
|
221
|
+
barcode01:barcode19)
|
|
202
222
|
--thread THREAD Number of threads for parsing FASTQ or BAM files (default: 2).
|
|
203
223
|
--batch-size BATCH_SIZE Batch size for each threads (default: 500).
|
|
204
224
|
--qscore-threshold THRESHOLD Q-score threshold to distinguish between passing filter and
|
|
@@ -213,7 +233,41 @@ optional arguments:
|
|
|
213
233
|
* #### Examples
|
|
214
234
|
|
|
215
235
|
|
|
216
|
-
|
|
236
|
+
* Sequencing summary alone \
|
|
237
|
+
Note that the flowcell ID and run date will be missing from the report, because they are only found in the telemetry file or in a single FAST5 file
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
$ toulligqc --report-name summary_only \
|
|
241
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
242
|
+
--html-report-path /path/to/output/report.html
|
|
243
|
+
```
|
|
244
|
+
* Sequencing summary + telemetry file
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
$ toulligqc --report-name summary_plus_telemetry \
|
|
248
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
249
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
250
|
+
--html-report-path /path/to/output/report.html
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
* Telemetry file + fast5 files
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
$ toulligqc --report-name telemetry_plus_fast5 \
|
|
257
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
258
|
+
--fast5-source /path/to/basecaller/output/fast5_files.fast5.gz \
|
|
259
|
+
--html-report-path /path/to/output/report.html
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
* Fastq/ bam files only
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
$ toulligqc --report-name FAF0256 \
|
|
266
|
+
--fastq /path/to/basecaller/output/fastq_files.fq.gz \ # (replace with --bam)
|
|
267
|
+
--html-report-path /path/to/output/report.html
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
* Optional arguments for 1D² analysis
|
|
217
271
|
|
|
218
272
|
```bash
|
|
219
273
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -223,7 +277,7 @@ $ toulligqc --report-name FAF0256 \
|
|
|
223
277
|
--html-report-path /path/to/output/report.html
|
|
224
278
|
```
|
|
225
279
|
|
|
226
|
-
|
|
280
|
+
* Optional arguments to deal with barcoded samples
|
|
227
281
|
|
|
228
282
|
```bash
|
|
229
283
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -271,7 +325,7 @@ $ toulligqc \
|
|
|
271
325
|
--sequencing-summary-source sequencing_summary.txt \
|
|
272
326
|
--sequencing-summary-source barcoding_summary_pass.txt \
|
|
273
327
|
--sequencing-summary-source barcoding_summary_fail.txt \
|
|
274
|
-
--barcodes BC01
|
|
328
|
+
--barcodes BC01:BC07 \
|
|
275
329
|
--output-directory output
|
|
276
330
|
```
|
|
277
331
|
|
|
@@ -14,11 +14,11 @@ setup(
|
|
|
14
14
|
long_description='See project website for more information.',
|
|
15
15
|
|
|
16
16
|
# The project's main homepage.
|
|
17
|
-
url='https://github.com/
|
|
17
|
+
url='https://github.com/GenomiqueENS/toulligQC',
|
|
18
18
|
|
|
19
19
|
# Author details
|
|
20
20
|
author='Genomic Paris Centre team',
|
|
21
|
-
author_email='toulligqc@
|
|
21
|
+
author_email='toulligqc@bio.ens.psl.eu',
|
|
22
22
|
|
|
23
23
|
license='GPL V3',
|
|
24
24
|
platforms='ALL',
|
|
@@ -34,7 +34,7 @@ setup(
|
|
|
34
34
|
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
|
35
35
|
'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',
|
|
36
36
|
|
|
37
|
-
'Programming Language :: Python :: 3.
|
|
37
|
+
'Programming Language :: Python :: 3.12'
|
|
38
38
|
],
|
|
39
39
|
|
|
40
40
|
keywords='Nanopore MinION QC report',
|
|
@@ -46,10 +46,10 @@ setup(
|
|
|
46
46
|
include_package_data=True,
|
|
47
47
|
|
|
48
48
|
python_requires='>=3.11.0',
|
|
49
|
-
install_requires=['matplotlib>=3.6.3', 'plotly==
|
|
50
|
-
'pandas>=1.
|
|
51
|
-
'scikit-learn>=1.
|
|
52
|
-
'ezcharts==0.7.6'],
|
|
49
|
+
install_requires=['matplotlib>=3.6.3', 'plotly==5.15.0', 'h5py>=3.10.0',
|
|
50
|
+
'pandas>=2.1.4', 'numpy>=1.26.4', 'scipy>=1.11.4',
|
|
51
|
+
'scikit-learn>=1.4.1', 'tqdm>=4.66.2', 'pysam>=0.22.0',
|
|
52
|
+
'pod5>=0.3.10', 'ezcharts==0.7.6'],
|
|
53
53
|
|
|
54
54
|
entry_points={
|
|
55
55
|
'console_scripts': [
|
|
@@ -4,7 +4,7 @@ import numpy as np
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import time
|
|
6
6
|
import pysam
|
|
7
|
-
from
|
|
7
|
+
from collections import defaultdict
|
|
8
8
|
from toulligqc.extractor_common import log_task
|
|
9
9
|
from toulligqc.extractor_common import describe_dict
|
|
10
10
|
from toulligqc.extractor_common import set_result_value
|
|
@@ -15,6 +15,7 @@ from toulligqc.extractor_common import get_result_value
|
|
|
15
15
|
from toulligqc.extractor_common import set_result_dict_telemetry_value
|
|
16
16
|
from toulligqc.extractor_common import fill_series_dict
|
|
17
17
|
from toulligqc.extractor_common import timeISO_to_float
|
|
18
|
+
from toulligqc.extractor_common import extract_barcode_info
|
|
18
19
|
from toulligqc.common_statistics import compute_NXX, compute_LXX, occupancy_channel, avg_qual
|
|
19
20
|
from toulligqc.fastq_bam_common import multiprocessing_submit, extract_headerTag
|
|
20
21
|
from toulligqc.fastq_bam_common import batch_iterator
|
|
@@ -24,13 +25,16 @@ from toulligqc import plotly_graph_generator as pgg
|
|
|
24
25
|
|
|
25
26
|
class uBAM_Extractor:
|
|
26
27
|
def __init__(self, config_dictionary):
|
|
27
|
-
self.
|
|
28
|
+
self.config_dictionary = config_dictionary
|
|
28
29
|
self.ubam = config_dictionary['bam'].split('\t')
|
|
29
30
|
self.images_directory = config_dictionary['images_directory']
|
|
30
31
|
self.threshold_Qscore = int(config_dictionary['threshold'])
|
|
31
32
|
self.batch_size = int(config_dictionary['batch_size'])
|
|
32
33
|
self.thread = int(config_dictionary['thread'])
|
|
33
34
|
self.header = dict()
|
|
35
|
+
self.is_barcode = False
|
|
36
|
+
if config_dictionary['barcoding'] == 'True':
|
|
37
|
+
self.is_barcode = True
|
|
34
38
|
if 'quiet' not in config_dictionary or config_dictionary['quiet'].lower() != 'true':
|
|
35
39
|
self.quiet = False
|
|
36
40
|
else:
|
|
@@ -53,12 +57,24 @@ class uBAM_Extractor:
|
|
|
53
57
|
:return: Panda's Dataframe object
|
|
54
58
|
"""
|
|
55
59
|
start_time = time.time()
|
|
56
|
-
self.
|
|
57
|
-
if self.
|
|
60
|
+
self.dataframe = self._load_uBAM_file()
|
|
61
|
+
if self.dataframe.empty:
|
|
58
62
|
raise pd.errors.EmptyDataError("Dataframe is empty")
|
|
59
63
|
self.dataframe_dict = {}
|
|
64
|
+
|
|
65
|
+
# Add missing categories
|
|
66
|
+
if 'barcode_arrangement' in self.dataframe.columns:
|
|
67
|
+
self.dataframe['barcode_arrangement'] = self.dataframe['barcode_arrangement'].cat.add_categories([0,
|
|
68
|
+
'other barcodes',
|
|
69
|
+
'passes_filtering'])
|
|
70
|
+
|
|
71
|
+
# Replace all NaN values by 0 to avoid data manipulation errors when columns are not the same length
|
|
72
|
+
self.dataframe = self.dataframe.fillna(0)
|
|
73
|
+
|
|
74
|
+
self.barcode_selection = self.config_dictionary['barcode_selection']
|
|
75
|
+
|
|
60
76
|
log_task(self.quiet,
|
|
61
|
-
'Load BAM file ({:,.2f} MB used)'.format(self.
|
|
77
|
+
'Load BAM file ({:,.2f} MB used)'.format(self.dataframe.memory_usage(deep=True).sum()/1024/1024),
|
|
62
78
|
start_time,
|
|
63
79
|
time.time())
|
|
64
80
|
|
|
@@ -70,7 +86,7 @@ class uBAM_Extractor:
|
|
|
70
86
|
"""
|
|
71
87
|
check_result_values(self, result_dict)
|
|
72
88
|
self.dataframe_dict.clear()
|
|
73
|
-
self.
|
|
89
|
+
self.dataframe.iloc[0:0]
|
|
74
90
|
|
|
75
91
|
|
|
76
92
|
@staticmethod
|
|
@@ -79,7 +95,7 @@ class uBAM_Extractor:
|
|
|
79
95
|
Get the name of the extractor.
|
|
80
96
|
:return: the name of the extractor
|
|
81
97
|
"""
|
|
82
|
-
return '
|
|
98
|
+
return 'uBAM'
|
|
83
99
|
|
|
84
100
|
|
|
85
101
|
@staticmethod
|
|
@@ -100,14 +116,38 @@ class uBAM_Extractor:
|
|
|
100
116
|
|
|
101
117
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
|
|
102
118
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
|
|
103
|
-
add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.
|
|
119
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe, self.images_directory))
|
|
104
120
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
|
|
105
121
|
add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
|
|
106
|
-
add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.
|
|
122
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe, self.images_directory))
|
|
107
123
|
add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
|
|
108
124
|
add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
|
|
109
125
|
add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
|
|
110
126
|
add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
|
|
127
|
+
if self.is_barcode:
|
|
128
|
+
if "barcode_alias" in self.config_dictionary:
|
|
129
|
+
barcode_alias = self.config_dictionary['barcode_alias']
|
|
130
|
+
else:
|
|
131
|
+
barcode_alias = None
|
|
132
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
133
|
+
self.barcode_selection,
|
|
134
|
+
self.images_directory,
|
|
135
|
+
barcode_alias))
|
|
136
|
+
|
|
137
|
+
read_fail = self.dataframe_dict["read.fail.barcoded"]
|
|
138
|
+
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
|
|
139
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
|
|
140
|
+
self.barcode_selection,
|
|
141
|
+
self.images_directory,
|
|
142
|
+
barcode_alias))
|
|
143
|
+
|
|
144
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
|
|
145
|
+
self.images_directory,
|
|
146
|
+
barcode_alias))
|
|
147
|
+
|
|
148
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
|
|
149
|
+
self.images_directory,
|
|
150
|
+
barcode_alias))
|
|
111
151
|
return images
|
|
112
152
|
|
|
113
153
|
|
|
@@ -117,7 +157,7 @@ class uBAM_Extractor:
|
|
|
117
157
|
:param result_dict:
|
|
118
158
|
"""
|
|
119
159
|
start_time = time.time()
|
|
120
|
-
fill_series_dict(self.dataframe_dict, self.
|
|
160
|
+
fill_series_dict(self.dataframe_dict, self.dataframe)
|
|
121
161
|
|
|
122
162
|
set_result_dict_telemetry_value(result_dict, "run.id", self.header["run_id"])
|
|
123
163
|
set_result_dict_telemetry_value(result_dict, "sample.id", self.header["sample_id"])
|
|
@@ -128,11 +168,11 @@ class uBAM_Extractor:
|
|
|
128
168
|
set_result_dict_telemetry_value(result_dict, "basecalling.date", self.header["run_date"])
|
|
129
169
|
set_result_dict_telemetry_value(result_dict, "pass.threshold.qscore", str(self.threshold_Qscore))
|
|
130
170
|
|
|
131
|
-
set_result_value(self, result_dict, "read.count", len(self.
|
|
171
|
+
set_result_value(self, result_dict, "read.count", len(self.dataframe))
|
|
132
172
|
set_result_value(self, result_dict, "read.pass.count",
|
|
133
|
-
count_boolean_elements(self.
|
|
173
|
+
count_boolean_elements(self.dataframe, 'passes_filtering', True))
|
|
134
174
|
set_result_value(self, result_dict, "read.fail.count",
|
|
135
|
-
count_boolean_elements(self.
|
|
175
|
+
count_boolean_elements(self.dataframe, 'passes_filtering', False))
|
|
136
176
|
total_reads = get_result_value(self, result_dict, "read.count")
|
|
137
177
|
|
|
138
178
|
# Ratios
|
|
@@ -160,9 +200,9 @@ class uBAM_Extractor:
|
|
|
160
200
|
set_result_value(self, result_dict, "n50", compute_NXX(self.dataframe_dict, 50))
|
|
161
201
|
set_result_value(self, result_dict, "l50", compute_LXX(self.dataframe_dict, 50))
|
|
162
202
|
|
|
163
|
-
set_result_value(self, result_dict, "run.time", max(self.
|
|
203
|
+
set_result_value(self, result_dict, "run.time", max(self.dataframe['start_time']))
|
|
164
204
|
# Get channel occupancy statistics and store each value into result_dict
|
|
165
|
-
for index, value in occupancy_channel(self.
|
|
205
|
+
for index, value in occupancy_channel(self.dataframe).items():
|
|
166
206
|
set_result_value(self,
|
|
167
207
|
result_dict, "channel.occupancy.statistics." + index, value)
|
|
168
208
|
|
|
@@ -180,7 +220,7 @@ class uBAM_Extractor:
|
|
|
180
220
|
"fail.reads.sequence.length")
|
|
181
221
|
|
|
182
222
|
# Get Qscore statistics without count value and store them into result_dict
|
|
183
|
-
qscore_statistics = self.
|
|
223
|
+
qscore_statistics = self.dataframe['mean_qscore'].describe().drop(
|
|
184
224
|
"count")
|
|
185
225
|
|
|
186
226
|
for index, value in qscore_statistics.items():
|
|
@@ -190,11 +230,16 @@ class uBAM_Extractor:
|
|
|
190
230
|
# Add statistics (without count) about read pass/fail qscore in the result_dict
|
|
191
231
|
describe_dict(self, result_dict, self.dataframe_dict["pass.reads.mean.qscore"], "pass.reads.mean.qscore")
|
|
192
232
|
describe_dict(self, result_dict, self.dataframe_dict["fail.reads.mean.qscore"], "fail.reads.mean.qscore")
|
|
193
|
-
|
|
233
|
+
if self.is_barcode:
|
|
234
|
+
extract_barcode_info(self, result_dict,
|
|
235
|
+
self.barcode_selection,
|
|
236
|
+
self.dataframe_dict,
|
|
237
|
+
self.dataframe)
|
|
238
|
+
|
|
194
239
|
log_task(self.quiet, 'Extract info from uBAM file', start_time, time.time())
|
|
195
240
|
|
|
196
241
|
|
|
197
|
-
def
|
|
242
|
+
def _load_uBAM_file(self):
|
|
198
243
|
"""
|
|
199
244
|
Load uBAM dataframe
|
|
200
245
|
:return: a Pandas Dataframe object
|
|
@@ -205,23 +250,27 @@ class uBAM_Extractor:
|
|
|
205
250
|
uBAM_chunks,
|
|
206
251
|
n_process=self.thread,
|
|
207
252
|
pbar_update=self.batch_size)
|
|
208
|
-
|
|
253
|
+
uBAM_df = []
|
|
209
254
|
|
|
210
255
|
for _, f in enumerate(rst_futures):
|
|
211
|
-
|
|
256
|
+
uBAM_df.extend(f.result())
|
|
212
257
|
|
|
213
258
|
columns = ['sequence_length', 'mean_qscore', 'passes_filtering', 'start_time', 'channel', 'duration']
|
|
214
|
-
|
|
215
|
-
|
|
259
|
+
if self.is_barcode:
|
|
260
|
+
columns.append('barcode_arrangement')
|
|
216
261
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
262
|
+
uBAM_df = pd.DataFrame(uBAM_df, columns=columns)
|
|
263
|
+
|
|
264
|
+
uBAM_df['sequence_length'] = uBAM_df['sequence_length'].astype(np.uint32)
|
|
265
|
+
uBAM_df['mean_qscore'] = uBAM_df['mean_qscore'].astype(np.float32)
|
|
266
|
+
uBAM_df['passes_filtering'] = uBAM_df['passes_filtering'].astype(np.bool_ if is_numpy_1_24 else np.bool)
|
|
267
|
+
uBAM_df["start_time"] = uBAM_df["start_time"] - uBAM_df["start_time"].min()
|
|
268
|
+
uBAM_df['channel'] = uBAM_df['channel'].astype(np.int16)
|
|
269
|
+
uBAM_df['start_time'] = uBAM_df['start_time'].astype(np.float64)
|
|
270
|
+
uBAM_df['duration'] = uBAM_df['duration'].astype(np.float32)
|
|
271
|
+
if self.is_barcode:
|
|
272
|
+
uBAM_df['barcode_arrangement'] = uBAM_df['barcode_arrangement'].astype("category")
|
|
273
|
+
return uBAM_df
|
|
225
274
|
|
|
226
275
|
|
|
227
276
|
def _uBAM_batch_reader(self, uBAM_chunk):
|
|
@@ -251,14 +300,6 @@ class uBAM_Extractor:
|
|
|
251
300
|
yield batch
|
|
252
301
|
|
|
253
302
|
|
|
254
|
-
def _timeISO_to_float(self, iso_datetime, format):
|
|
255
|
-
"""
|
|
256
|
-
"""
|
|
257
|
-
dt = datetime.strptime(iso_datetime, format)
|
|
258
|
-
unix_timestamp = dt.timestamp()
|
|
259
|
-
return unix_timestamp
|
|
260
|
-
|
|
261
|
-
|
|
262
303
|
def _get_header(self):
|
|
263
304
|
sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
|
|
264
305
|
header = sam_file.header.to_dict()
|
|
@@ -299,4 +340,6 @@ class uBAM_Extractor:
|
|
|
299
340
|
attributes.get('ch', '1'), # Channel
|
|
300
341
|
attributes.get('du', '1') # Duration
|
|
301
342
|
]
|
|
343
|
+
if self.is_barcode:
|
|
344
|
+
data.append(attributes.get('BC', 'unclassified'))
|
|
302
345
|
return data
|
|
@@ -20,6 +20,8 @@
|
|
|
20
20
|
|
|
21
21
|
import numpy as np
|
|
22
22
|
from packaging import version
|
|
23
|
+
import glob
|
|
24
|
+
import os
|
|
23
25
|
|
|
24
26
|
def is_numpy_1_24():
|
|
25
27
|
"""
|
|
@@ -27,6 +29,7 @@ def is_numpy_1_24():
|
|
|
27
29
|
"""
|
|
28
30
|
return version.parse(np.__version__) >= version.parse("1.20")
|
|
29
31
|
|
|
32
|
+
|
|
30
33
|
def format_duration(t):
|
|
31
34
|
"""
|
|
32
35
|
Format a time duration
|
|
@@ -35,3 +38,28 @@ def format_duration(t):
|
|
|
35
38
|
"""
|
|
36
39
|
|
|
37
40
|
return "{:,d}m{:2.2f}s".format(int(t // 60), t % 60)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def set_result_dict_value(result_dict, key, tracking_id_dict, dict_key):
    """
    Copy one metadata entry from a Fast5 or pod5 dict into result_dict.

    :param result_dict: dictionary that receives the value
    :param key: key under which the value is stored in result_dict
    :param tracking_id_dict: metadata mapping read from the Fast5 or pod5 file
    :param dict_key: key looked up in tracking_id_dict
    """
    # An absent metadata entry is recorded as an empty string, not skipped.
    found = dict_key in tracking_id_dict
    result_dict[key] = tracking_id_dict[dict_key] if found else ''
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def find_file_in_directory(source_file, format):
    """
    Look for a suitable Fast5 or Pod5 file in the source directory.

    The directory is searched for ``*.<format>`` first, then for
    ``*.tar.bz2`` and ``*.tar.gz`` archives. The search is not recursive.

    :param source_file: path of the directory to search
    :param format: file extension to look for, without the leading dot
    :return: The path to the first suitable file in the source directory
        (in sorted order, so the choice is reproducible), or None if no
        file matches
    """
    for ext in (format, 'tar.bz2', 'tar.gz'):
        # Return a file that actually matched the pattern. Taking the first
        # entry of os.listdir() could return any unrelated file that happens
        # to sit in the same directory.
        # os.path.join supplies the separator that plain string
        # concatenation dropped when source_file had no trailing slash.
        matches = sorted(glob.glob(os.path.join(source_file, '*.' + ext)))
        if matches:
            return matches[0]

    return None
|
|
@@ -164,16 +164,24 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
164
164
|
if "unclassified" not in barcode_selection:
|
|
165
165
|
barcode_selection.append("unclassified")
|
|
166
166
|
|
|
167
|
+
|
|
168
|
+
# If the barcode_arrangement column contains a barcode kit id
|
|
169
|
+
mask = df['barcode_arrangement'].str.startswith(('SQK', 'VQK'))
|
|
170
|
+
|
|
171
|
+
if mask.any():
|
|
172
|
+
df['barcode_arrangement'] = df['barcode_arrangement'].astype(str)
|
|
173
|
+
df.loc[mask, 'barcode_arrangement'] = df.loc[mask, 'barcode_arrangement'].str.extract(r'[SV]QK-.+_(.+)$')[0]
|
|
174
|
+
|
|
167
175
|
# Create keys barcode.arrangement, and read.pass/fail.barcode in dataframe_dict with all values of
|
|
168
176
|
# column barcode_arrangement when reads are passed/failed
|
|
169
|
-
dataframe_dict["barcode.arrangement"] = df[
|
|
177
|
+
dataframe_dict["barcode.arrangement"] = df['barcode_arrangement']
|
|
178
|
+
|
|
170
179
|
|
|
171
180
|
# Print warning message if a barcode is unknown
|
|
172
|
-
barcodes_found = set(
|
|
181
|
+
barcodes_found = set(df["barcode_arrangement"].unique())
|
|
173
182
|
for element in barcode_selection:
|
|
174
183
|
if element not in barcodes_found and element != 'other barcodes':
|
|
175
|
-
sys.stderr.write("
|
|
176
|
-
|
|
184
|
+
sys.stderr.write("\033[93mWarning:\033[0m The barcode {} doesn't exist in input data\n".format(element))
|
|
177
185
|
|
|
178
186
|
# Get barcodes frequency by Bases
|
|
179
187
|
df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
|
|
@@ -218,6 +226,7 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
218
226
|
(read_fail_barcoded_count / total_reads) * 100)
|
|
219
227
|
|
|
220
228
|
# Replaces all rows with unused barcodes (ie not in barcode_selection) in column barcode_arrangement with the 'other' value
|
|
229
|
+
|
|
221
230
|
df.loc[~df['barcode_arrangement'].isin(
|
|
222
231
|
barcode_selection), 'barcode_arrangement'] = 'other barcodes'
|
|
223
232
|
|
|
@@ -447,6 +456,7 @@ def read_first_line_file(filename):
|
|
|
447
456
|
except IOError:
|
|
448
457
|
raise FileNotFoundError
|
|
449
458
|
|
|
459
|
+
|
|
450
460
|
def set_result_dict_telemetry_value(result_dict, key, new_value):
|
|
451
461
|
"""
|
|
452
462
|
"""
|
|
@@ -461,4 +471,15 @@ def set_result_dict_telemetry_value(result_dict, key, new_value):
|
|
|
461
471
|
if new_value is None:
|
|
462
472
|
new_value = current_value
|
|
463
473
|
|
|
464
|
-
result_dict[final_key] = new_value
|
|
474
|
+
result_dict[final_key] = new_value
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def pd_read_sequencing_summary(file, cols, data_type):
    """
    Read a tab-separated sequencing summary file into a DataFrame.

    If the read fails with a ValueError while ``passes_filtering`` is among
    the requested columns, that column is dropped and the file is read again.

    :param file: path (or file-like object accepted by pandas) of the summary
    :param cols: list of column names to load; ``passes_filtering`` is removed
        from it in place when the fallback is taken
    :param data_type: dict mapping column names to dtypes; its
        ``passes_filtering`` entry is removed in place when the fallback is taken
    :return: a Pandas Dataframe object
    :raises ValueError: if the file cannot be parsed with the requested
        columns even without ``passes_filtering``
    """
    try:
        return pd.read_csv(file, sep="\t", usecols=cols,
                           dtype=data_type)
    except ValueError:
        # pandas raises ValueError when a usecols entry is missing from the
        # header. Only retry when dropping 'passes_filtering' can help;
        # otherwise let the original error reach the caller unmasked.
        if 'passes_filtering' not in cols:
            raise
        # NOTE(review): the arguments are still mutated in place, as before,
        # in case callers inspect them afterwards - confirm against callers.
        data_type.pop('passes_filtering', None)
        cols.remove('passes_filtering')
        return pd.read_csv(file, sep="\t", usecols=cols,
                           dtype=data_type)
|