toulligqc 2.6__tar.gz → 2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {toulligqc-2.6 → toulligqc-2.7}/PKG-INFO +4 -4
- {toulligqc-2.6 → toulligqc-2.7}/README.md +71 -17
- {toulligqc-2.6 → toulligqc-2.7}/setup.py +7 -7
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/bam_extractor.py +49 -34
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/extractor_common.py +13 -4
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/fastq_bam_common.py +17 -2
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/fastq_extractor.py +7 -5
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/html_report_generator.py +2 -2
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/plotly_graph_common.py +20 -8
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/plotly_graph_generator.py +12 -8
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/pod5_extractor.py +1 -1
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/sequencing_summary_extractor.py +53 -26
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/sequencing_summary_onedsquare_extractor.py +7 -11
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/toulligqc.py +33 -8
- toulligqc-2.7/toulligqc/version.py +1 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/PKG-INFO +4 -4
- toulligqc-2.7/toulligqc.egg-info/requires.txt +11 -0
- toulligqc-2.6/toulligqc/version.py +0 -1
- toulligqc-2.6/toulligqc.egg-info/requires.txt +0 -10
- {toulligqc-2.6 → toulligqc-2.7}/AUTHORS +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/LICENSE-CeCILL.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/LICENSE.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/MANIFEST.in +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/setup.cfg +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/test/test_sequencing_summary_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/__init__.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/common.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/common_statistics.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/configuration.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/fast5_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/report_data_file_generator.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/resources/plotly-latest.min.js +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/resources/toulligqc.css +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/resources/toulligqc.png +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/sequencing_telemetry_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc/toulligqc_info_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/SOURCES.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/dependency_links.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/entry_points.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/not-zip-safe +0 -0
- {toulligqc-2.6 → toulligqc-2.7}/toulligqc.egg-info/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.7
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/GenomiqueENS/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
7
|
-
Author-email: toulligqc@
|
|
7
|
+
Author-email: toulligqc@bio.ens.psl.eu
|
|
8
8
|
License: GPL V3
|
|
9
9
|
Keywords: Nanopore MinION QC report
|
|
10
10
|
Platform: ALL
|
|
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
@@ -24,6 +24,7 @@ Support is availlable on [GitHub issue page](https://github.com/GenomicParisCent
|
|
|
24
24
|
* 1.3 [Docker](#docker)
|
|
25
25
|
* [Docker image recovery](#docker-image-recovery)
|
|
26
26
|
* [Launching Docker image with docker run](#launching-Docker-image-with-docker-run)
|
|
27
|
+
* 1.4 [nf-core module](#nfcore-module)
|
|
27
28
|
|
|
28
29
|
* 2.[Usage](#usage)
|
|
29
30
|
* 2.1 [Command line](#command-line)
|
|
@@ -93,14 +94,25 @@ $ docker run -ti \
|
|
|
93
94
|
-v /path/to/basecaller/sequencing/summary/file:/path/to/basecaller/sequencing/summary/file \
|
|
94
95
|
-v /path/to/basecaller/sequencing/telemetry/file:/path/to/basecaller/telemetry/summary/file \
|
|
95
96
|
-v /path/to/result/directory:/path/to/result/directory \
|
|
96
|
-
toulligqc:latest
|
|
97
|
+
genomicpariscentre/toulligqc:latest
|
|
97
98
|
```
|
|
99
|
+
|
|
100
|
+
<a name="nfcore-module"></a>
|
|
101
|
+
### 1.4 Using nf-core module
|
|
102
|
+
ToulligQC is also available on nf-core as a module written in nextflow. To install nf-core on your system, please visit their website (<https://nf-co.re/docs/usage/introduction>).
|
|
103
|
+
|
|
104
|
+
The following command line will install the latest version of the ToulligQC module:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
$ nf-core modules install toulligqc
|
|
108
|
+
```
|
|
109
|
+
|
|
98
110
|
<a name="usage"></a>
|
|
99
111
|
## 2. Usage
|
|
100
112
|
<a name="command-line"></a>
|
|
101
113
|
|
|
102
114
|
ToulligQC is adapted to RNA-Seq along with DNA-Seq and it is compatible with 1D² runs.
|
|
103
|
-
This QC tool supports only Guppy basecalling ouput files.
|
|
115
|
+
This QC tool supports only Guppy and Dorado basecalling output files.
|
|
104
116
|
It also needs a single FAST5 file (to catch the flowcell ID and the run date) if a telemetry file is not provided.
|
|
105
117
|
Flow cells and kits version are retrieved using the telemetry file.
|
|
106
118
|
ToulligQC can take barcoding samples by adding the barcode list as a command line option.
|
|
@@ -111,7 +123,7 @@ To do so, ToulligQC deals with different file formats: gz, tar.gz, bz2, tar.bz2
|
|
|
111
123
|
This tool will produce a set of graphs, statistic file in plain text format and a HTML report.
|
|
112
124
|
|
|
113
125
|
|
|
114
|
-
To run ToulligQC you need the Guppy basecaller output files : ```sequencing_summary.txt``` and ```sequencing_telemetry.js```. or ```FASTQ``` or ```BAM```
|
|
126
|
+
To run ToulligQC you need the Guppy/Dorado basecaller output files: ```sequencing_summary.txt``` and ```sequencing_telemetry.js```, or ```FASTQ``` or ```BAM```.
|
|
115
127
|
This can be compressed with gzip or bzip2.
|
|
116
128
|
You can use your initial Fast5 ONT file too.
|
|
117
129
|
ToulligQC can perform analyses on your data if the directory is organised as the following:
|
|
@@ -132,7 +144,7 @@ RUN_ID
|
|
|
132
144
|
└── sequencing_1dsq_summary.txt
|
|
133
145
|
```
|
|
134
146
|
|
|
135
|
-
For a barcoded run you can add the barcoding files generated by Guppy ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
147
|
+
For a barcoded run you can add the barcoding files generated by Guppy/Dorado ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
136
148
|
|
|
137
149
|
For the barcode list to use in the command line options, ToulligQC handles the following naming schemes: BCXX, RBXX, NBXX and barcodeXX where XX is the number of the barcode.
|
|
138
150
|
The barcode naming schemes are case insensitive.
|
|
@@ -156,14 +168,16 @@ This is a directory for 1D² analysis with barcoding files:
|
|
|
156
168
|
|
|
157
169
|
General Options:
|
|
158
170
|
```
|
|
159
|
-
usage: ToulligQC V2.
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
171
|
+
usage: ToulligQC V2.6 [-a SEQUENCING_SUMMARY_SOURCE] [-t TELEMETRY_SOURCE]
|
|
172
|
+
[-f FAST5_SOURCE] [-p POD5_SOURCE] [-q FASTQ] [-u BAM]
|
|
173
|
+
[--thread THREAD] [--batch-size BATCH_SIZE] [--qscore-threshold THRESHOLD]
|
|
174
|
+
[-n REPORT_NAME] [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
|
|
175
|
+
[--data-report-path DATA_REPORT_PATH]
|
|
176
|
+
[--images-directory IMAGES_DIRECTORY]
|
|
177
|
+
[-d SEQUENCING_SUMMARY_1DSQR_SOURCE]
|
|
178
|
+
[-s SAMPLESHEET]
|
|
179
|
+
[-b] [-l BARCODES]
|
|
180
|
+
[--quiet] [--force] [-h] [--version]
|
|
167
181
|
|
|
168
182
|
required arguments:
|
|
169
183
|
-a SEQUENCING_SUMMARY_SOURCE, --sequencing-summary-source SEQUENCING_SUMMARY_SOURCE
|
|
@@ -175,6 +189,9 @@ required arguments:
|
|
|
175
189
|
-f FAST5_SOURCE, --fast5-source FAST5_SOURCE
|
|
176
190
|
Fast5 file source (necessary if no telemetry file),
|
|
177
191
|
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
192
|
+
-p POD5_SOURCE, --pod5-source POD5_SOURCE
|
|
193
|
+
pod5 file source (necessary if no telemetry file),
|
|
194
|
+
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
178
195
|
-q FASTQ, --fastq FASTQ
|
|
179
196
|
FASTQ file (necessary if no sequencing summary file),
|
|
180
197
|
can also be in a .gz archive
|
|
@@ -183,6 +200,8 @@ required arguments:
|
|
|
183
200
|
can also be a SAM format
|
|
184
201
|
|
|
185
202
|
optional arguments:
|
|
203
|
+
-s SAMPLESHEET, --samplesheet SAMPLESHEET
|
|
204
|
+
Samplesheet (.csv file) to fill out sample names in MinKNOW.
|
|
186
205
|
-n REPORT_NAME, --report-name REPORT_NAME
|
|
187
206
|
Report name
|
|
188
207
|
--output-directory OUTPUT
|
|
@@ -197,8 +216,9 @@ optional arguments:
|
|
|
197
216
|
Basecaller 1dsq summary source
|
|
198
217
|
-b, --barcoding Option for barcode usage
|
|
199
218
|
-l BARCODES, --barcodes BARCODES
|
|
200
|
-
|
|
201
|
-
BC05,RB09,NB01,barcode10)
|
|
219
|
+
Comma-separated barcode list (e.g.,
|
|
220
|
+
BC05,RB09,NB01,barcode10) or a range separated with ':' (e.g.,
|
|
221
|
+
barcode01:barcode19)
|
|
202
222
|
--thread THREAD Number of threads for parsing FASTQ or BAM files (default: 2).
|
|
203
223
|
--batch-size BATCH_SIZE Batch size for each threads (default: 500).
|
|
204
224
|
--qscore-threshold THRESHOLD Q-score threshold to distinguish between passing filter and
|
|
@@ -213,7 +233,41 @@ optional arguments:
|
|
|
213
233
|
* #### Examples
|
|
214
234
|
|
|
215
235
|
|
|
216
|
-
|
|
236
|
+
* Sequencing summary alone \
|
|
237
|
+
Note that the flowcell ID and run date will be missing from the report, because they are only found in the telemetry file or in a single FAST5 file
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
$ toulligqc --report-name summary_only \
|
|
241
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
242
|
+
--html-report-path /path/to/output/report.html
|
|
243
|
+
```
|
|
244
|
+
* Sequencing summary + telemetry file
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
$ toulligqc --report-name summary_plus_telemetry \
|
|
248
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
249
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
250
|
+
--html-report-path /path/to/output/report.html
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
* Telemetry file + fast5 files
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
$ toulligqc --report-name telemetry_plus_fast5 \
|
|
257
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
258
|
+
--fast5-source /path/to/basecaller/output/fast5_files.fast5.gz \
|
|
259
|
+
--html-report-path /path/to/output/report.html
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
* FASTQ/BAM files only (for BAM input, replace `--fastq` with `--bam`)
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
$ toulligqc --report-name FAF0256 \
|
|
266
|
+
--fastq /path/to/basecaller/output/fastq_files.fq.gz \
|
|
267
|
+
--html-report-path /path/to/output/report.html
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
* Optional arguments for 1D² analysis
|
|
217
271
|
|
|
218
272
|
```bash
|
|
219
273
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -223,7 +277,7 @@ $ toulligqc --report-name FAF0256 \
|
|
|
223
277
|
--html-report-path /path/to/output/report.html
|
|
224
278
|
```
|
|
225
279
|
|
|
226
|
-
|
|
280
|
+
* Optional arguments to deal with barcoded samples
|
|
227
281
|
|
|
228
282
|
```bash
|
|
229
283
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -271,7 +325,7 @@ $ toulligqc \
|
|
|
271
325
|
--sequencing-summary-source sequencing_summary.txt \
|
|
272
326
|
--sequencing-summary-source barcoding_summary_pass.txt \
|
|
273
327
|
--sequencing-summary-source barcoding_summary_fail.txt \
|
|
274
|
-
--barcodes BC01
|
|
328
|
+
--barcodes BC01:BC07 \
|
|
275
329
|
--output-directory output
|
|
276
330
|
```
|
|
277
331
|
|
|
@@ -14,11 +14,11 @@ setup(
|
|
|
14
14
|
long_description='See project website for more information.',
|
|
15
15
|
|
|
16
16
|
# The project's main homepage.
|
|
17
|
-
url='https://github.com/
|
|
17
|
+
url='https://github.com/GenomiqueENS/toulligQC',
|
|
18
18
|
|
|
19
19
|
# Author details
|
|
20
20
|
author='Genomic Paris Centre team',
|
|
21
|
-
author_email='toulligqc@
|
|
21
|
+
author_email='toulligqc@bio.ens.psl.eu',
|
|
22
22
|
|
|
23
23
|
license='GPL V3',
|
|
24
24
|
platforms='ALL',
|
|
@@ -34,7 +34,7 @@ setup(
|
|
|
34
34
|
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
|
35
35
|
'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',
|
|
36
36
|
|
|
37
|
-
'Programming Language :: Python :: 3.
|
|
37
|
+
'Programming Language :: Python :: 3.12'
|
|
38
38
|
],
|
|
39
39
|
|
|
40
40
|
keywords='Nanopore MinION QC report',
|
|
@@ -46,10 +46,10 @@ setup(
|
|
|
46
46
|
include_package_data=True,
|
|
47
47
|
|
|
48
48
|
python_requires='>=3.11.0',
|
|
49
|
-
install_requires=['matplotlib>=3.6.3', 'plotly
|
|
50
|
-
'pandas>=1.
|
|
51
|
-
'scikit-learn>=1.
|
|
52
|
-
'pod5>=0.3.6'],
|
|
49
|
+
install_requires=['matplotlib>=3.6.3', 'plotly==5.15.0', 'h5py>=3.10.0',
|
|
50
|
+
'pandas>=2.1.4', 'numpy>=1.26.4', 'scipy>=1.11.4',
|
|
51
|
+
'scikit-learn>=1.4.1', 'tqdm>=4.66.2', 'pysam>=0.22.0',
|
|
52
|
+
'pod5>=0.3.10', 'ezcharts==0.7.6'],
|
|
53
53
|
|
|
54
54
|
entry_points={
|
|
55
55
|
'console_scripts': [
|
|
@@ -64,8 +64,9 @@ class uBAM_Extractor:
|
|
|
64
64
|
|
|
65
65
|
# Add missing categories
|
|
66
66
|
if 'barcode_arrangement' in self.dataframe.columns:
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
self.dataframe['barcode_arrangement'] = self.dataframe['barcode_arrangement'].cat.add_categories([0,
|
|
68
|
+
'other barcodes',
|
|
69
|
+
'passes_filtering'])
|
|
69
70
|
|
|
70
71
|
# Replace all NaN values by 0 to avoid data manipulation errors when columns are not the same length
|
|
71
72
|
self.dataframe = self.dataframe.fillna(0)
|
|
@@ -124,21 +125,29 @@ class uBAM_Extractor:
|
|
|
124
125
|
add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
|
|
125
126
|
add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
|
|
126
127
|
if self.is_barcode:
|
|
128
|
+
if "barcode_alias" in self.config_dictionary:
|
|
129
|
+
barcode_alias = self.config_dictionary['barcode_alias']
|
|
130
|
+
else:
|
|
131
|
+
barcode_alias = None
|
|
127
132
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
128
133
|
self.barcode_selection,
|
|
129
|
-
self.images_directory
|
|
134
|
+
self.images_directory,
|
|
135
|
+
barcode_alias))
|
|
130
136
|
|
|
131
137
|
read_fail = self.dataframe_dict["read.fail.barcoded"]
|
|
132
138
|
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
|
|
133
139
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
|
|
134
140
|
self.barcode_selection,
|
|
135
|
-
self.images_directory
|
|
141
|
+
self.images_directory,
|
|
142
|
+
barcode_alias))
|
|
136
143
|
|
|
137
144
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
|
|
138
|
-
self.images_directory
|
|
145
|
+
self.images_directory,
|
|
146
|
+
barcode_alias))
|
|
139
147
|
|
|
140
148
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
|
|
141
|
-
self.images_directory
|
|
149
|
+
self.images_directory,
|
|
150
|
+
barcode_alias))
|
|
142
151
|
return images
|
|
143
152
|
|
|
144
153
|
|
|
@@ -271,8 +280,10 @@ class uBAM_Extractor:
|
|
|
271
280
|
"""
|
|
272
281
|
#def process_bam_chunk(bam_chunk):
|
|
273
282
|
rec_data = []
|
|
283
|
+
record_count = 0
|
|
274
284
|
for rec in uBAM_chunk:
|
|
275
|
-
|
|
285
|
+
record_count += 1
|
|
286
|
+
rec_dict = self._process_record(rec, record_count)
|
|
276
287
|
rec_data.append(rec_dict)
|
|
277
288
|
return rec_data
|
|
278
289
|
|
|
@@ -290,41 +301,45 @@ class uBAM_Extractor:
|
|
|
290
301
|
|
|
291
302
|
|
|
292
303
|
def _get_header(self):
|
|
293
|
-
|
|
294
|
-
header =
|
|
295
|
-
run_id, model_version_id =
|
|
304
|
+
sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
|
|
305
|
+
header = sam_file.header.to_dict()
|
|
306
|
+
run_id, model_version_id = extract_headerTag(header, 'RG','ID',
|
|
307
|
+
'Unknown_Unknown').split('_', 1)
|
|
296
308
|
self.header = {
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
309
|
+
"run_id": run_id,
|
|
310
|
+
"run_date": extract_headerTag(header, 'RG', 'DT', 'Unknown'),
|
|
311
|
+
"sample_id": extract_headerTag(header, 'RG', 'SM', 'Unknown'),
|
|
312
|
+
"basecaller": extract_headerTag(header, 'PG', 'PN', 'Unknown'),
|
|
313
|
+
"basecaller_version": extract_headerTag(header, 'PG', 'VN', 'Unknown'),
|
|
314
|
+
"model_version_id": model_version_id,
|
|
315
|
+
"flow_cell_id": extract_headerTag(header, 'RG', 'PU', 'Unknown')
|
|
304
316
|
}
|
|
305
317
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
def _process_record(self, rec):
|
|
318
|
+
|
|
319
|
+
def _process_record(self, rec, record_count):
|
|
309
320
|
"""
|
|
310
321
|
extract QC info from BAM record
|
|
311
322
|
return : dict of QC info
|
|
312
323
|
"""
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
324
|
+
fields = rec.split("\t")
|
|
325
|
+
|
|
326
|
+
# Parse optional fields
|
|
327
|
+
attributes = {}
|
|
328
|
+
for t in fields[11:]:
|
|
329
|
+
k, t, v = t.split(':', 2)
|
|
330
|
+
attributes[k] = v
|
|
331
|
+
|
|
332
|
+
iso_start_time = attributes.get('st', None)
|
|
333
|
+
qual = avg_qual(fields[10])
|
|
318
334
|
passes_filtering = True if qual > self.threshold_Qscore else False
|
|
319
335
|
data = [
|
|
320
|
-
len(
|
|
321
|
-
qual,
|
|
322
|
-
passes_filtering,
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
336
|
+
len(fields[9]), # read length
|
|
337
|
+
qual, # AVG Qscore
|
|
338
|
+
passes_filtering, # Passing filter
|
|
339
|
+
float(record_count) if iso_start_time is None else timeISO_to_float(iso_start_time, '%Y-%m-%dT%H:%M:%S.%f%z'), # start time
|
|
340
|
+
attributes.get('ch', '1'), # Channel
|
|
341
|
+
attributes.get('du', '1') # Duration
|
|
326
342
|
]
|
|
327
343
|
if self.is_barcode:
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
return data
|
|
344
|
+
data.append(attributes.get('BC', 'unclassified'))
|
|
345
|
+
return data
|
|
@@ -164,16 +164,24 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
164
164
|
if "unclassified" not in barcode_selection:
|
|
165
165
|
barcode_selection.append("unclassified")
|
|
166
166
|
|
|
167
|
+
|
|
168
|
+
# If the barcode_arrangement column contains a barcode kit id
|
|
169
|
+
mask = df['barcode_arrangement'].str.startswith(('SQK', 'VQK'))
|
|
170
|
+
|
|
171
|
+
if mask.any():
|
|
172
|
+
df['barcode_arrangement'] = df['barcode_arrangement'].astype(str)
|
|
173
|
+
df.loc[mask, 'barcode_arrangement'] = df.loc[mask, 'barcode_arrangement'].str.extract(r'[SV]QK-.+_(.+)$')[0]
|
|
174
|
+
|
|
167
175
|
# Create keys barcode.arrangement, and read.pass/fail.barcode in dataframe_dict with all values of
|
|
168
176
|
# column barcode_arrangement when reads are passed/failed
|
|
169
|
-
dataframe_dict["barcode.arrangement"] = df[
|
|
177
|
+
dataframe_dict["barcode.arrangement"] = df['barcode_arrangement']
|
|
178
|
+
|
|
170
179
|
|
|
171
180
|
# Print warning message if a barcode is unknown
|
|
172
|
-
barcodes_found = set(
|
|
181
|
+
barcodes_found = set(df["barcode_arrangement"].unique())
|
|
173
182
|
for element in barcode_selection:
|
|
174
183
|
if element not in barcodes_found and element != 'other barcodes':
|
|
175
|
-
sys.stderr.write("
|
|
176
|
-
|
|
184
|
+
sys.stderr.write("\033[93mWarning:\033[0m The barcode {} doesn't exist in input data\n".format(element))
|
|
177
185
|
|
|
178
186
|
# Get barcodes frequency by Bases
|
|
179
187
|
df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
|
|
@@ -218,6 +226,7 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
218
226
|
(read_fail_barcoded_count / total_reads) * 100)
|
|
219
227
|
|
|
220
228
|
# Replaces all rows with unused barcodes (ie not in barcode_selection) in column barcode_arrangement with the 'other' value
|
|
229
|
+
|
|
221
230
|
df.loc[~df['barcode_arrangement'].isin(
|
|
222
231
|
barcode_selection), 'barcode_arrangement'] = 'other barcodes'
|
|
223
232
|
|
|
@@ -2,8 +2,23 @@ import multiprocessing as mp
|
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
4
4
|
|
|
5
|
-
def extract_headerTag(header, tagGroup, tag):
|
|
6
|
-
|
|
5
|
+
def extract_headerTag(header, tagGroup, tag, defaultValue = None):
|
|
6
|
+
|
|
7
|
+
if tagGroup not in header:
|
|
8
|
+
if defaultValue is not None:
|
|
9
|
+
return defaultValue
|
|
10
|
+
else:
|
|
11
|
+
raise KeyError(tagGroup)
|
|
12
|
+
|
|
13
|
+
first_entry = header[tagGroup][0]
|
|
14
|
+
|
|
15
|
+
if tag not in first_entry:
|
|
16
|
+
if defaultValue is not None:
|
|
17
|
+
return defaultValue
|
|
18
|
+
else:
|
|
19
|
+
raise KeyError(tag)
|
|
20
|
+
|
|
21
|
+
return first_entry[tag]
|
|
7
22
|
|
|
8
23
|
|
|
9
24
|
def batch_iterator(iterator, batch_size):
|
|
@@ -64,8 +64,9 @@ class fastqExtractor:
|
|
|
64
64
|
|
|
65
65
|
# Add missing categories
|
|
66
66
|
if 'barcode_arrangement' in self.dataframe_1d.columns:
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
self.dataframe_1d['barcode_arrangement'] = self.dataframe_1d['barcode_arrangement'].cat.add_categories([0,
|
|
68
|
+
'other barcodes',
|
|
69
|
+
'passes_filtering'])
|
|
69
70
|
self.dataframe_1d = self.dataframe_1d.fillna(0)
|
|
70
71
|
self.barcode_selection = self.config_dictionary['barcode_selection']
|
|
71
72
|
|
|
@@ -326,9 +327,10 @@ class fastqExtractor:
|
|
|
326
327
|
fastq_lines.append((len(read[1]), qscore, passes_filtering, start_time, ch))
|
|
327
328
|
else:
|
|
328
329
|
for read in read_batch:
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
330
|
+
if len(read)>0:
|
|
331
|
+
qscore = avg_qual(read)
|
|
332
|
+
passes_filtering = True if qscore > self.threshold_Qscore else False
|
|
333
|
+
fastq_lines.append((len(read), qscore, passes_filtering))
|
|
332
334
|
return fastq_lines
|
|
333
335
|
|
|
334
336
|
|
|
@@ -72,7 +72,7 @@ def html_report(config_dictionary, result_dict, graphs):
|
|
|
72
72
|
report = """<!doctype html>
|
|
73
73
|
<html>
|
|
74
74
|
<head>
|
|
75
|
-
<title>
|
|
75
|
+
<title>ToulligQC: {report_name} </title>
|
|
76
76
|
<meta charset='UTF-8'>
|
|
77
77
|
<script>{plotlyjs}</script>
|
|
78
78
|
|
|
@@ -91,7 +91,7 @@ def html_report(config_dictionary, result_dict, graphs):
|
|
|
91
91
|
<div id="header_filename">
|
|
92
92
|
Sample ID: {sample_id} <br>
|
|
93
93
|
Run date: {run_date} <br>
|
|
94
|
-
Report date
|
|
94
|
+
Report date: {report_date} <br>
|
|
95
95
|
</div>
|
|
96
96
|
</div>
|
|
97
97
|
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
|
|
23
23
|
from collections import defaultdict
|
|
24
24
|
|
|
25
|
+
import pkgutil
|
|
25
26
|
import numpy as np
|
|
26
27
|
import pandas as pd
|
|
27
28
|
import plotly.graph_objs as go
|
|
@@ -301,6 +302,10 @@ def _transparent_component(c, b, a):
|
|
|
301
302
|
return '0' + r
|
|
302
303
|
return r
|
|
303
304
|
|
|
305
|
+
def _copy_latest_minjs(result_directory, js_file):
|
|
306
|
+
with open(result_directory + '/' + js_file , 'w+') as f:
|
|
307
|
+
plotly_min_js = pkgutil.get_data(__name__, "resources/plotly-latest.min.js").decode('utf8')
|
|
308
|
+
f.write(plotly_min_js)
|
|
304
309
|
|
|
305
310
|
def _create_and_save_div(fig, result_directory, main):
|
|
306
311
|
div = py.plot(fig,
|
|
@@ -311,11 +316,13 @@ def _create_and_save_div(fig, result_directory, main):
|
|
|
311
316
|
|
|
312
317
|
if result_directory is not None:
|
|
313
318
|
output_file = result_directory + '/' + '_'.join(main.split())
|
|
319
|
+
js_file="plotly.min.js"
|
|
314
320
|
py.plot(fig,
|
|
315
321
|
filename=output_file,
|
|
316
322
|
output_type="file",
|
|
317
|
-
include_plotlyjs=
|
|
323
|
+
include_plotlyjs= js_file,
|
|
318
324
|
auto_open=False)
|
|
325
|
+
_copy_latest_minjs(result_directory, js_file)
|
|
319
326
|
else:
|
|
320
327
|
output_file = None
|
|
321
328
|
|
|
@@ -476,7 +483,7 @@ def _over_time_graph(data_series,
|
|
|
476
483
|
|
|
477
484
|
|
|
478
485
|
def _barcode_boxplot_graph(graph_name, df, barcode_selection, pass_color, fail_color, yaxis_title, legend_title,
|
|
479
|
-
result_directory):
|
|
486
|
+
result_directory, barcode_alias=None):
|
|
480
487
|
# Sort reads by read type and drop read type column
|
|
481
488
|
pass_df = df.loc[df['passes_filtering'] == bool(True)].drop(columns='passes_filtering')
|
|
482
489
|
fail_df = df.loc[df['passes_filtering'] == bool(False)].drop(columns='passes_filtering')
|
|
@@ -504,7 +511,7 @@ def _barcode_boxplot_graph(graph_name, df, barcode_selection, pass_color, fail_c
|
|
|
504
511
|
lowerfence=[d['lowerfence']],
|
|
505
512
|
upperfence=[d['upperfence']],
|
|
506
513
|
name=read_type + " reads",
|
|
507
|
-
x0=barcode,
|
|
514
|
+
x0=barcode_alias.get(barcode, barcode) if barcode_alias else barcode,
|
|
508
515
|
marker_color=color,
|
|
509
516
|
offsetgroup=read_type.lower(),
|
|
510
517
|
showlegend=first
|
|
@@ -539,10 +546,12 @@ def _barcode_boxplot_graph(graph_name, df, barcode_selection, pass_color, fail_c
|
|
|
539
546
|
return graph_name, output_file, table_html, div
|
|
540
547
|
|
|
541
548
|
|
|
542
|
-
def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, result_directory):
|
|
549
|
+
def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, result_directory, barcode_alias=None):
|
|
543
550
|
read_count_sorted = count_sorted[0]
|
|
544
551
|
base_count_sorted = count_sorted[1]
|
|
545
552
|
labels = read_count_sorted.index.values.tolist()
|
|
553
|
+
if barcode_alias:
|
|
554
|
+
labels = [barcode_alias.get(label, label) for label in labels]
|
|
546
555
|
|
|
547
556
|
fig = go.Figure()
|
|
548
557
|
|
|
@@ -622,9 +631,9 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
|
|
|
622
631
|
method="update"
|
|
623
632
|
),
|
|
624
633
|
dict(
|
|
625
|
-
args=[{'visible': [False, False,
|
|
634
|
+
args=[{'visible': [False, False, True, False]},
|
|
626
635
|
{**_xaxis('Barcodes', dict(visible=True)),
|
|
627
|
-
**_yaxis('
|
|
636
|
+
**_yaxis('Read count', dict(visible=True)),
|
|
628
637
|
'plot_bgcolor': plotly_background_color}],
|
|
629
638
|
label="Reads Histogram",
|
|
630
639
|
method="update"
|
|
@@ -638,9 +647,9 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
|
|
|
638
647
|
method="update"
|
|
639
648
|
),
|
|
640
649
|
dict(
|
|
641
|
-
args=[{'visible': [False, False,
|
|
650
|
+
args=[{'visible': [False, False, False, True]},
|
|
642
651
|
{**_xaxis('Barcodes', dict(visible=True)),
|
|
643
|
-
**_yaxis('
|
|
652
|
+
**_yaxis('Base count', dict(visible=True)),
|
|
644
653
|
'plot_bgcolor': plotly_background_color}],
|
|
645
654
|
label="Bases Histogram",
|
|
646
655
|
method="update"
|
|
@@ -664,6 +673,9 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
|
|
|
664
673
|
barcode_table = pd.DataFrame({"Barcode arrangement (%)": read_count_sorted / sum(read_count_sorted) * 100,
|
|
665
674
|
count_col_name: read_count_sorted,
|
|
666
675
|
"Base count": base_count_sorted})
|
|
676
|
+
if barcode_alias:
|
|
677
|
+
barcode_table = barcode_table.rename(index=barcode_alias)
|
|
678
|
+
|
|
667
679
|
barcode_table.sort_index(inplace=True)
|
|
668
680
|
pd.options.display.float_format = percent_format_str.format
|
|
669
681
|
barcode_table[count_col_name] = barcode_table[count_col_name].astype(int).apply(lambda x: _format_int(x))
|
|
@@ -598,7 +598,7 @@ def plot_performance(df, result_directory):
|
|
|
598
598
|
#
|
|
599
599
|
|
|
600
600
|
|
|
601
|
-
def barcode_percentage_pie_chart_pass(dataframe_dict, barcode_selection, result_directory):
|
|
601
|
+
def barcode_percentage_pie_chart_pass(dataframe_dict, barcode_selection, result_directory, barcode_alias):
|
|
602
602
|
"""
|
|
603
603
|
Plots a pie chart of 1D read pass percentage per barcode of a run.
|
|
604
604
|
"""
|
|
@@ -612,10 +612,11 @@ def barcode_percentage_pie_chart_pass(dataframe_dict, barcode_selection, result_
|
|
|
612
612
|
count_sorted=[read_count_sorted, base_count_sorted],
|
|
613
613
|
color_palette=toulligqc_colors['pie_chart_palette'],
|
|
614
614
|
one_d_square=False,
|
|
615
|
-
result_directory=result_directory
|
|
615
|
+
result_directory=result_directory,
|
|
616
|
+
barcode_alias=barcode_alias)
|
|
616
617
|
|
|
617
618
|
|
|
618
|
-
def barcode_percentage_pie_chart_fail(dataframe_dict, barcode_selection, result_directory):
|
|
619
|
+
def barcode_percentage_pie_chart_fail(dataframe_dict, barcode_selection, result_directory, barcode_alias):
|
|
619
620
|
"""
|
|
620
621
|
Plots a pie chart of 1D read fail percentage per barcode of a run.
|
|
621
622
|
Needs the samplesheet file describing the barcodes to run
|
|
@@ -630,10 +631,11 @@ def barcode_percentage_pie_chart_fail(dataframe_dict, barcode_selection, result_
|
|
|
630
631
|
count_sorted=[read_count_sorted, base_count_sorted],
|
|
631
632
|
color_palette=toulligqc_colors['pie_chart_palette'],
|
|
632
633
|
one_d_square=False,
|
|
633
|
-
result_directory=result_directory
|
|
634
|
+
result_directory=result_directory,
|
|
635
|
+
barcode_alias=barcode_alias)
|
|
634
636
|
|
|
635
637
|
|
|
636
|
-
def barcode_length_boxplot(datafame_dict, result_directory):
|
|
638
|
+
def barcode_length_boxplot(datafame_dict, result_directory, barcode_alias):
|
|
637
639
|
"""
|
|
638
640
|
Boxplots all the 1D pass and fail read length for each barcode indicated in the sample sheet
|
|
639
641
|
"""
|
|
@@ -649,10 +651,11 @@ def barcode_length_boxplot(datafame_dict, result_directory):
|
|
|
649
651
|
fail_color=toulligqc_colors['fail'],
|
|
650
652
|
yaxis_title="Sequence length (bp)",
|
|
651
653
|
legend_title="Read type",
|
|
652
|
-
result_directory=result_directory
|
|
654
|
+
result_directory=result_directory,
|
|
655
|
+
barcode_alias=barcode_alias)
|
|
653
656
|
|
|
654
657
|
|
|
655
|
-
def barcoded_phred_score_frequency(dataframe_dict, result_directory):
|
|
658
|
+
def barcoded_phred_score_frequency(dataframe_dict, result_directory, barcode_alias):
|
|
656
659
|
"""
|
|
657
660
|
Plot boxplot of the 1D pass and fail read qscore for each barcode indicated in the sample sheet
|
|
658
661
|
"""
|
|
@@ -668,7 +671,8 @@ def barcoded_phred_score_frequency(dataframe_dict, result_directory):
|
|
|
668
671
|
fail_color=toulligqc_colors['fail'],
|
|
669
672
|
yaxis_title="PHRED score",
|
|
670
673
|
legend_title="Read type",
|
|
671
|
-
result_directory=result_directory
|
|
674
|
+
result_directory=result_directory,
|
|
675
|
+
barcode_alias=barcode_alias)
|
|
672
676
|
|
|
673
677
|
|
|
674
678
|
def sequence_length_over_time(dataframe_dict, result_directory):
|
|
@@ -183,7 +183,7 @@ class Pod5Extractor:
|
|
|
183
183
|
if self.pod5_file_extension == 'tar' or \
|
|
184
184
|
self.pod5_file_extension == 'tar.gz' or \
|
|
185
185
|
self.pod5_file_extension == 'tar.bz2':
|
|
186
|
-
self.
|
|
186
|
+
self.pod5_file = self._pod5_tar_extraction(self.file_to_process, self.pod5_file_extension,
|
|
187
187
|
self.temporary_directory)
|
|
188
188
|
elif self.pod5_file_extension == 'pod5' or self.pod5_file_extension == '.pod5':
|
|
189
189
|
self.pod5_file = self.file_to_process
|
|
@@ -63,6 +63,7 @@ class SequencingSummaryExtractor:
|
|
|
63
63
|
self.sequencing_summary_source = config_dictionary['sequencing_summary_source']
|
|
64
64
|
self.images_directory = config_dictionary['images_directory']
|
|
65
65
|
self.sequencing_summary_files = self.sequencing_summary_source.split('\t')
|
|
66
|
+
self.barcode_colname = 'barcode_arrangement'
|
|
66
67
|
self.threshold_Qscore = int(config_dictionary['threshold'])
|
|
67
68
|
if 'quiet' not in config_dictionary or config_dictionary['quiet'].lower() != 'true':
|
|
68
69
|
self.quiet = False
|
|
@@ -74,6 +75,8 @@ class SequencingSummaryExtractor:
|
|
|
74
75
|
for f in self.sequencing_summary_files:
|
|
75
76
|
if self._is_barcode_file(f) or self._is_sequencing_summary_with_barcodes(f):
|
|
76
77
|
self.is_barcode = True
|
|
78
|
+
self._get_barcode_colname(f)
|
|
79
|
+
break
|
|
77
80
|
|
|
78
81
|
def check_conf(self):
|
|
79
82
|
"""
|
|
@@ -107,17 +110,20 @@ class SequencingSummaryExtractor:
|
|
|
107
110
|
start_time = time.time()
|
|
108
111
|
|
|
109
112
|
self.dataframe_1d = self._load_sequencing_summary_data()
|
|
113
|
+
|
|
110
114
|
if self.dataframe_1d.empty:
|
|
111
115
|
raise pd.errors.EmptyDataError("Dataframe is empty")
|
|
112
116
|
|
|
113
117
|
# Rename 'sequence_length_template' and 'mean_qscore_template'
|
|
114
118
|
self.dataframe_1d.rename(columns={'sequence_length_template': 'sequence_length',
|
|
115
119
|
'mean_qscore_template': 'mean_qscore'}, inplace=True)
|
|
120
|
+
|
|
121
|
+
# Rename 'barcode_arrangement'
|
|
122
|
+
if self.is_barcode and self.barcode_colname == "barcode":
|
|
123
|
+
self.dataframe_1d.rename(columns={'barcode': 'barcode_arrangement'}, inplace=True)
|
|
116
124
|
|
|
117
125
|
# Add missing categories
|
|
118
126
|
if 'barcode_arrangement' in self.dataframe_1d.columns:
|
|
119
|
-
#self.dataframe_1d['barcode_arrangement'].cat.add_categories([0, 'other barcodes', 'passes_filtering'],
|
|
120
|
-
# inplace=True)
|
|
121
127
|
self.dataframe_1d['barcode_arrangement'] = self.dataframe_1d['barcode_arrangement'].cat.add_categories(
|
|
122
128
|
[0, 'other barcodes', 'passes_filtering'])
|
|
123
129
|
if 'passes_filtering' not in self.dataframe_1d.columns:
|
|
@@ -283,21 +289,30 @@ class SequencingSummaryExtractor:
|
|
|
283
289
|
add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
|
|
284
290
|
|
|
285
291
|
if self.is_barcode:
|
|
292
|
+
if "barcode_alias" in self.config_dictionary:
|
|
293
|
+
barcode_alias = self.config_dictionary['barcode_alias']
|
|
294
|
+
else:
|
|
295
|
+
barcode_alias = None
|
|
296
|
+
|
|
286
297
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
287
298
|
self.barcode_selection,
|
|
288
|
-
self.images_directory
|
|
299
|
+
self.images_directory,
|
|
300
|
+
barcode_alias))
|
|
289
301
|
|
|
290
302
|
read_fail = self.dataframe_dict["read.fail.barcoded"]
|
|
291
303
|
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
|
|
292
304
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
|
|
293
305
|
self.barcode_selection,
|
|
294
|
-
self.images_directory
|
|
306
|
+
self.images_directory,
|
|
307
|
+
barcode_alias))
|
|
295
308
|
|
|
296
309
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
|
|
297
|
-
self.images_directory
|
|
310
|
+
self.images_directory,
|
|
311
|
+
barcode_alias))
|
|
298
312
|
|
|
299
313
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
|
|
300
|
-
self.images_directory
|
|
314
|
+
self.images_directory,
|
|
315
|
+
barcode_alias))
|
|
301
316
|
return images
|
|
302
317
|
|
|
303
318
|
|
|
@@ -327,12 +342,13 @@ class SequencingSummaryExtractor:
|
|
|
327
342
|
'duration': np.float32}
|
|
328
343
|
|
|
329
344
|
# If barcoding files are provided, merging of dataframes must be done on read_id column
|
|
330
|
-
|
|
345
|
+
if self.is_barcode:
|
|
346
|
+
barcoding_summary_columns = ['read_id', self.barcode_colname]
|
|
331
347
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
348
|
+
barcoding_summary_datatypes = {
|
|
349
|
+
'read_id': object,
|
|
350
|
+
self.barcode_colname: 'category'
|
|
351
|
+
}
|
|
336
352
|
|
|
337
353
|
try:
|
|
338
354
|
# If 1 file and it's a sequencing_summary.txt
|
|
@@ -341,9 +357,10 @@ class SequencingSummaryExtractor:
|
|
|
341
357
|
|
|
342
358
|
# If 1 file and it's a sequencing_summary.txt with barcode info, load column barcode_arrangement
|
|
343
359
|
elif len(files) == 1 and self._is_sequencing_summary_with_barcodes(files[0]):
|
|
344
|
-
|
|
360
|
+
if self.is_barcode:
|
|
361
|
+
sequencing_summary_columns.append(self.barcode_colname)
|
|
345
362
|
sequencing_summary_datatypes.update(
|
|
346
|
-
{
|
|
363
|
+
{self.barcode_colname: 'category'})
|
|
347
364
|
|
|
348
365
|
return pd_read_sequencing_summary(files[0], cols=sequencing_summary_columns, data_type=sequencing_summary_datatypes)
|
|
349
366
|
|
|
@@ -357,14 +374,14 @@ class SequencingSummaryExtractor:
|
|
|
357
374
|
barcode_dataframe = dataframe
|
|
358
375
|
# if a barcoding file has already been read, append the 2 dataframes
|
|
359
376
|
else:
|
|
360
|
-
barcode_dataframe =
|
|
361
|
-
dataframe, ignore_index=True)
|
|
377
|
+
barcode_dataframe = pd.concat([barcode_dataframe, dataframe], ignore_index=True)
|
|
362
378
|
|
|
363
|
-
# check for presence of sequencing_summary file with barcode info, if true load column
|
|
379
|
+
# check for presence of sequencing_summary file with barcode info, if true load barcode column and ignore barcoding files.
|
|
364
380
|
elif self._is_sequencing_summary_with_barcodes(f):
|
|
365
|
-
|
|
381
|
+
if self.is_barcode:
|
|
382
|
+
sequencing_summary_columns.append(self.barcode_colname)
|
|
366
383
|
sequencing_summary_datatypes.update(
|
|
367
|
-
{
|
|
384
|
+
{self.barcode_colname: 'category'})
|
|
368
385
|
sys.stderr.write('Warning: The sequencing summary file {} contains barcode information.'
|
|
369
386
|
' The barcoding summary files will be skipped.\n'.format(f))
|
|
370
387
|
return pd_read_sequencing_summary(f, cols=sequencing_summary_columns,
|
|
@@ -382,8 +399,7 @@ class SequencingSummaryExtractor:
|
|
|
382
399
|
if summary_dataframe is None:
|
|
383
400
|
summary_dataframe = dataframe
|
|
384
401
|
else:
|
|
385
|
-
summary_dataframe =
|
|
386
|
-
dataframe, ignore_index=True)
|
|
402
|
+
summary_dataframe = pd.concat([summary_dataframe,dataframe], ignore_index=True)
|
|
387
403
|
|
|
388
404
|
if barcode_dataframe is None:
|
|
389
405
|
# If no barcodes in files, no merged dataframes on column 'read_id'
|
|
@@ -392,20 +408,20 @@ class SequencingSummaryExtractor:
|
|
|
392
408
|
dataframes_merged = pd.merge(
|
|
393
409
|
summary_dataframe, barcode_dataframe, on='read_id', how='left')
|
|
394
410
|
|
|
395
|
-
missing_barcodes_count = dataframes_merged[
|
|
411
|
+
missing_barcodes_count = dataframes_merged[self.barcode_colname].isna().sum()
|
|
396
412
|
if missing_barcodes_count > 0:
|
|
397
413
|
sys.stderr.write('Warning: {} barcodes values are missing in sequencing summary file(s).'
|
|
398
414
|
' They will be marked as "unclassified".\n'.format(missing_barcodes_count))
|
|
399
415
|
|
|
400
416
|
# Replace missing barcodes values by 'unclassified'
|
|
401
|
-
dataframes_merged[
|
|
417
|
+
dataframes_merged[self.barcode_colname] = dataframes_merged[self.barcode_colname].fillna(
|
|
402
418
|
'unclassified')
|
|
403
419
|
|
|
404
420
|
# Delete column read_id after merging
|
|
405
421
|
del dataframes_merged['read_id']
|
|
406
422
|
|
|
407
423
|
# Set 'barcode_arrangement' column type as category
|
|
408
|
-
dataframes_merged[
|
|
424
|
+
dataframes_merged[self.barcode_colname] = dataframes_merged[self.barcode_colname].astype('category')
|
|
409
425
|
|
|
410
426
|
return dataframes_merged
|
|
411
427
|
|
|
@@ -420,7 +436,7 @@ class SequencingSummaryExtractor:
|
|
|
420
436
|
:return: True if the filename is a barcoding summary file
|
|
421
437
|
"""
|
|
422
438
|
header = read_first_line_file(filename)
|
|
423
|
-
return header.startswith('read_id') and 'barcode_arrangement'
|
|
439
|
+
return header.startswith('read_id') and any(col in header for col in ['barcode_arrangement', 'barcode'])
|
|
424
440
|
|
|
425
441
|
@staticmethod
|
|
426
442
|
def _is_sequencing_summary_file(filename):
|
|
@@ -430,7 +446,7 @@ class SequencingSummaryExtractor:
|
|
|
430
446
|
:return: True if the file is indeed a sequencing summary file
|
|
431
447
|
"""
|
|
432
448
|
header = read_first_line_file(filename)
|
|
433
|
-
return header.startswith('filename') and not 'barcode_arrangement'
|
|
449
|
+
return header.startswith('filename') and not any(col in header for col in ['barcode_arrangement', 'barcode'])
|
|
434
450
|
|
|
435
451
|
@staticmethod
|
|
436
452
|
def _is_sequencing_summary_with_barcodes(filename):
|
|
@@ -441,7 +457,18 @@ class SequencingSummaryExtractor:
|
|
|
441
457
|
:return: True if the filename is a sequencing summary file with barcodes
|
|
442
458
|
"""
|
|
443
459
|
header = read_first_line_file(filename)
|
|
444
|
-
return header.startswith('filename') and 'barcode_arrangement'
|
|
460
|
+
return header.startswith('filename') and any(col in header for col in ['barcode_arrangement', 'barcode'])
|
|
461
|
+
|
|
462
|
+
def _get_barcode_colname(self, filename):
|
|
463
|
+
"""
|
|
464
|
+
Check if the barcode colname in sequencing summary is "barcode_arrangement" or "barcode"
|
|
465
|
+
:param filename: path of the file to test
|
|
466
|
+
"""
|
|
467
|
+
header = read_first_line_file(filename)
|
|
468
|
+
if 'barcode_arrangement' in header:
|
|
469
|
+
self.barcode_colname = 'barcode_arrangement'
|
|
470
|
+
else :
|
|
471
|
+
self.barcode_colname = 'barcode'
|
|
445
472
|
|
|
446
473
|
|
|
447
474
|
|
|
@@ -115,7 +115,7 @@ class OneDSquareSequencingSummaryExtractor(SSE):
|
|
|
115
115
|
|
|
116
116
|
# Copy dataframe to avoid changing original df when dropping columns
|
|
117
117
|
dataframe_1d_copy = self.dataframe_1d.copy(deep=True)
|
|
118
|
-
dataframe_1d_copy.drop(columns=["sequence_length", "mean_qscore", "passes_filtering"]
|
|
118
|
+
dataframe_1d_copy = dataframe_1d_copy.drop(columns=["sequence_length", "mean_qscore", "passes_filtering"])
|
|
119
119
|
|
|
120
120
|
# Load dataframe_1dsqr df from 1D² files
|
|
121
121
|
self.dataframe_1dsqr = self._load_sequencing_summary_1dsqr_data()
|
|
@@ -123,7 +123,7 @@ class OneDSquareSequencingSummaryExtractor(SSE):
|
|
|
123
123
|
# Create duration column in dataframe_1dsqr
|
|
124
124
|
self.dataframe_1dsqr['duration'] = self.dataframe_1dsqr['trimmed_duration1'] + self.dataframe_1dsqr[
|
|
125
125
|
'trimmed_duration2'] # duration of the 2 strands sequenced
|
|
126
|
-
self.dataframe_1dsqr.drop(columns=['trimmed_duration1', 'trimmed_duration2']
|
|
126
|
+
self.dataframe_1dsqr = self.dataframe_1dsqr.drop(columns=['trimmed_duration1', 'trimmed_duration2'])
|
|
127
127
|
|
|
128
128
|
# dataframe_dicts
|
|
129
129
|
self.dataframe_dict_1dsqr = {}
|
|
@@ -398,8 +398,7 @@ class OneDSquareSequencingSummaryExtractor(SSE):
|
|
|
398
398
|
barcode_dataframe = dataframe
|
|
399
399
|
# if a barcoding file has already been read, append the 2 dataframes
|
|
400
400
|
else:
|
|
401
|
-
barcode_dataframe =
|
|
402
|
-
dataframe, ignore_index=True)
|
|
401
|
+
barcode_dataframe = pd.concat([barcode_dataframe, dataframe], ignore_index=True)
|
|
403
402
|
|
|
404
403
|
# check for presence of sequencing_summary file, if True add column read_id for merging with barcode dataframe
|
|
405
404
|
else:
|
|
@@ -416,15 +415,13 @@ class OneDSquareSequencingSummaryExtractor(SSE):
|
|
|
416
415
|
if summary_dataframe is None:
|
|
417
416
|
summary_dataframe = dataframe
|
|
418
417
|
else:
|
|
419
|
-
summary_dataframe =
|
|
420
|
-
dataframe, ignore_index=True)
|
|
418
|
+
summary_dataframe = pd.concat([summary_dataframe, dataframe], ignore_index=True)
|
|
421
419
|
|
|
422
420
|
if barcode_dataframe is None:
|
|
423
421
|
# If no barcodes in files, no merged dataframes on column 'read_id'
|
|
424
422
|
return summary_dataframe.drop(columns=['read_id1'])
|
|
425
423
|
else:
|
|
426
|
-
summary_dataframe.rename(columns={"read_id1": "read_id"}
|
|
427
|
-
inplace=True)
|
|
424
|
+
summary_dataframe = summary_dataframe.rename(columns={"read_id1": "read_id"})
|
|
428
425
|
dataframes_merged = pd.merge(summary_dataframe,
|
|
429
426
|
barcode_dataframe,
|
|
430
427
|
on='read_id',
|
|
@@ -436,10 +433,9 @@ class OneDSquareSequencingSummaryExtractor(SSE):
|
|
|
436
433
|
sys.stderr.write('Warning: {} barcodes values are missing in sequencing summary file(s).'
|
|
437
434
|
' They will be marked as "unclassified".\n'.format(missing_barcodes_count))
|
|
438
435
|
# Add missing categories
|
|
439
|
-
dataframes_merged['barcode_arrangement'].cat.add_categories([0, 'other barcodes', 'passes_filtering']
|
|
440
|
-
inplace=True)
|
|
436
|
+
dataframes_merged['barcode_arrangement'] = dataframes_merged['barcode_arrangement'].cat.add_categories([0, 'other barcodes', 'passes_filtering'])
|
|
441
437
|
if 'unclassified' not in dataframes_merged['barcode_arrangement'].cat.categories:
|
|
442
|
-
dataframes_merged['barcode_arrangement'].cat.add_categories(['unclassified']
|
|
438
|
+
dataframes_merged['barcode_arrangement'] = dataframes_merged['barcode_arrangement'].cat.add_categories(['unclassified'])
|
|
443
439
|
|
|
444
440
|
# Replace missing barcodes values by 'unclassified'
|
|
445
441
|
dataframes_merged['barcode_arrangement'] = dataframes_merged['barcode_arrangement'].fillna(
|
|
@@ -32,9 +32,6 @@
|
|
|
32
32
|
# 4. In the case of barcoded sequencing, it searches all barcodes from the command line argument --barcodes
|
|
33
33
|
# 5. It uses all the information collected to generate a qc in the form of a htl-report and a report.data file
|
|
34
34
|
|
|
35
|
-
import matplotlib
|
|
36
|
-
|
|
37
|
-
matplotlib.use('Agg')
|
|
38
35
|
import shutil
|
|
39
36
|
import sys
|
|
40
37
|
import re
|
|
@@ -42,6 +39,7 @@ import argparse
|
|
|
42
39
|
import os
|
|
43
40
|
import time
|
|
44
41
|
import datetime
|
|
42
|
+
import pandas as pd
|
|
45
43
|
|
|
46
44
|
import warnings
|
|
47
45
|
from toulligqc import toulligqc_info_extractor
|
|
@@ -97,6 +95,9 @@ def _parse_args(config_dictionary):
|
|
|
97
95
|
'can also be in SAM format')
|
|
98
96
|
|
|
99
97
|
# Add all optional arguments
|
|
98
|
+
optional.add_argument('-s', '--samplesheet', action='store', dest="samplesheet",
|
|
99
|
+
help='a samplesheet (.csv file) to fill out sample names in MinKNOW')
|
|
100
|
+
|
|
100
101
|
optional.add_argument("--thread", action='store', dest="thread", help="Number of threads", type=int, default=2)
|
|
101
102
|
optional.add_argument("--batch-size", action='store', dest="batch_size", help="Batch size", type=int, default=500)
|
|
102
103
|
optional.add_argument("--qscore-threshold", action='store', dest="threshold", help="Qscore threshold", type=int, default=9)
|
|
@@ -113,7 +114,7 @@ def _parse_args(config_dictionary):
|
|
|
113
114
|
optional.add_argument("-b", "--barcoding", action='store_true', dest='is_barcode', help="Option for barcode usage",
|
|
114
115
|
default=False)
|
|
115
116
|
optional.add_argument('-l', '--barcodes', action='store', default='', dest='barcodes',
|
|
116
|
-
help='
|
|
117
|
+
help='Comma-separated barcode list (e.g., BC05,RB09,NB01,barcode10) or a range separated with ":" (e.g., barcode01:barcode19)')
|
|
117
118
|
optional.add_argument("--quiet", action='store_true', dest='is_quiet', help="Quiet mode",
|
|
118
119
|
default=False)
|
|
119
120
|
optional.add_argument("--report-only", action='store_true', dest='report_only',
|
|
@@ -132,8 +133,8 @@ def _parse_args(config_dictionary):
|
|
|
132
133
|
is_barcode = args.is_barcode
|
|
133
134
|
barcodes = args.barcodes
|
|
134
135
|
|
|
135
|
-
# If a barcode list is provided, automatically add --barcoding argument
|
|
136
|
-
if len(barcodes) > 0:
|
|
136
|
+
# If a barcode list or samplesheet are is provided, automatically add --barcoding argument
|
|
137
|
+
if len(barcodes) > 0 or args.samplesheet:
|
|
137
138
|
is_barcode = True
|
|
138
139
|
|
|
139
140
|
# If no report_name specified, create default one : ToulligQC-report-YYYYMMDD_HHMMSS
|
|
@@ -150,6 +151,7 @@ def _parse_args(config_dictionary):
|
|
|
150
151
|
('sequencing_summary_source', _join_parameter_arguments(args.sequencing_summary_source)),
|
|
151
152
|
('sequencing_summary_1dsqr_source', _join_parameter_arguments(args.sequencing_summary_1dsqr_source)),
|
|
152
153
|
('sequencing_telemetry_source', args.telemetry_source),
|
|
154
|
+
('samplesheet', args.samplesheet),
|
|
153
155
|
('fastq', _join_parameter_arguments(args.fastq)),
|
|
154
156
|
('bam', _join_parameter_arguments(args.bam)),
|
|
155
157
|
('thread', args.thread),
|
|
@@ -235,7 +237,6 @@ def _check_conf(config_dictionary):
|
|
|
235
237
|
_check_if_file_exists(config_dictionary['html_report_path'], force)
|
|
236
238
|
_check_if_file_exists(config_dictionary['data_report_path'], force)
|
|
237
239
|
|
|
238
|
-
print(config_dictionary['html_report_path'])
|
|
239
240
|
|
|
240
241
|
|
|
241
242
|
def _check_if_dir_exists(dir, force):
|
|
@@ -323,6 +324,20 @@ def _create_extractor_list(config_dictionary):
|
|
|
323
324
|
return result
|
|
324
325
|
|
|
325
326
|
|
|
327
|
+
def parse_samplesheet(sample_sheet):
|
|
328
|
+
columns = ['flow_cell_id', 'experiment_id',
|
|
329
|
+
'flow_cell_product_code',
|
|
330
|
+
'kit',
|
|
331
|
+
'barcode',
|
|
332
|
+
'alias']
|
|
333
|
+
try:
|
|
334
|
+
samplesheet = pd.read_csv(sample_sheet, usecols=columns)
|
|
335
|
+
except IOError:
|
|
336
|
+
raise FileNotFoundError("Error while reading samplesheet file")
|
|
337
|
+
|
|
338
|
+
return samplesheet
|
|
339
|
+
|
|
340
|
+
|
|
326
341
|
def main():
|
|
327
342
|
"""
|
|
328
343
|
Main function creating graphs and statistics
|
|
@@ -360,12 +375,22 @@ def main():
|
|
|
360
375
|
if pattern:
|
|
361
376
|
barcode = 'barcode{}'.format(pattern.group(2))
|
|
362
377
|
barcode_set.add(barcode)
|
|
378
|
+
else:
|
|
379
|
+
sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b))
|
|
380
|
+
barcode_set.add(b)
|
|
363
381
|
|
|
364
382
|
barcode_selection = sorted(barcode_set)
|
|
365
383
|
|
|
366
384
|
if len(barcode_selection) == 0:
|
|
367
385
|
sys.exit("ERROR: No known barcode found in provided list of barcodes")
|
|
368
386
|
config_dictionary['barcode_selection'] = barcode_selection
|
|
387
|
+
|
|
388
|
+
elif 'samplesheet' in config_dictionary:
|
|
389
|
+
samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
|
|
390
|
+
config_dictionary['barcode_selection'] = list(samplesheet['barcode'])
|
|
391
|
+
config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values,
|
|
392
|
+
index=samplesheet.barcode).to_dict()
|
|
393
|
+
|
|
369
394
|
else:
|
|
370
395
|
config_dictionary['barcode_selection'] = ''
|
|
371
396
|
|
|
@@ -420,4 +445,4 @@ def main():
|
|
|
420
445
|
|
|
421
446
|
|
|
422
447
|
if __name__ == "__main__":
|
|
423
|
-
main()
|
|
448
|
+
main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '2.7'
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.7
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/GenomiqueENS/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
7
|
-
Author-email: toulligqc@
|
|
7
|
+
Author-email: toulligqc@bio.ens.psl.eu
|
|
8
8
|
License: GPL V3
|
|
9
9
|
Keywords: Nanopore MinION QC report
|
|
10
10
|
Platform: ALL
|
|
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = '2.6'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|