toulligqc 2.6__tar.gz → 2.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {toulligqc-2.6 → toulligqc-2.7.1}/PKG-INFO +4 -4
- {toulligqc-2.6 → toulligqc-2.7.1}/README.md +89 -22
- {toulligqc-2.6 → toulligqc-2.7.1}/setup.py +7 -7
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/bam_extractor.py +49 -34
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/common_statistics.py +2 -2
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/extractor_common.py +18 -5
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fastq_bam_common.py +17 -2
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fastq_extractor.py +48 -27
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/html_report_generator.py +2 -2
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_common.py +20 -8
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_generator.py +12 -8
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/pod5_extractor.py +1 -1
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_summary_extractor.py +53 -26
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_summary_onedsquare_extractor.py +7 -11
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/toulligqc.py +41 -12
- toulligqc-2.7.1/toulligqc/version.py +1 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/PKG-INFO +4 -4
- toulligqc-2.7.1/toulligqc.egg-info/requires.txt +11 -0
- toulligqc-2.6/toulligqc/version.py +0 -1
- toulligqc-2.6/toulligqc.egg-info/requires.txt +0 -10
- {toulligqc-2.6 → toulligqc-2.7.1}/AUTHORS +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/LICENSE-CeCILL.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/LICENSE.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/MANIFEST.in +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/setup.cfg +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/test/test_sequencing_summary_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/__init__.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/common.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/configuration.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fast5_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/report_data_file_generator.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/plotly-latest.min.js +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/toulligqc.css +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/toulligqc.png +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_telemetry_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/toulligqc_info_extractor.py +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/SOURCES.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/dependency_links.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/entry_points.txt +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/not-zip-safe +0 -0
- {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.7.1
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
|
-
Home-page: https://github.com/
|
|
5
|
+
Home-page: https://github.com/GenomiqueENS/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
7
|
-
Author-email: toulligqc@
|
|
7
|
+
Author-email: toulligqc@bio.ens.psl.eu
|
|
8
8
|
License: GPL V3
|
|
9
9
|
Keywords: Nanopore MinION QC report
|
|
10
10
|
Platform: ALL
|
|
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
@@ -24,6 +24,7 @@ Support is availlable on [GitHub issue page](https://github.com/GenomicParisCent
|
|
|
24
24
|
* 1.3 [Docker](#docker)
|
|
25
25
|
* [Docker image recovery](#docker-image-recovery)
|
|
26
26
|
* [Launching Docker image with docker run](#launching-Docker-image-with-docker-run)
|
|
27
|
+
* 1.4 [nf-core module](#nfcore-module)
|
|
27
28
|
|
|
28
29
|
* 2.[Usage](#usage)
|
|
29
30
|
* 2.1 [Command line](#command-line)
|
|
@@ -52,18 +53,31 @@ $ cd toulligqc && python3 setup.py build install
|
|
|
52
53
|
ToulligQC is written with Python 3.
|
|
53
54
|
To run ToulligQC without Docker, you need to install the following Python modules:
|
|
54
55
|
|
|
55
|
-
* matplotlib
|
|
56
|
-
* plotly
|
|
57
|
-
* h5py
|
|
56
|
+
* matplotlib
|
|
57
|
+
* plotly
|
|
58
|
+
* h5py
|
|
58
59
|
* pandas
|
|
59
60
|
* numpy
|
|
60
61
|
* scipy
|
|
61
62
|
* scikit-learn
|
|
62
63
|
* pysam
|
|
64
|
+
* tqdm
|
|
65
|
+
* pod5
|
|
63
66
|
|
|
67
|
+
<a name="Conda-environemnt"></a>
|
|
68
|
+
### 1.2 Conda environment
|
|
69
|
+
|
|
70
|
+
You can use a conda environment to install the required packages:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
git clone https://github.com/GenomicParisCentre/toulligQC.git
|
|
74
|
+
cd toulligqc && python3 setup.py build install
|
|
75
|
+
conda env create -f environment.yml
|
|
76
|
+
conda activate toulliqc
|
|
77
|
+
```
|
|
64
78
|
|
|
65
79
|
<a name="pypi-installation"></a>
|
|
66
|
-
### 1.
|
|
80
|
+
### 1.3 Using a PyPi package
|
|
67
81
|
|
|
68
82
|
ToulligQC can be more easily installed with a pip package available on the PyPI repository. The following command line will install the latest version of ToulligQC:
|
|
69
83
|
```bash
|
|
@@ -71,7 +85,7 @@ $ pip3 install toulligqc
|
|
|
71
85
|
```
|
|
72
86
|
|
|
73
87
|
<a name="docker"></a>
|
|
74
|
-
### 1.
|
|
88
|
+
### 1.4 Using Docker
|
|
75
89
|
ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (<https://docs.docker.com/engine/installation/>).
|
|
76
90
|
Even though Docker can run on Windows or macOS virtual machines, we recommend running ToulligQC on a Linux host.
|
|
77
91
|
<a name="docker-image-recovery"></a>
|
|
@@ -93,14 +107,25 @@ $ docker run -ti \
|
|
|
93
107
|
-v /path/to/basecaller/sequencing/summary/file:/path/to/basecaller/sequencing/summary/file \
|
|
94
108
|
-v /path/to/basecaller/sequencing/telemetry/file:/path/to/basecaller/telemetry/summary/file \
|
|
95
109
|
-v /path/to/result/directory:/path/to/result/directory \
|
|
96
|
-
toulligqc:latest
|
|
110
|
+
genomicpariscentre/toulligqc:latest
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
<a name="nfcore-module"></a>
|
|
114
|
+
### 1.5 Using the nf-core module
|
|
115
|
+
ToulligQC is also available on nf-core as a module written in nextflow. To install nf-core on your system, please visit their website (<https://nf-co.re/docs/usage/introduction>).
|
|
116
|
+
|
|
117
|
+
The following command line will install the latest version of the ToulligQC module:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
$ nf-core modules install toulligqc
|
|
97
121
|
```
|
|
122
|
+
|
|
98
123
|
<a name="usage"></a>
|
|
99
124
|
## 2. Usage
|
|
100
125
|
<a name="command-line"></a>
|
|
101
126
|
|
|
102
127
|
ToulligQC is adapted to RNA-Seq along with DNA-Seq and it is compatible with 1D² runs.
|
|
103
|
-
This QC tool supports only Guppy basecalling output files.
|
|
128
|
+
This QC tool supports only Guppy and Dorado basecalling output files.
|
|
104
129
|
It also needs a single FAST5 file (to catch the flowcell ID and the run date) if a telemetry file is not provided.
|
|
105
130
|
Flow cells and kits version are retrieved using the telemetry file.
|
|
106
131
|
ToulligQC can take barcoding samples by adding the barcode list as a command line option.
|
|
@@ -111,7 +136,7 @@ To do so, ToulligQC deals with different file formats: gz, tar.gz, bz2, tar.bz2
|
|
|
111
136
|
This tool will produce a set of graphs, a statistics file in plain text format and an HTML report.
|
|
112
137
|
|
|
113
138
|
|
|
114
|
-
To run ToulligQC you need the Guppy basecaller output files: ```sequencing_summary.txt``` and ```sequencing_telemetry.js```, or ```FASTQ``` or ```BAM``` files.
|
|
139
|
+
To run ToulligQC you need the Guppy/Dorado basecaller output files: ```sequencing_summary.txt``` and ```sequencing_telemetry.js```, or ```FASTQ``` or ```BAM``` files.
|
|
115
140
|
This can be compressed with gzip or bzip2.
|
|
116
141
|
You can use your initial Fast5 ONT file too.
|
|
117
142
|
ToulligQC can perform analyses on your data if the directory is organised as the following:
|
|
@@ -132,7 +157,7 @@ RUN_ID
|
|
|
132
157
|
└── sequencing_1dsq_summary.txt
|
|
133
158
|
```
|
|
134
159
|
|
|
135
|
-
For a barcoded run you can add the barcoding files generated by Guppy ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
160
|
+
For a barcoded run you can add the barcoding files generated by Guppy/ Dorado ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
|
|
136
161
|
|
|
137
162
|
For the barcode list to use in the command line options, ToulligQC handles the following naming schemes: BCXX, RBXX, NBXX and barcodeXX where XX is the number of the barcode.
|
|
138
163
|
The barcode naming schemes are case insensitive.
|
|
@@ -156,14 +181,16 @@ This is a directory for 1D² analysis with barcoding files:
|
|
|
156
181
|
|
|
157
182
|
General Options:
|
|
158
183
|
```
|
|
159
|
-
usage: ToulligQC V2.
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
184
|
+
usage: ToulligQC V2.6 [-a SEQUENCING_SUMMARY_SOURCE] [-t TELEMETRY_SOURCE]
|
|
185
|
+
[-f FAST5_SOURCE] [-p POD5_SOURCE] [-q FASTQ] [-u BAM]
|
|
186
|
+
[--thread THREAD] [--batch-size BATCH_SIZE] [--qscore-threshold THRESHOLD]
|
|
187
|
+
[-n REPORT_NAME] [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
|
|
188
|
+
[--data-report-path DATA_REPORT_PATH]
|
|
189
|
+
[--images-directory IMAGES_DIRECTORY]
|
|
190
|
+
[-d SEQUENCING_SUMMARY_1DSQR_SOURCE]
|
|
191
|
+
[-s SAMPLESHEET]
|
|
192
|
+
[-b] [-l BARCODES]
|
|
193
|
+
[--quiet] [--force] [-h] [--version]
|
|
167
194
|
|
|
168
195
|
required arguments:
|
|
169
196
|
-a SEQUENCING_SUMMARY_SOURCE, --sequencing-summary-source SEQUENCING_SUMMARY_SOURCE
|
|
@@ -175,6 +202,9 @@ required arguments:
|
|
|
175
202
|
-f FAST5_SOURCE, --fast5-source FAST5_SOURCE
|
|
176
203
|
Fast5 file source (necessary if no telemetry file),
|
|
177
204
|
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
205
|
+
-p POD5_SOURCE, --pod5-source POD5_SOURCE
|
|
206
|
+
pod5 file source (necessary if no telemetry file),
|
|
207
|
+
can also be in a tar.gz/tar.bz2 archive or a directory
|
|
178
208
|
-q FASTQ, --fastq FASTQ
|
|
179
209
|
FASTQ file (necessary if no sequencing summary file),
|
|
180
210
|
can also be in a .gz archive
|
|
@@ -183,6 +213,8 @@ required arguments:
|
|
|
183
213
|
can also be a SAM format
|
|
184
214
|
|
|
185
215
|
optional arguments:
|
|
216
|
+
-s SAMPLESHEET, --samplesheet SAMPLESHEET
|
|
217
|
+
Samplesheet (.csv file) to fill out sample names in MinKNOW.
|
|
186
218
|
-n REPORT_NAME, --report-name REPORT_NAME
|
|
187
219
|
Report name
|
|
188
220
|
--output-directory OUTPUT
|
|
@@ -197,8 +229,9 @@ optional arguments:
|
|
|
197
229
|
Basecaller 1dsq summary source
|
|
198
230
|
-b, --barcoding Option for barcode usage
|
|
199
231
|
-l BARCODES, --barcodes BARCODES
|
|
200
|
-
|
|
201
|
-
BC05,RB09,NB01,barcode10)
|
|
232
|
+
Comma-separated barcode list (e.g.,
|
|
233
|
+
BC05,RB09,NB01,barcode10) or a range separated with ':' (e.g.,
|
|
234
|
+
barcode01:barcode19)
|
|
202
235
|
--thread THREAD Number of threads for parsing FASTQ or BAM files (default: 2).
|
|
203
236
|
--batch-size BATCH_SIZE Batch size for each threads (default: 500).
|
|
204
237
|
--qscore-threshold THRESHOLD Q-score threshold to distinguish between passing filter and
|
|
@@ -213,7 +246,41 @@ optional arguments:
|
|
|
213
246
|
* #### Examples
|
|
214
247
|
|
|
215
248
|
|
|
216
|
-
|
|
249
|
+
* Sequencing summary alone \
|
|
250
|
+
Note that the flowcell ID and run date will be missing from the report, since they are found in the telemetry file or in a single fast5 file
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
$ toulligqc --report-name summary_only \
|
|
254
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
255
|
+
--html-report-path /path/to/output/report.html
|
|
256
|
+
```
|
|
257
|
+
* Sequencing summary + telemetry file
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
$ toulligqc --report-name summary_plus_telemetry \
|
|
261
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
262
|
+
--sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
|
|
263
|
+
--html-report-path /path/to/output/report.html
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
* Telemetry file + fast5 files
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
$ toulligqc --report-name telemetry_plus_fast5 \
|
|
270
|
+
--telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
|
|
271
|
+
--fast5-source /path/to/basecaller/output/fast5_files.fast5.gz \
|
|
272
|
+
--html-report-path /path/to/output/report.html
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
* Fastq/ bam files only
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
$ toulligqc --report-name FAF0256 \
|
|
279
|
+
--fastq /path/to/basecaller/output/fastq_files.fq.gz \ # (replace with --bam)
|
|
280
|
+
--html-report-path /path/to/output/report.html
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
* Optional arguments for 1D² analysis
|
|
217
284
|
|
|
218
285
|
```bash
|
|
219
286
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -223,7 +290,7 @@ $ toulligqc --report-name FAF0256 \
|
|
|
223
290
|
--html-report-path /path/to/output/report.html
|
|
224
291
|
```
|
|
225
292
|
|
|
226
|
-
|
|
293
|
+
* Optional arguments to deal with barcoded samples
|
|
227
294
|
|
|
228
295
|
```bash
|
|
229
296
|
$ toulligqc --report-name FAF0256 \
|
|
@@ -271,7 +338,7 @@ $ toulligqc \
|
|
|
271
338
|
--sequencing-summary-source sequencing_summary.txt \
|
|
272
339
|
--sequencing-summary-source barcoding_summary_pass.txt \
|
|
273
340
|
--sequencing-summary-source barcoding_summary_fail.txt \
|
|
274
|
-
--barcodes BC01
|
|
341
|
+
--barcodes BC01:BC07 \
|
|
275
342
|
--output-directory output
|
|
276
343
|
```
|
|
277
344
|
|
|
@@ -14,11 +14,11 @@ setup(
|
|
|
14
14
|
long_description='See project website for more information.',
|
|
15
15
|
|
|
16
16
|
# The project's main homepage.
|
|
17
|
-
url='https://github.com/
|
|
17
|
+
url='https://github.com/GenomiqueENS/toulligQC',
|
|
18
18
|
|
|
19
19
|
# Author details
|
|
20
20
|
author='Genomic Paris Centre team',
|
|
21
|
-
author_email='toulligqc@
|
|
21
|
+
author_email='toulligqc@bio.ens.psl.eu',
|
|
22
22
|
|
|
23
23
|
license='GPL V3',
|
|
24
24
|
platforms='ALL',
|
|
@@ -34,7 +34,7 @@ setup(
|
|
|
34
34
|
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
|
35
35
|
'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',
|
|
36
36
|
|
|
37
|
-
'Programming Language :: Python :: 3.
|
|
37
|
+
'Programming Language :: Python :: 3.12'
|
|
38
38
|
],
|
|
39
39
|
|
|
40
40
|
keywords='Nanopore MinION QC report',
|
|
@@ -46,10 +46,10 @@ setup(
|
|
|
46
46
|
include_package_data=True,
|
|
47
47
|
|
|
48
48
|
python_requires='>=3.11.0',
|
|
49
|
-
install_requires=['matplotlib>=3.6.3', 'plotly
|
|
50
|
-
'pandas>=1.
|
|
51
|
-
'scikit-learn>=1.
|
|
52
|
-
'pod5>=0.3.6'],
|
|
49
|
+
install_requires=['matplotlib>=3.6.3', 'plotly==5.15.0', 'h5py>=3.10.0',
|
|
50
|
+
'pandas>=2.1.4', 'numpy>=1.26.4', 'scipy>=1.11.4',
|
|
51
|
+
'scikit-learn>=1.4.1', 'tqdm>=4.66.2', 'pysam>=0.22.0',
|
|
52
|
+
'pod5>=0.3.10', 'ezcharts==0.7.6'],
|
|
53
53
|
|
|
54
54
|
entry_points={
|
|
55
55
|
'console_scripts': [
|
|
@@ -64,8 +64,9 @@ class uBAM_Extractor:
|
|
|
64
64
|
|
|
65
65
|
# Add missing categories
|
|
66
66
|
if 'barcode_arrangement' in self.dataframe.columns:
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
self.dataframe['barcode_arrangement'] = self.dataframe['barcode_arrangement'].cat.add_categories([0,
|
|
68
|
+
'other barcodes',
|
|
69
|
+
'passes_filtering'])
|
|
69
70
|
|
|
70
71
|
# Replace all NaN values by 0 to avoid data manipulation errors when columns are not the same length
|
|
71
72
|
self.dataframe = self.dataframe.fillna(0)
|
|
@@ -124,21 +125,29 @@ class uBAM_Extractor:
|
|
|
124
125
|
add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
|
|
125
126
|
add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
|
|
126
127
|
if self.is_barcode:
|
|
128
|
+
if "barcode_alias" in self.config_dictionary:
|
|
129
|
+
barcode_alias = self.config_dictionary['barcode_alias']
|
|
130
|
+
else:
|
|
131
|
+
barcode_alias = None
|
|
127
132
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
128
133
|
self.barcode_selection,
|
|
129
|
-
self.images_directory
|
|
134
|
+
self.images_directory,
|
|
135
|
+
barcode_alias))
|
|
130
136
|
|
|
131
137
|
read_fail = self.dataframe_dict["read.fail.barcoded"]
|
|
132
138
|
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
|
|
133
139
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
|
|
134
140
|
self.barcode_selection,
|
|
135
|
-
self.images_directory
|
|
141
|
+
self.images_directory,
|
|
142
|
+
barcode_alias))
|
|
136
143
|
|
|
137
144
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
|
|
138
|
-
self.images_directory
|
|
145
|
+
self.images_directory,
|
|
146
|
+
barcode_alias))
|
|
139
147
|
|
|
140
148
|
add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
|
|
141
|
-
self.images_directory
|
|
149
|
+
self.images_directory,
|
|
150
|
+
barcode_alias))
|
|
142
151
|
return images
|
|
143
152
|
|
|
144
153
|
|
|
@@ -271,8 +280,10 @@ class uBAM_Extractor:
|
|
|
271
280
|
"""
|
|
272
281
|
#def process_bam_chunk(bam_chunk):
|
|
273
282
|
rec_data = []
|
|
283
|
+
record_count = 0
|
|
274
284
|
for rec in uBAM_chunk:
|
|
275
|
-
|
|
285
|
+
record_count += 1
|
|
286
|
+
rec_dict = self._process_record(rec, record_count)
|
|
276
287
|
rec_data.append(rec_dict)
|
|
277
288
|
return rec_data
|
|
278
289
|
|
|
@@ -290,41 +301,45 @@ class uBAM_Extractor:
|
|
|
290
301
|
|
|
291
302
|
|
|
292
303
|
def _get_header(self):
|
|
293
|
-
|
|
294
|
-
header =
|
|
295
|
-
run_id, model_version_id =
|
|
304
|
+
sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
|
|
305
|
+
header = sam_file.header.to_dict()
|
|
306
|
+
run_id, model_version_id = extract_headerTag(header, 'RG','ID',
|
|
307
|
+
'Unknown_Unknown').split('_', 1)
|
|
296
308
|
self.header = {
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
309
|
+
"run_id": run_id,
|
|
310
|
+
"run_date": extract_headerTag(header, 'RG', 'DT', 'Unknown'),
|
|
311
|
+
"sample_id": extract_headerTag(header, 'RG', 'SM', 'Unknown'),
|
|
312
|
+
"basecaller": extract_headerTag(header, 'PG', 'PN', 'Unknown'),
|
|
313
|
+
"basecaller_version": extract_headerTag(header, 'PG', 'VN', 'Unknown'),
|
|
314
|
+
"model_version_id": model_version_id,
|
|
315
|
+
"flow_cell_id": extract_headerTag(header, 'RG', 'PU', 'Unknown')
|
|
304
316
|
}
|
|
305
317
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
def _process_record(self, rec):
|
|
318
|
+
|
|
319
|
+
def _process_record(self, rec, record_count):
|
|
309
320
|
"""
|
|
310
321
|
extract QC info from BAM record
|
|
311
322
|
return : dict of QC info
|
|
312
323
|
"""
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
324
|
+
fields = rec.split("\t")
|
|
325
|
+
|
|
326
|
+
# Parse optional fields
|
|
327
|
+
attributes = {}
|
|
328
|
+
for t in fields[11:]:
|
|
329
|
+
k, t, v = t.split(':', 2)
|
|
330
|
+
attributes[k] = v
|
|
331
|
+
|
|
332
|
+
iso_start_time = attributes.get('st', None)
|
|
333
|
+
qual = avg_qual(fields[10])
|
|
318
334
|
passes_filtering = True if qual > self.threshold_Qscore else False
|
|
319
335
|
data = [
|
|
320
|
-
len(
|
|
321
|
-
qual,
|
|
322
|
-
passes_filtering,
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
336
|
+
len(fields[9]), # read length
|
|
337
|
+
qual, # AVG Qscore
|
|
338
|
+
passes_filtering, # Passing filter
|
|
339
|
+
float(record_count) if iso_start_time is None else timeISO_to_float(iso_start_time, '%Y-%m-%dT%H:%M:%S.%f%z'), # start time
|
|
340
|
+
attributes.get('ch', '1'), # Channel
|
|
341
|
+
attributes.get('du', '1') # Duration
|
|
326
342
|
]
|
|
327
343
|
if self.is_barcode:
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
return data
|
|
344
|
+
data.append(attributes.get('BC', 'unclassified'))
|
|
345
|
+
return data
|
|
@@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x):
|
|
|
18
18
|
cum_sum = 0
|
|
19
19
|
count = 0
|
|
20
20
|
for v in data:
|
|
21
|
-
cum_sum += v
|
|
21
|
+
cum_sum += int(v)
|
|
22
22
|
count += 1
|
|
23
23
|
if cum_sum >= half_sum:
|
|
24
24
|
return count
|
|
@@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x):
|
|
|
31
31
|
half_sum = data.sum() * x / 100
|
|
32
32
|
cum_sum = 0
|
|
33
33
|
for v in data:
|
|
34
|
-
cum_sum += v
|
|
34
|
+
cum_sum += int(v)
|
|
35
35
|
if cum_sum >= half_sum:
|
|
36
36
|
return int(v)
|
|
37
37
|
|
|
@@ -164,16 +164,24 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
164
164
|
if "unclassified" not in barcode_selection:
|
|
165
165
|
barcode_selection.append("unclassified")
|
|
166
166
|
|
|
167
|
+
|
|
168
|
+
# If the barcode_arrangement column contains a barcode kit id
|
|
169
|
+
mask = df['barcode_arrangement'].str.startswith(('SQK', 'VQK'))
|
|
170
|
+
|
|
171
|
+
if mask.any():
|
|
172
|
+
df['barcode_arrangement'] = df['barcode_arrangement'].astype(str)
|
|
173
|
+
df.loc[mask, 'barcode_arrangement'] = df.loc[mask, 'barcode_arrangement'].str.extract(r'[SV]QK-.+_(.+)$')[0]
|
|
174
|
+
|
|
167
175
|
# Create keys barcode.arrangement, and read.pass/fail.barcode in dataframe_dict with all values of
|
|
168
176
|
# column barcode_arrangement when reads are passed/failed
|
|
169
|
-
dataframe_dict["barcode.arrangement"] = df[
|
|
177
|
+
dataframe_dict["barcode.arrangement"] = df['barcode_arrangement']
|
|
178
|
+
|
|
170
179
|
|
|
171
180
|
# Print warning message if a barcode is unknown
|
|
172
|
-
barcodes_found = set(
|
|
181
|
+
barcodes_found = set(df["barcode_arrangement"].unique())
|
|
173
182
|
for element in barcode_selection:
|
|
174
183
|
if element not in barcodes_found and element != 'other barcodes':
|
|
175
|
-
sys.stderr.write("
|
|
176
|
-
|
|
184
|
+
sys.stderr.write("\033[93mWarning:\033[0m The barcode {} doesn't exist in input data\n".format(element))
|
|
177
185
|
|
|
178
186
|
# Get barcodes frequency by Bases
|
|
179
187
|
df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
|
|
@@ -218,6 +226,7 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
|
|
|
218
226
|
(read_fail_barcoded_count / total_reads) * 100)
|
|
219
227
|
|
|
220
228
|
# Replaces all rows with unused barcodes (ie not in barcode_selection) in column barcode_arrangement with the 'other' value
|
|
229
|
+
|
|
221
230
|
df.loc[~df['barcode_arrangement'].isin(
|
|
222
231
|
barcode_selection), 'barcode_arrangement'] = 'other barcodes'
|
|
223
232
|
|
|
@@ -423,7 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image):
|
|
|
423
432
|
def timeISO_to_float(iso_datetime, format):
|
|
424
433
|
"""
|
|
425
434
|
"""
|
|
426
|
-
|
|
435
|
+
try:
|
|
436
|
+
dt = datetime.strptime(iso_datetime, format)
|
|
437
|
+
except:
|
|
438
|
+
format = '%Y-%m-%dT%H:%M:%SZ'
|
|
439
|
+
dt = datetime.strptime(iso_datetime, format)
|
|
427
440
|
unix_timestamp = dt.timestamp()
|
|
428
441
|
return unix_timestamp
|
|
429
442
|
|
|
@@ -2,8 +2,23 @@ import multiprocessing as mp
|
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
4
4
|
|
|
5
|
-
def extract_headerTag(header, tagGroup, tag):
|
|
6
|
-
|
|
5
|
+
def extract_headerTag(header, tagGroup, tag, defaultValue = None):
|
|
6
|
+
|
|
7
|
+
if tagGroup not in header:
|
|
8
|
+
if defaultValue is not None:
|
|
9
|
+
return defaultValue
|
|
10
|
+
else:
|
|
11
|
+
raise KeyError(tagGroup)
|
|
12
|
+
|
|
13
|
+
first_entry = header[tagGroup][0]
|
|
14
|
+
|
|
15
|
+
if tag not in first_entry:
|
|
16
|
+
if defaultValue is not None:
|
|
17
|
+
return defaultValue
|
|
18
|
+
else:
|
|
19
|
+
raise KeyError(tag)
|
|
20
|
+
|
|
21
|
+
return first_entry[tag]
|
|
7
22
|
|
|
8
23
|
|
|
9
24
|
def batch_iterator(iterator, batch_size):
|
|
@@ -64,8 +64,9 @@ class fastqExtractor:
|
|
|
64
64
|
|
|
65
65
|
# Add missing categories
|
|
66
66
|
if 'barcode_arrangement' in self.dataframe_1d.columns:
|
|
67
|
-
|
|
68
|
-
|
|
67
|
+
self.dataframe_1d['barcode_arrangement'] = self.dataframe_1d['barcode_arrangement'].cat.add_categories([0,
|
|
68
|
+
'other barcodes',
|
|
69
|
+
'passes_filtering'])
|
|
69
70
|
self.dataframe_1d = self.dataframe_1d.fillna(0)
|
|
70
71
|
self.barcode_selection = self.config_dictionary['barcode_selection']
|
|
71
72
|
|
|
@@ -118,32 +119,45 @@ class fastqExtractor:
|
|
|
118
119
|
|
|
119
120
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
|
|
120
121
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
|
|
122
|
+
|
|
121
123
|
if self.rich:
|
|
122
124
|
add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
|
|
123
125
|
add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
|
|
124
126
|
add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
|
|
127
|
+
|
|
125
128
|
if self.rich:
|
|
126
129
|
add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
|
|
127
130
|
add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
|
|
131
|
+
|
|
128
132
|
if self.rich:
|
|
129
133
|
add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
|
|
130
134
|
add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
|
|
131
|
-
if self.is_barcode:
|
|
132
|
-
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
133
|
-
self.barcode_selection,
|
|
134
|
-
self.images_directory))
|
|
135
135
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
136
|
+
if self.is_barcode:
|
|
137
|
+
if "barcode_alias" in self.config_dictionary:
|
|
138
|
+
barcode_alias = self.config_dictionary['barcode_alias']
|
|
139
|
+
else:
|
|
140
|
+
barcode_alias = None
|
|
141
|
+
|
|
142
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
|
|
143
|
+
self.barcode_selection,
|
|
144
|
+
self.images_directory,
|
|
145
|
+
barcode_alias))
|
|
146
|
+
|
|
147
|
+
read_fail = self.dataframe_dict["read.fail.barcoded"]
|
|
148
|
+
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
|
|
149
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
|
|
150
|
+
self.barcode_selection,
|
|
151
|
+
self.images_directory,
|
|
152
|
+
barcode_alias))
|
|
153
|
+
|
|
154
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
|
|
155
|
+
self.images_directory,
|
|
156
|
+
barcode_alias))
|
|
157
|
+
|
|
158
|
+
add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
|
|
159
|
+
self.images_directory,
|
|
160
|
+
barcode_alias))
|
|
147
161
|
return images
|
|
148
162
|
|
|
149
163
|
|
|
@@ -210,7 +224,7 @@ class fastqExtractor:
|
|
|
210
224
|
"pass.reads.sequence.length")
|
|
211
225
|
describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"],
|
|
212
226
|
"fail.reads.sequence.length")
|
|
213
|
-
if self.is_barcode:
|
|
227
|
+
if self.rich and self.is_barcode:
|
|
214
228
|
extract_barcode_info(self, result_dict,
|
|
215
229
|
self.barcode_selection,
|
|
216
230
|
self.dataframe_dict,
|
|
@@ -257,8 +271,9 @@ class fastqExtractor:
|
|
|
257
271
|
columns = ['sequence_length', 'mean_qscore', 'passes_filtering']
|
|
258
272
|
if self.rich:
|
|
259
273
|
columns.extend(['start_time', 'channel'])
|
|
260
|
-
|
|
261
|
-
|
|
274
|
+
|
|
275
|
+
if self.is_barcode:
|
|
276
|
+
columns.append('barcode_arrangement')
|
|
262
277
|
|
|
263
278
|
fq_df = pd.DataFrame(fq_df, columns=columns)
|
|
264
279
|
|
|
@@ -270,8 +285,10 @@ class fastqExtractor:
|
|
|
270
285
|
fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min()
|
|
271
286
|
fq_df['start_time'] = fq_df['start_time'].astype(np.float64)
|
|
272
287
|
fq_df['channel'] = fq_df['channel'].astype(np.int16)
|
|
273
|
-
|
|
274
|
-
|
|
288
|
+
|
|
289
|
+
if self.is_barcode:
|
|
290
|
+
fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
|
|
291
|
+
|
|
275
292
|
return fq_df
|
|
276
293
|
|
|
277
294
|
|
|
@@ -326,9 +343,10 @@ class fastqExtractor:
|
|
|
326
343
|
fastq_lines.append((len(read[1]), qscore, passes_filtering, start_time, ch))
|
|
327
344
|
else:
|
|
328
345
|
for read in read_batch:
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
346
|
+
if len(read)>0:
|
|
347
|
+
qscore = avg_qual(read)
|
|
348
|
+
passes_filtering = True if qscore > self.threshold_Qscore else False
|
|
349
|
+
fastq_lines.append((len(read), qscore, passes_filtering))
|
|
332
350
|
return fastq_lines
|
|
333
351
|
|
|
334
352
|
|
|
@@ -344,8 +362,11 @@ class fastqExtractor:
|
|
|
344
362
|
self.is_barcode = False
|
|
345
363
|
if 'model_version_id' not in metadata:
|
|
346
364
|
metadata['model_version_id'] = 'Unknow'
|
|
365
|
+
run_info = []
|
|
347
366
|
try:
|
|
348
|
-
|
|
367
|
+
sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid'
|
|
368
|
+
run_id = 'run_id' if 'run_id' in metadata else 'runid'
|
|
369
|
+
return metadata[run_id] , metadata[sample_id] , metadata['model_version_id']
|
|
349
370
|
except:
|
|
350
371
|
return None
|
|
351
372
|
|
|
@@ -354,7 +375,7 @@ class fastqExtractor:
|
|
|
354
375
|
"""
|
|
355
376
|
"""
|
|
356
377
|
metadata = dict(x.split("=") for x in name.split(" ")[1:])
|
|
357
|
-
start_time = timeISO_to_float(metadata['start_time'],
|
|
378
|
+
start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f%z')
|
|
358
379
|
if self.is_barcode:
|
|
359
380
|
return start_time, metadata['ch'], metadata['barcode']
|
|
360
381
|
return start_time, metadata['ch']
|