toulligqc 2.6.tar.gz → 2.7.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {toulligqc-2.6 → toulligqc-2.7.1}/PKG-INFO +4 -4
  2. {toulligqc-2.6 → toulligqc-2.7.1}/README.md +89 -22
  3. {toulligqc-2.6 → toulligqc-2.7.1}/setup.py +7 -7
  4. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/bam_extractor.py +49 -34
  5. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/common_statistics.py +2 -2
  6. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/extractor_common.py +18 -5
  7. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fastq_bam_common.py +17 -2
  8. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fastq_extractor.py +48 -27
  9. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/html_report_generator.py +2 -2
  10. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_common.py +20 -8
  11. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_generator.py +12 -8
  12. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/pod5_extractor.py +1 -1
  13. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_summary_extractor.py +53 -26
  14. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_summary_onedsquare_extractor.py +7 -11
  15. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/toulligqc.py +41 -12
  16. toulligqc-2.7.1/toulligqc/version.py +1 -0
  17. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/PKG-INFO +4 -4
  18. toulligqc-2.7.1/toulligqc.egg-info/requires.txt +11 -0
  19. toulligqc-2.6/toulligqc/version.py +0 -1
  20. toulligqc-2.6/toulligqc.egg-info/requires.txt +0 -10
  21. {toulligqc-2.6 → toulligqc-2.7.1}/AUTHORS +0 -0
  22. {toulligqc-2.6 → toulligqc-2.7.1}/LICENSE-CeCILL.txt +0 -0
  23. {toulligqc-2.6 → toulligqc-2.7.1}/LICENSE.txt +0 -0
  24. {toulligqc-2.6 → toulligqc-2.7.1}/MANIFEST.in +0 -0
  25. {toulligqc-2.6 → toulligqc-2.7.1}/setup.cfg +0 -0
  26. {toulligqc-2.6 → toulligqc-2.7.1}/test/test_sequencing_summary_extractor.py +0 -0
  27. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/__init__.py +0 -0
  28. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/common.py +0 -0
  29. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/configuration.py +0 -0
  30. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/fast5_extractor.py +0 -0
  31. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
  32. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/report_data_file_generator.py +0 -0
  33. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/plotly-latest.min.js +0 -0
  34. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/toulligqc.css +0 -0
  35. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/resources/toulligqc.png +0 -0
  36. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/sequencing_telemetry_extractor.py +0 -0
  37. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc/toulligqc_info_extractor.py +0 -0
  38. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/SOURCES.txt +0 -0
  39. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/dependency_links.txt +0 -0
  40. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/entry_points.txt +0 -0
  41. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/not-zip-safe +0 -0
  42. {toulligqc-2.6 → toulligqc-2.7.1}/toulligqc.egg-info/top_level.txt +0 -0
@@ -1,10 +1,10 @@
  Metadata-Version: 2.1
  Name: toulligqc
- Version: 2.6
+ Version: 2.7.1
  Summary: A post sequencing QC tool for Oxford Nanopore sequencers
- Home-page: https://github.com/GenomicParisCentre/toulligQC
+ Home-page: https://github.com/GenomiqueENS/toulligQC
  Author: Genomic Paris Centre team
- Author-email: toulligqc@biologie.ens.fr
+ Author-email: toulligqc@bio.ens.psl.eu
  License: GPL V3
  Keywords: Nanopore MinION QC report
  Platform: ALL
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
  Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
- Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
  Requires-Python: >=3.11.0
  License-File: LICENSE-CeCILL.txt
  License-File: LICENSE.txt
@@ -24,6 +24,7 @@ Support is availlable on [GitHub issue page](https://github.com/GenomicParisCent
  * 1.3 [Docker](#docker)
  * [Docker image recovery](#docker-image-recovery)
  * [Launching Docker image with docker run](#launching-Docker-image-with-docker-run)
+ * 1.4 [nf-core module](#nfcore-module)

  * 2.[Usage](#usage)
  * 2.1 [Command line](#command-line)
@@ -52,18 +53,31 @@ $ cd toulligqc && python3 setup.py build install
  ToulligQC is written with Python 3.
  To run ToulligQC without Docker, you need to install the following Python modules:

- * matplotlib
- * plotly
- * h5py
+ * matplotlib
+ * plotly
+ * h5py
  * pandas
  * numpy
  * scipy
  * scikit-learn
  * pysam
+ * tqdm
+ * pod5

+ <a name="Conda-environemnt"></a>
+ ### 1.2 Conda environemnt**
+
+ You can use a conda environment to install the required packages:
+
+ ```
+ git clone https://github.com/GenomicParisCentre/toulligQC.git
+ cd toulligqc && python3 setup.py build install
+ conda env create -f environment.yml
+ conda activate toulliqc
+ ```

  <a name="pypi-installation"></a>
- ### 1.2 Using a PyPi package
+ ### 1.3 Using a PyPi package

  ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line will install the latest version of ToulligQC:
  ```bash
@@ -71,7 +85,7 @@ $ pip3 install toulligqc
  ```

  <a name="docker"></a>
- ### 1.3 Using Docker
+ ### 1.4 Using Docker
  ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (<https://docs.docker.com/engine/installation/>).
  Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host.
  <a name="docker-image-recovery"></a>
@@ -93,14 +107,25 @@ $ docker run -ti \
  -v /path/to/basecaller/sequencing/summary/file:/path/to/basecaller/sequencing/summary/file \
  -v /path/to/basecaller/sequencing/telemetry/file:/path/to/basecaller/telemetry/summary/file \
  -v /path/to/result/directory:/path/to/result/directory \
- toulligqc:latest
+ genomicpariscentre/toulligqc:latest
+ ```
+
+ <a name="nfcore-module"></a>
+ ### 1.4 Using nf-core module
+ ToulligQC is also available on nf-core as a module written in nextflow. To install nf-core on your system, please visit their website (<https://nf-co.re/docs/usage/introduction>).
+
+ The following command line will install the latest version of the ToulligQC module:
+
+ ```bash
+ $ nf-core modules install toulligqc
  ```
+
  <a name="usage"></a>
  ## 2. Usage
  <a name="command-line"></a>

  ToulligQC is adapted to RNA-Seq along with DNA-Seq and it is compatible with 1D² runs.
- This QC tool supports only Guppy basecalling ouput files.
+ This QC tool supports only Guppy and Dorado basecalling ouput files.
  It also needs a single FAST5 file (to catch the flowcell ID and the run date) if a telemetry file is not provided.
  Flow cells and kits version are retrieved using the telemetry file.
  ToulligQC can take barcoding samples by adding the barcode list as a command line option.
@@ -111,7 +136,7 @@ To do so, ToulligQC deals with different file formats: gz, tar.gz, bz2, tar.bz2
  This tool will produce a set of graphs, statistic file in plain text format and a HTML report.


- To run ToulligQC you need the Guppy basecaller output files : ```sequencing_summary.txt``` and ```sequencing_telemetry.js```. or ```FASTQ``` or ```BAM```
+ To run ToulligQC you need the Guppy/ Dorado basecaller output files : ```sequencing_summary.txt``` and ```sequencing_telemetry.js```. or ```FASTQ``` or ```BAM```
  This can be compressed with gzip or bzip2.
  You can use your initial Fast5 ONT file too.
  ToulligQC can perform analyses on your data if the directory is organised as the following:
@@ -132,7 +157,7 @@ RUN_ID
  └── sequencing_1dsq_summary.txt
  ```

- For a barcoded run you can add the barcoding files generated by Guppy ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
+ For a barcoded run you can add the barcoding files generated by Guppy/ Dorado ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.

  For the barcode list to use in the command line options, ToulligQC handle the following naming schemes: BCXX, RBXX, NBXX and barcodeXX where XX is the number of the barcode.
  The barcode naming schemes are case insensitive.
@@ -156,14 +181,16 @@ This is a directory for 1D² analysis with barcoding files:

  General Options:
  ```
- usage: ToulligQC V2.2.1 -a SEQUENCING_SUMMARY_SOURCE [-t TELEMETRY_SOURCE]
- [--fastq -q FASTQ] [--bam -u BAM]
- [-f FAST5_SOURCE] [-n REPORT_NAME]
- [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
- [--data-report-path DATA_REPORT_PATH]
- [--images-directory IMAGES_DIRECTORY]
- [-d SEQUENCING_SUMMARY_1DSQR_SOURCE] [-b]
- [-l BARCODES] [--quiet] [--force] [-h] [--version]
+ usage: ToulligQC V2.6 [-a SEQUENCING_SUMMARY_SOURCE] [-t TELEMETRY_SOURCE]
+ [-f FAST5_SOURCE] [-p POD5_SOURCE] [-q FASTQ] [-u BAM]
+ [--thread THREAD] [--batch-size BATCH_SIZE] [--qscore-threshold THRESHOLD]
+ [-n REPORT_NAME] [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
+ [--data-report-path DATA_REPORT_PATH]
+ [--images-directory IMAGES_DIRECTORY]
+ [-d SEQUENCING_SUMMARY_1DSQR_SOURCE]
+ [-s SAMPLESHEET]
+ [-b] [-l BARCODES]
+ [--quiet] [--force] [-h] [--version]

  required arguments:
  -a SEQUENCING_SUMMARY_SOURCE, --sequencing-summary-source SEQUENCING_SUMMARY_SOURCE
@@ -175,6 +202,9 @@ required arguments:
  -f FAST5_SOURCE, --fast5-source FAST5_SOURCE
  Fast5 file source (necessary if no telemetry file),
  can also be in a tar.gz/tar.bz2 archive or a directory
+ -p POD5_SOURCE, --pod5-source POD5_SOURCE
+ pod5 file source (necessary if no telemetry file),
+ can also be in a tar.gz/tar.bz2 archive or a directory
  -q FASTQ, --fastq FASTQ
  FASTQ file (necessary if no sequencing summary file),
  can also be in a .gz archive
@@ -183,6 +213,8 @@ required arguments:
  can also be a SAM format

  optional arguments:
+ -s SAMPLESHEET, --samplesheet SAMPLESHEET
+ Samplesheet (.csv file) to fill out sample names in MinKNOW.
  -n REPORT_NAME, --report-name REPORT_NAME
  Report name
  --output-directory OUTPUT
@@ -197,8 +229,9 @@ optional arguments:
  Basecaller 1dsq summary source
  -b, --barcoding Option for barcode usage
  -l BARCODES, --barcodes BARCODES
- Coma separated barcode list (e.g.
- BC05,RB09,NB01,barcode10)
+ Comma-separated barcode list (e.g.,
+ BC05,RB09,NB01,barcode10) or a range separated with ':' (e.g.,
+ barcode01:barcode19)
  --thread THREAD Number of threads for parsing FASTQ or BAM files (default: 2).
  --batch-size BATCH_SIZE Batch size for each threads (default: 500).
  --qscore-threshold THRESHOLD Q-score threshold to distinguish between passing filter and
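The updated `--barcodes` help above accepts either a comma-separated list or a range written with `:` (e.g. `barcode01:barcode19`). Purely as an illustration of how such a range could be expanded into individual barcode names, here is a hypothetical helper (not ToulligQC's actual parser):

```python
import re

def expand_barcode_range(spec):
    """Expand 'barcode01:barcode19' into ['barcode01', ..., 'barcode19'];
    comma-separated lists are returned unchanged."""
    if ':' not in spec:
        return spec.split(',')
    start, end = spec.split(':')
    prefix = re.match(r'^\D+', start).group(0)   # e.g. 'barcode', 'BC', 'NB'
    first, last = int(start[len(prefix):]), int(end[len(prefix):])
    width = len(start) - len(prefix)             # keep the zero padding
    return [f"{prefix}{i:0{width}d}" for i in range(first, last + 1)]

print(expand_barcode_range("BC01:BC07"))                 # ['BC01', 'BC02', ..., 'BC07']
print(expand_barcode_range("BC05,RB09,NB01,barcode10"))  # plain list passed through
```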
@@ -213,7 +246,41 @@ optional arguments:
  * #### Examples


- Example with optional arguments:
+ * Sequencing summary alone \
+ Note that the fowcell ID and run date will be missing from report, found in telemetry file or single fast5 file
+
+ ```bash
+ $ toulligqc --report-name summary_only \
+ --sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
+ --html-report-path /path/to/output/report.html
+ ```
+ * Sequencing summary + telemetry file
+
+ ```bash
+ $ toulligqc --report-name summary_plus_telemetry \
+ --telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
+ --sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
+ --html-report-path /path/to/output/report.html
+ ```
+
+ * Telemetry file + fast5 files
+
+ ```bash
+ $ toulligqc --report-name telemetry_plus_fast5 \
+ --telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
+ --fast5-source /path/to/basecaller/output/fast5_files.fast5.gz \
+ --html-report-path /path/to/output/report.html
+ ```
+
+ * Fastq/ bam files only
+
+ ```bash
+ $ toulligqc --report-name FAF0256 \
+ --fastq /path/to/basecaller/output/fastq_files.fq.gz \ # (replace with --bam)
+ --html-report-path /path/to/output/report.html
+ ```
+
+ * Optional arguments for 1D² analysis

  ```bash
  $ toulligqc --report-name FAF0256 \
@@ -223,7 +290,7 @@ $ toulligqc --report-name FAF0256 \
  --html-report-path /path/to/output/report.html
  ```

- Example with optional arguments to deal with barcoded samples:
+ * Optional arguments to deal with barcoded samples

  ```bash
  $ toulligqc --report-name FAF0256 \
@@ -271,7 +338,7 @@ $ toulligqc \
  --sequencing-summary-source sequencing_summary.txt \
  --sequencing-summary-source barcoding_summary_pass.txt \
  --sequencing-summary-source barcoding_summary_fail.txt \
- --barcodes BC01,BC02,BC03,BC04,BC05,BC07 \
+ --barcodes BC01:BC07 \
  --output-directory output
  ```

@@ -14,11 +14,11 @@ setup(
  long_description='See project website for more information.',

  # The project's main homepage.
- url='https://github.com/GenomicParisCentre/toulligQC',
+ url='https://github.com/GenomiqueENS/toulligQC',

  # Author details
  author='Genomic Paris Centre team',
- author_email='toulligqc@biologie.ens.fr',
+ author_email='toulligqc@bio.ens.psl.eu',

  license='GPL V3',
  platforms='ALL',
@@ -34,7 +34,7 @@ setup(
  'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
  'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',

- 'Programming Language :: Python :: 3.11'
+ 'Programming Language :: Python :: 3.12'
  ],

  keywords='Nanopore MinION QC report',
@@ -46,10 +46,10 @@ setup(
  include_package_data=True,

  python_requires='>=3.11.0',
- install_requires=['matplotlib>=3.6.3', 'plotly>=5.15.0', 'h5py>=3.7.0',
- 'pandas>=1.5.3', 'numpy>=1.24.2', 'scipy>=1.10.1',
- 'scikit-learn>=1.2.1', 'tqdm>=4.64.1', 'pysam>=0.21.0',
- 'pod5>=0.3.6'],
+ install_requires=['matplotlib>=3.6.3', 'plotly==5.15.0', 'h5py>=3.10.0',
+ 'pandas>=2.1.4', 'numpy>=1.26.4', 'scipy>=1.11.4',
+ 'scikit-learn>=1.4.1', 'tqdm>=4.66.2', 'pysam>=0.22.0',
+ 'pod5>=0.3.10', 'ezcharts==0.7.6'],

  entry_points={
  'console_scripts': [
@@ -64,8 +64,9 @@ class uBAM_Extractor:

  # Add missing categories
  if 'barcode_arrangement' in self.dataframe.columns:
- self.dataframe['barcode_arrangement'].cat.add_categories([0, 'other barcodes', 'passes_filtering'],
- inplace=True)
+ self.dataframe['barcode_arrangement'] = self.dataframe['barcode_arrangement'].cat.add_categories([0,
+ 'other barcodes',
+ 'passes_filtering'])

  # Replace all NaN values by 0 to avoid data manipulation errors when columns are not the same length
  self.dataframe = self.dataframe.fillna(0)
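This hunk, like the matching one in `fastq_extractor.py` further down, replaces the `inplace=True` form of `Series.cat.add_categories`, which pandas 2.x no longer accepts, with reassignment of the returned Series. A minimal standalone sketch of the same pattern:

```python
import pandas as pd

s = pd.Series(["barcode01", "barcode02"], dtype="category")

# Old style, removed in pandas 2.x:
#   s.cat.add_categories(["other barcodes"], inplace=True)

# New style: add_categories returns a new Series, so reassign it.
s = s.cat.add_categories([0, "other barcodes", "passes_filtering"])
print(s.cat.categories.tolist())
```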
@@ -124,21 +125,29 @@ class uBAM_Extractor:
  add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
  if self.is_barcode:
+ if "barcode_alias" in self.config_dictionary:
+ barcode_alias = self.config_dictionary['barcode_alias']
+ else:
+ barcode_alias = None
  add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
  self.barcode_selection,
- self.images_directory))
+ self.images_directory,
+ barcode_alias))

  read_fail = self.dataframe_dict["read.fail.barcoded"]
  if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
  add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
  self.barcode_selection,
- self.images_directory))
+ self.images_directory,
+ barcode_alias))

  add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
- self.images_directory))
+ self.images_directory,
+ barcode_alias))

  add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
- self.images_directory))
+ self.images_directory,
+ barcode_alias))
  return images


@@ -271,8 +280,10 @@ class uBAM_Extractor:
  """
  #def process_bam_chunk(bam_chunk):
  rec_data = []
+ record_count = 0
  for rec in uBAM_chunk:
- rec_dict = self._process_record(rec)
+ record_count += 1
+ rec_dict = self._process_record(rec, record_count)
  rec_data.append(rec_dict)
  return rec_data

@@ -290,41 +301,45 @@ class uBAM_Extractor:


  def _get_header(self):
- samfile = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
- header = samfile.header.to_dict()
- run_id, model_version_id = extract_headerTag(header,'RG','ID').split('_', 1)
+ sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
+ header = sam_file.header.to_dict()
+ run_id, model_version_id = extract_headerTag(header, 'RG','ID',
+ 'Unknown_Unknown').split('_', 1)
  self.header = {
- "run_id" : run_id,
- "run_date" : extract_headerTag(header, 'RG', 'DT'),
- "sample_id" : extract_headerTag(header,'RG','SM'),
- "basecaller" : extract_headerTag(header,'PG','PN'),
- "basecaller_version" : extract_headerTag(header,'PG','VN'),
- "model_version_id" : model_version_id,
- "flow_cell_id" : extract_headerTag(header,'RG','PU')
+ "run_id": run_id,
+ "run_date": extract_headerTag(header, 'RG', 'DT', 'Unknown'),
+ "sample_id": extract_headerTag(header, 'RG', 'SM', 'Unknown'),
+ "basecaller": extract_headerTag(header, 'PG', 'PN', 'Unknown'),
+ "basecaller_version": extract_headerTag(header, 'PG', 'VN', 'Unknown'),
+ "model_version_id": model_version_id,
+ "flow_cell_id": extract_headerTag(header, 'RG', 'PU', 'Unknown')
  }

-
-
- def _process_record(self, rec):
+
+ def _process_record(self, rec, record_count):
  """
  extract QC info from BAM record
  return : dict of QC info
  """
- tags = rec.split("\t")
- tag_dict = defaultdict(lambda:'unclassified')
- tag_dict.update({key : value for key,_, value in [item.split(':',2) for item in tags[11:]]})
- start_time = timeISO_to_float(tag_dict['st'], '%Y-%m-%dT%H:%M:%S.%f%z')
- qual = avg_qual(tags[10])
+ fields = rec.split("\t")
+
+ # Parse optional fields
+ attributes = {}
+ for t in fields[11:]:
+ k, t, v = t.split(':', 2)
+ attributes[k] = v
+
+ iso_start_time = attributes.get('st', None)
+ qual = avg_qual(fields[10])
  passes_filtering = True if qual > self.threshold_Qscore else False
  data = [
- len(tags[9]),
- qual,
- passes_filtering,
- start_time,
- tag_dict['ch'],
- tag_dict['du']
+ len(fields[9]), # read length
+ qual, # AVG Qscore
+ passes_filtering, # Passing filter
+ float(record_count) if iso_start_time is None else timeISO_to_float(iso_start_time, '%Y-%m-%dT%H:%M:%S.%f%z'), # start time
+ attributes.get('ch', '1'), # Channel
+ attributes.get('du', '1') # Duration
  ]
  if self.is_barcode:
- bc = tag_dict['BC'].split('_')[-1]
- data.append(bc)
- return data
+ data.append(attributes.get('BC', 'unclassified'))
+ return data
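The rewritten `_process_record` splits the raw record on tabs and collects its optional `TAG:TYPE:VALUE` fields (`st`, `ch`, `du`, `BC`, ...) into a plain dict so that missing tags can fall back to defaults. An illustrative, self-contained version of just that parsing step (the example record below is invented):

```python
def parse_optional_fields(sam_record):
    """Return the optional TAG:TYPE:VALUE fields of a tab-separated SAM record as a dict."""
    fields = sam_record.split("\t")
    attributes = {}
    for item in fields[11:]:            # columns 12 and onwards are optional tags
        tag, _type, value = item.split(":", 2)
        attributes[tag] = value
    return attributes

record = ("read1\t4\t*\t0\t0\t*\t*\t0\t0\tACGT\t!!!!\t"
          "st:Z:2024-01-01T10:00:00.000+00:00\tch:i:512\tBC:Z:SQK-NBD114-24_barcode05")
attrs = parse_optional_fields(record)
print(attrs.get("ch", "1"), attrs.get("BC", "unclassified"))  # 512 SQK-NBD114-24_barcode05
```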
@@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x):
  cum_sum = 0
  count = 0
  for v in data:
- cum_sum += v
+ cum_sum += int(v)
  count += 1
  if cum_sum >= half_sum:
  return count
@@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x):
  half_sum = data.sum() * x / 100
  cum_sum = 0
  for v in data:
- cum_sum += v
+ cum_sum += int(v)
  if cum_sum >= half_sum:
  return int(v)
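Both `compute_LXX` and `compute_NXX` now cast each length to `int` before accumulating. For context, NXX (e.g. N50) is the read length such that reads at least that long contain XX% of the sequenced bases, and LXX is the number of reads needed to reach that point. A small self-contained sketch of one common way to compute both (illustrative only; ToulligQC's own functions work on its internal `dataframe_dict`):

```python
def compute_nxx_lxx(read_lengths, x=50):
    """Return (NXX, LXX) for a list of read lengths, e.g. the N50 and L50 of a run."""
    data = sorted((int(v) for v in read_lengths), reverse=True)
    half_sum = sum(data) * x / 100
    cum_sum = 0
    for count, v in enumerate(data, start=1):
        cum_sum += v
        if cum_sum >= half_sum:
            return v, count

print(compute_nxx_lxx([8, 8, 4, 3, 3, 2, 2, 2]))  # (8, 2): the two longest reads hold >= 50% of bases
```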

@@ -164,16 +164,24 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
  if "unclassified" not in barcode_selection:
  barcode_selection.append("unclassified")

+
+ # If the barcode_arrangement column contains a barcode kit id
+ mask = df['barcode_arrangement'].str.startswith(('SQK', 'VQK'))
+
+ if mask.any():
+ df['barcode_arrangement'] = df['barcode_arrangement'].astype(str)
+ df.loc[mask, 'barcode_arrangement'] = df.loc[mask, 'barcode_arrangement'].str.extract(r'[SV]QK-.+_(.+)$')[0]
+
  # Create keys barcode.arrangement, and read.pass/fail.barcode in dataframe_dict with all values of
  # column barcode_arrangement when reads are passed/failed
- dataframe_dict["barcode.arrangement"] = df["barcode_arrangement"]
+ dataframe_dict["barcode.arrangement"] = df['barcode_arrangement']
+

  # Print warning message if a barcode is unknown
- barcodes_found = set(dataframe_dict["barcode.arrangement"].unique())
+ barcodes_found = set(df["barcode_arrangement"].unique())
  for element in barcode_selection:
  if element not in barcodes_found and element != 'other barcodes':
- sys.stderr.write("Warning: The barcode {} doesn't exist in input data\n".format(element))
-
+ sys.stderr.write("\033[93mWarning:\033[0m The barcode {} doesn't exist in input data\n".format(element))

  # Get barcodes frequency by Bases
  df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
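The new block strips a leading barcode-kit identifier (values starting with `SQK` or `VQK`) from the `barcode_arrangement` column, keeping only the barcode name captured by `[SV]QK-.+_(.+)$`. The same extraction on a toy Series, as a quick sanity check (the kit name below is only an example):

```python
import pandas as pd

values = pd.Series(["SQK-NBD114-24_barcode05", "barcode12", "unclassified"])

# Rows whose value starts with a kit id keep only the part after the last underscore.
mask = values.str.startswith(("SQK", "VQK"))
cleaned = values.copy()
cleaned[mask] = values[mask].str.extract(r"[SV]QK-.+_(.+)$")[0]
print(cleaned.tolist())  # ['barcode05', 'barcode12', 'unclassified']
```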
@@ -218,6 +226,7 @@ extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
  (read_fail_barcoded_count / total_reads) * 100)

  # Replaces all rows with unused barcodes (ie not in barcode_selection) in column barcode_arrangement with the 'other' value
+
  df.loc[~df['barcode_arrangement'].isin(
  barcode_selection), 'barcode_arrangement'] = 'other barcodes'

@@ -423,7 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image):
  def timeISO_to_float(iso_datetime, format):
  """
  """
- dt = datetime.strptime(iso_datetime, format)
+ try:
+ dt = datetime.strptime(iso_datetime, format)
+ except:
+ format = '%Y-%m-%dT%H:%M:%SZ'
+ dt = datetime.strptime(iso_datetime, format)
  unix_timestamp = dt.timestamp()
  return unix_timestamp
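The patched `timeISO_to_float` retries with `'%Y-%m-%dT%H:%M:%SZ'` when the first `strptime` call fails, so both timestamp styles found in Guppy and Dorado output are accepted. A standalone sketch of that fallback (function renamed here; the example timestamps are invented):

```python
from datetime import datetime

def time_iso_to_float(iso_datetime, fmt='%Y-%m-%dT%H:%M:%S.%f%z'):
    """Convert an ISO-8601 timestamp to a Unix timestamp, trying a second format on failure."""
    try:
        dt = datetime.strptime(iso_datetime, fmt)
    except ValueError:
        dt = datetime.strptime(iso_datetime, '%Y-%m-%dT%H:%M:%SZ')
    return dt.timestamp()

print(time_iso_to_float('2024-01-01T10:00:00.000+00:00'))  # parsed with the primary format
print(time_iso_to_float('2024-01-01T10:00:00Z'))           # falls back to the 'Z' format
```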

@@ -2,8 +2,23 @@ import multiprocessing as mp
  from tqdm import tqdm
  from concurrent.futures import ProcessPoolExecutor, as_completed

- def extract_headerTag(header, tagGroup, tag):
- return header[tagGroup][0][tag]
+ def extract_headerTag(header, tagGroup, tag, defaultValue = None):
+
+ if tagGroup not in header:
+ if defaultValue is not None:
+ return defaultValue
+ else:
+ raise KeyError(tagGroup)
+
+ first_entry = header[tagGroup][0]
+
+ if tag not in first_entry:
+ if defaultValue is not None:
+ return defaultValue
+ else:
+ raise KeyError(tag)
+
+ return first_entry[tag]


  def batch_iterator(iterator, batch_size):
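The extended `extract_headerTag` returns a caller-supplied default instead of raising `KeyError` when a read-group or program tag is missing from the BAM header. A small usage sketch against a header dict shaped like `pysam`'s `AlignmentHeader.to_dict()` output (the values below are made up, and the helper is re-implemented here purely for illustration):

```python
# Header dict in the shape returned by pysam.AlignmentFile(...).header.to_dict()
header = {
    "RG": [{"ID": "run123_model-v4.2", "PU": "FAT12345"}],   # no 'DT' tag on purpose
    "PG": [{"PN": "dorado", "VN": "0.5.3"}],
}

def extract_header_tag(header, tag_group, tag, default_value=None):
    """Return header[tag_group][0][tag], or default_value when the group/tag is absent."""
    entries = header.get(tag_group)
    if not entries or tag not in entries[0]:
        if default_value is not None:
            return default_value
        raise KeyError(tag_group if not entries else tag)
    return entries[0][tag]

print(extract_header_tag(header, "RG", "PU", "Unknown"))  # FAT12345
print(extract_header_tag(header, "RG", "DT", "Unknown"))  # Unknown (missing tag)
```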
@@ -64,8 +64,9 @@ class fastqExtractor:

  # Add missing categories
  if 'barcode_arrangement' in self.dataframe_1d.columns:
- self.dataframe_1d['barcode_arrangement'].cat.add_categories([0, 'other barcodes', 'passes_filtering'],
- inplace=True)
+ self.dataframe_1d['barcode_arrangement'] = self.dataframe_1d['barcode_arrangement'].cat.add_categories([0,
+ 'other barcodes',
+ 'passes_filtering'])
  self.dataframe_1d = self.dataframe_1d.fillna(0)
  self.barcode_selection = self.config_dictionary['barcode_selection']

@@ -118,32 +119,45 @@ class fastqExtractor:

  add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
+
  if self.rich:
  add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
+
  if self.rich:
  add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
+
  if self.rich:
  add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
  add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
- if self.is_barcode:
- add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
- self.barcode_selection,
- self.images_directory))

- read_fail = self.dataframe_dict["read.fail.barcoded"]
- if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
- add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
- self.barcode_selection,
- self.images_directory))
-
- add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
- self.images_directory))
-
- add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
- self.images_directory))
+ if self.is_barcode:
+ if "barcode_alias" in self.config_dictionary:
+ barcode_alias = self.config_dictionary['barcode_alias']
+ else:
+ barcode_alias = None
+
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
+ self.barcode_selection,
+ self.images_directory,
+ barcode_alias))
+
+ read_fail = self.dataframe_dict["read.fail.barcoded"]
+ if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
+ self.barcode_selection,
+ self.images_directory,
+ barcode_alias))
+
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
+ self.images_directory,
+ barcode_alias))
+
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
+ self.images_directory,
+ barcode_alias))
  return images


@@ -210,7 +224,7 @@ class fastqExtractor:
  "pass.reads.sequence.length")
  describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"],
  "fail.reads.sequence.length")
- if self.is_barcode:
+ if self.rich and self.is_barcode:
  extract_barcode_info(self, result_dict,
  self.barcode_selection,
  self.dataframe_dict,
@@ -257,8 +271,9 @@ class fastqExtractor:
  columns = ['sequence_length', 'mean_qscore', 'passes_filtering']
  if self.rich:
  columns.extend(['start_time', 'channel'])
- if self.is_barcode:
- columns.append('barcode_arrangement')
+
+ if self.is_barcode:
+ columns.append('barcode_arrangement')

  fq_df = pd.DataFrame(fq_df, columns=columns)

@@ -270,8 +285,10 @@ class fastqExtractor:
  fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min()
  fq_df['start_time'] = fq_df['start_time'].astype(np.float64)
  fq_df['channel'] = fq_df['channel'].astype(np.int16)
- if self.is_barcode:
- fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
+ if self.is_barcode:
+ fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
  return fq_df


@@ -326,9 +343,10 @@ class fastqExtractor:
  fastq_lines.append((len(read[1]), qscore, passes_filtering, start_time, ch))
  else:
  for read in read_batch:
- qscore = avg_qual(read)
- passes_filtering = True if qscore > self.threshold_Qscore else False
- fastq_lines.append((len(read), qscore, passes_filtering))
+ if len(read)>0:
+ qscore = avg_qual(read)
+ passes_filtering = True if qscore > self.threshold_Qscore else False
+ fastq_lines.append((len(read), qscore, passes_filtering))
  return fastq_lines
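The added `if len(read)>0` guard skips empty reads before calling `avg_qual` and comparing the result to the Q-score threshold. As background, the mean quality of a Nanopore read is conventionally computed by averaging per-base error probabilities and converting back to the Phred scale rather than averaging Q values directly; a hedged sketch of that calculation (ToulligQC's own `avg_qual` may differ in detail, and the threshold below is only an example):

```python
import math

def mean_phred_quality(quality_string, phred_offset=33):
    """Mean Phred quality of a FASTQ quality string, averaged in probability space."""
    if not quality_string:
        return 0.0
    error_probs = [10 ** -((ord(c) - phred_offset) / 10) for c in quality_string]
    return -10 * math.log10(sum(error_probs) / len(error_probs))

qscore = mean_phred_quality("IIII####")     # mix of Q40 and Q2 bases
passes_filtering = qscore > 9               # e.g. a Q-score threshold of 9
print(round(qscore, 2), passes_filtering)
```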


@@ -344,8 +362,11 @@ class fastqExtractor:
  self.is_barcode = False
  if 'model_version_id' not in metadata:
  metadata['model_version_id'] = 'Unknow'
+ run_info = []
  try:
- return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id']
+ sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid'
+ run_id = 'run_id' if 'run_id' in metadata else 'runid'
+ return metadata[run_id] , metadata[sample_id] , metadata['model_version_id']
  except:
  return None

@@ -354,7 +375,7 @@ class fastqExtractor:
  """
  """
  metadata = dict(x.split("=") for x in name.split(" ")[1:])
- start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
+ start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f%z')
  if self.is_barcode:
  return start_time, metadata['ch'], metadata['barcode']
  return start_time, metadata['ch']
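This last hunk changes the `start_time` format handed to `timeISO_to_float` when parsing the `key=value` metadata that Guppy and Dorado write into FASTQ header lines. A standalone sketch of that `dict(x.split("="))` parsing on an invented header:

```python
# Example FASTQ header line: read ID followed by space-separated key=value pairs.
name = ("@0f1a2b3c-read runid=abc123 sampleid=sample1 read=42 ch=311 "
        "start_time=2024-01-01T10:00:00.000+00:00 model_version_id=dna_r10.4.1_e8.2_400bps_sup")

# Skip the read ID, then split each 'key=value' token.
metadata = dict(x.split("=") for x in name.split(" ")[1:])
print(metadata["ch"], metadata["start_time"])
```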