toulligqc 2.5.7__tar.gz → 2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {toulligqc-2.5.7 → toulligqc-2.7}/PKG-INFO +4 -4
  2. {toulligqc-2.5.7 → toulligqc-2.7}/README.md +71 -17
  3. {toulligqc-2.5.7 → toulligqc-2.7}/setup.py +7 -7
  4. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/bam_extractor.py +81 -38
  5. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/common.py +28 -0
  6. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/extractor_common.py +26 -5
  7. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fast5_extractor.py +54 -77
  8. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fastq_extractor.py +67 -19
  9. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/html_report_generator.py +2 -2
  10. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_common.py +8 -3
  11. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_generator.py +12 -8
  12. toulligqc-2.7/toulligqc/pod5_extractor.py +234 -0
  13. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_summary_extractor.py +66 -34
  14. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_summary_onedsquare_extractor.py +7 -11
  15. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/toulligqc.py +42 -8
  16. toulligqc-2.7/toulligqc/version.py +1 -0
  17. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/PKG-INFO +4 -4
  18. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/SOURCES.txt +1 -0
  19. toulligqc-2.7/toulligqc.egg-info/requires.txt +11 -0
  20. toulligqc-2.5.7/toulligqc/version.py +0 -1
  21. toulligqc-2.5.7/toulligqc.egg-info/requires.txt +0 -10
  22. {toulligqc-2.5.7 → toulligqc-2.7}/AUTHORS +0 -0
  23. {toulligqc-2.5.7 → toulligqc-2.7}/LICENSE-CeCILL.txt +0 -0
  24. {toulligqc-2.5.7 → toulligqc-2.7}/LICENSE.txt +0 -0
  25. {toulligqc-2.5.7 → toulligqc-2.7}/MANIFEST.in +0 -0
  26. {toulligqc-2.5.7 → toulligqc-2.7}/setup.cfg +0 -0
  27. {toulligqc-2.5.7 → toulligqc-2.7}/test/test_sequencing_summary_extractor.py +0 -0
  28. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/__init__.py +0 -0
  29. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/common_statistics.py +0 -0
  30. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/configuration.py +0 -0
  31. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/fastq_bam_common.py +0 -0
  32. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
  33. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/report_data_file_generator.py +0 -0
  34. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/plotly-latest.min.js +0 -0
  35. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/toulligqc.css +0 -0
  36. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/resources/toulligqc.png +0 -0
  37. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/sequencing_telemetry_extractor.py +0 -0
  38. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc/toulligqc_info_extractor.py +0 -0
  39. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/dependency_links.txt +0 -0
  40. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/entry_points.txt +0 -0
  41. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/not-zip-safe +0 -0
  42. {toulligqc-2.5.7 → toulligqc-2.7}/toulligqc.egg-info/top_level.txt +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: toulligqc
3
- Version: 2.5.7
3
+ Version: 2.7
4
4
  Summary: A post sequencing QC tool for Oxford Nanopore sequencers
5
- Home-page: https://github.com/GenomicParisCentre/toulligQC
5
+ Home-page: https://github.com/GenomiqueENS/toulligQC
6
6
  Author: Genomic Paris Centre team
7
- Author-email: toulligqc@biologie.ens.fr
7
+ Author-email: toulligqc@bio.ens.psl.eu
8
8
  License: GPL V3
9
9
  Keywords: Nanopore MinION QC report
10
10
  Platform: ALL
@@ -15,7 +15,7 @@ Classifier: Intended Audience :: Science/Research
15
15
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
16
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
17
17
  Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
18
- Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
19
  Requires-Python: >=3.11.0
20
20
  License-File: LICENSE-CeCILL.txt
21
21
  License-File: LICENSE.txt
@@ -24,6 +24,7 @@ Support is availlable on [GitHub issue page](https://github.com/GenomicParisCent
24
24
  * 1.3 [Docker](#docker)
25
25
  * [Docker image recovery](#docker-image-recovery)
26
26
  * [Launching Docker image with docker run](#launching-Docker-image-with-docker-run)
27
+ * 1.4 [nf-core module](#nfcore-module)
27
28
 
28
29
  * 2.[Usage](#usage)
29
30
  * 2.1 [Command line](#command-line)
@@ -93,14 +94,25 @@ $ docker run -ti \
93
94
  -v /path/to/basecaller/sequencing/summary/file:/path/to/basecaller/sequencing/summary/file \
94
95
  -v /path/to/basecaller/sequencing/telemetry/file:/path/to/basecaller/telemetry/summary/file \
95
96
  -v /path/to/result/directory:/path/to/result/directory \
96
- toulligqc:latest
97
+ genomicpariscentre/toulligqc:latest
97
98
  ```
99
+
100
+ <a name="nfcore-module"></a>
101
+ ### 1.4 Using nf-core module
102
+ ToulligQC is also available on nf-core as a module written in nextflow. To install nf-core on your system, please visit their website (<https://nf-co.re/docs/usage/introduction>).
103
+
104
+ The following command line will install the latest version of the ToulligQC module:
105
+
106
+ ```bash
107
+ $ nf-core modules install toulligqc
108
+ ```
109
+
98
110
  <a name="usage"></a>
99
111
  ## 2. Usage
100
112
  <a name="command-line"></a>
101
113
 
102
114
  ToulligQC is adapted to RNA-Seq along with DNA-Seq and it is compatible with 1D² runs.
103
- This QC tool supports only Guppy basecalling ouput files.
115
+ This QC tool supports only Guppy and Dorado basecalling output files.
104
116
  It also needs a single FAST5 file (to catch the flowcell ID and the run date) if a telemetry file is not provided.
105
117
  Flow cells and kits version are retrieved using the telemetry file.
106
118
  ToulligQC can take barcoding samples by adding the barcode list as a command line option.
@@ -111,7 +123,7 @@ To do so, ToulligQC deals with different file formats: gz, tar.gz, bz2, tar.bz2
111
123
  This tool will produce a set of graphs, statistic file in plain text format and a HTML report.
112
124
 
113
125
 
114
- To run ToulligQC you need the Guppy basecaller output files : ```sequencing_summary.txt``` and ```sequencing_telemetry.js```. or ```FASTQ``` or ```BAM```
126
+ To run ToulligQC you need the Guppy/Dorado basecaller output files: ```sequencing_summary.txt``` and ```sequencing_telemetry.js```, or ```FASTQ``` or ```BAM```
115
127
  This can be compressed with gzip or bzip2.
116
128
  You can use your initial Fast5 ONT file too.
117
129
  ToulligQC can perform analyses on your data if the directory is organised as the following:
@@ -132,7 +144,7 @@ RUN_ID
132
144
  └── sequencing_1dsq_summary.txt
133
145
  ```
134
146
 
135
- For a barcoded run you can add the barcoding files generated by Guppy ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
147
+ For a barcoded run you can add the barcoding files generated by Guppy/Dorado ```barcoding_summary_pass.txt``` and ```barcoding_summary_fail.txt``` to ToulligQC or a single file ```sequencing_summary_all.txt``` containing sequencing_summary and barcoding_summary information combined.
136
148
 
137
149
  For the barcode list to use in the command line options, ToulligQC handle the following naming schemes: BCXX, RBXX, NBXX and barcodeXX where XX is the number of the barcode.
138
150
  The barcode naming schemes are case insensitive.
@@ -156,14 +168,16 @@ This is a directory for 1D² analysis with barcoding files:
156
168
 
157
169
  General Options:
158
170
  ```
159
- usage: ToulligQC V2.2.1 -a SEQUENCING_SUMMARY_SOURCE [-t TELEMETRY_SOURCE]
160
- [--fastq -q FASTQ] [--bam -u BAM]
161
- [-f FAST5_SOURCE] [-n REPORT_NAME]
162
- [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
163
- [--data-report-path DATA_REPORT_PATH]
164
- [--images-directory IMAGES_DIRECTORY]
165
- [-d SEQUENCING_SUMMARY_1DSQR_SOURCE] [-b]
166
- [-l BARCODES] [--quiet] [--force] [-h] [--version]
171
+ usage: ToulligQC V2.6 [-a SEQUENCING_SUMMARY_SOURCE] [-t TELEMETRY_SOURCE]
172
+ [-f FAST5_SOURCE] [-p POD5_SOURCE] [-q FASTQ] [-u BAM]
173
+ [--thread THREAD] [--batch-size BATCH_SIZE] [--qscore-threshold THRESHOLD]
174
+ [-n REPORT_NAME] [--output-directory OUTPUT] [-o HTML_REPORT_PATH]
175
+ [--data-report-path DATA_REPORT_PATH]
176
+ [--images-directory IMAGES_DIRECTORY]
177
+ [-d SEQUENCING_SUMMARY_1DSQR_SOURCE]
178
+ [-s SAMPLESHEET]
179
+ [-b] [-l BARCODES]
180
+ [--quiet] [--force] [-h] [--version]
167
181
 
168
182
  required arguments:
169
183
  -a SEQUENCING_SUMMARY_SOURCE, --sequencing-summary-source SEQUENCING_SUMMARY_SOURCE
@@ -175,6 +189,9 @@ required arguments:
175
189
  -f FAST5_SOURCE, --fast5-source FAST5_SOURCE
176
190
  Fast5 file source (necessary if no telemetry file),
177
191
  can also be in a tar.gz/tar.bz2 archive or a directory
192
+ -p POD5_SOURCE, --pod5-source POD5_SOURCE
193
+ pod5 file source (necessary if no telemetry file),
194
+ can also be in a tar.gz/tar.bz2 archive or a directory
178
195
  -q FASTQ, --fastq FASTQ
179
196
  FASTQ file (necessary if no sequencing summary file),
180
197
  can also be in a .gz archive
@@ -183,6 +200,8 @@ required arguments:
183
200
  can also be a SAM format
184
201
 
185
202
  optional arguments:
203
+ -s SAMPLESHEET, --samplesheet SAMPLESHEET
204
+ Samplesheet (.csv file) to fill out sample names in MinKNOW.
186
205
  -n REPORT_NAME, --report-name REPORT_NAME
187
206
  Report name
188
207
  --output-directory OUTPUT
@@ -197,8 +216,9 @@ optional arguments:
197
216
  Basecaller 1dsq summary source
198
217
  -b, --barcoding Option for barcode usage
199
218
  -l BARCODES, --barcodes BARCODES
200
- Coma separated barcode list (e.g.
201
- BC05,RB09,NB01,barcode10)
219
+ Comma-separated barcode list (e.g.,
220
+ BC05,RB09,NB01,barcode10) or a range separated with ':' (e.g.,
221
+ barcode01:barcode19)
202
222
  --thread THREAD Number of threads for parsing FASTQ or BAM files (default: 2).
203
223
  --batch-size BATCH_SIZE Batch size for each threads (default: 500).
204
224
  --qscore-threshold THRESHOLD Q-score threshold to distinguish between passing filter and
@@ -213,7 +233,41 @@ optional arguments:
213
233
  * #### Examples
214
234
 
215
235
 
216
- Example with optional arguments:
236
+ * Sequencing summary alone \
237
+ Note that the flowcell ID and run date will be missing from the report, as they are only found in the telemetry file or a single fast5 file
238
+
239
+ ```bash
240
+ $ toulligqc --report-name summary_only \
241
+ --sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
242
+ --html-report-path /path/to/output/report.html
243
+ ```
244
+ * Sequencing summary + telemetry file
245
+
246
+ ```bash
247
+ $ toulligqc --report-name summary_plus_telemetry \
248
+ --telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
249
+ --sequencing-summary-source /path/to/basecaller/output/sequencing_summary.txt \
250
+ --html-report-path /path/to/output/report.html
251
+ ```
252
+
253
+ * Telemetry file + fast5 files
254
+
255
+ ```bash
256
+ $ toulligqc --report-name telemetry_plus_fast5 \
257
+ --telemetry-source /path/to/basecaller/output/sequencing_telemetry.js \
258
+ --fast5-source /path/to/basecaller/output/fast5_files.fast5.gz \
259
+ --html-report-path /path/to/output/report.html
260
+ ```
261
+
262
+ * Fastq/ bam files only
263
+
264
+ ```bash
265
+ $ toulligqc --report-name FAF0256 \
266
+ --fastq /path/to/basecaller/output/fastq_files.fq.gz \ # (replace with --bam)
267
+ --html-report-path /path/to/output/report.html
268
+ ```
269
+
270
+ * Optional arguments for 1D² analysis
217
271
 
218
272
  ```bash
219
273
  $ toulligqc --report-name FAF0256 \
@@ -223,7 +277,7 @@ $ toulligqc --report-name FAF0256 \
223
277
  --html-report-path /path/to/output/report.html
224
278
  ```
225
279
 
226
- Example with optional arguments to deal with barcoded samples:
280
+ * Optional arguments to deal with barcoded samples
227
281
 
228
282
  ```bash
229
283
  $ toulligqc --report-name FAF0256 \
@@ -271,7 +325,7 @@ $ toulligqc \
271
325
  --sequencing-summary-source sequencing_summary.txt \
272
326
  --sequencing-summary-source barcoding_summary_pass.txt \
273
327
  --sequencing-summary-source barcoding_summary_fail.txt \
274
- --barcodes BC01,BC02,BC03,BC04,BC05,BC07 \
328
+ --barcodes BC01:BC07 \
275
329
  --output-directory output
276
330
  ```
277
331
 
@@ -14,11 +14,11 @@ setup(
14
14
  long_description='See project website for more information.',
15
15
 
16
16
  # The project's main homepage.
17
- url='https://github.com/GenomicParisCentre/toulligQC',
17
+ url='https://github.com/GenomiqueENS/toulligQC',
18
18
 
19
19
  # Author details
20
20
  author='Genomic Paris Centre team',
21
- author_email='toulligqc@biologie.ens.fr',
21
+ author_email='toulligqc@bio.ens.psl.eu',
22
22
 
23
23
  license='GPL V3',
24
24
  platforms='ALL',
@@ -34,7 +34,7 @@ setup(
34
34
  'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
35
35
  'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',
36
36
 
37
- 'Programming Language :: Python :: 3.11'
37
+ 'Programming Language :: Python :: 3.12'
38
38
  ],
39
39
 
40
40
  keywords='Nanopore MinION QC report',
@@ -46,10 +46,10 @@ setup(
46
46
  include_package_data=True,
47
47
 
48
48
  python_requires='>=3.11.0',
49
- install_requires=['matplotlib>=3.6.3', 'plotly==4.5.0', 'h5py>=3.7.0',
50
- 'pandas>=1.5.3', 'numpy>=1.24.2', 'scipy>=1.10.1',
51
- 'scikit-learn>=1.2.1', 'tqdm>=4.64.1', 'pysam>=0.21.0',
52
- 'ezcharts==0.7.6'],
49
+ install_requires=['matplotlib>=3.6.3', 'plotly==5.15.0', 'h5py>=3.10.0',
50
+ 'pandas>=2.1.4', 'numpy>=1.26.4', 'scipy>=1.11.4',
51
+ 'scikit-learn>=1.4.1', 'tqdm>=4.66.2', 'pysam>=0.22.0',
52
+ 'pod5>=0.3.10', 'ezcharts==0.7.6'],
53
53
 
54
54
  entry_points={
55
55
  'console_scripts': [
@@ -4,7 +4,7 @@ import numpy as np
4
4
  import pandas as pd
5
5
  import time
6
6
  import pysam
7
- from datetime import datetime
7
+ from collections import defaultdict
8
8
  from toulligqc.extractor_common import log_task
9
9
  from toulligqc.extractor_common import describe_dict
10
10
  from toulligqc.extractor_common import set_result_value
@@ -15,6 +15,7 @@ from toulligqc.extractor_common import get_result_value
15
15
  from toulligqc.extractor_common import set_result_dict_telemetry_value
16
16
  from toulligqc.extractor_common import fill_series_dict
17
17
  from toulligqc.extractor_common import timeISO_to_float
18
+ from toulligqc.extractor_common import extract_barcode_info
18
19
  from toulligqc.common_statistics import compute_NXX, compute_LXX, occupancy_channel, avg_qual
19
20
  from toulligqc.fastq_bam_common import multiprocessing_submit, extract_headerTag
20
21
  from toulligqc.fastq_bam_common import batch_iterator
@@ -24,13 +25,16 @@ from toulligqc import plotly_graph_generator as pgg
24
25
 
25
26
  class uBAM_Extractor:
26
27
  def __init__(self, config_dictionary):
27
- self.config_file_dictionary = config_dictionary
28
+ self.config_dictionary = config_dictionary
28
29
  self.ubam = config_dictionary['bam'].split('\t')
29
30
  self.images_directory = config_dictionary['images_directory']
30
31
  self.threshold_Qscore = int(config_dictionary['threshold'])
31
32
  self.batch_size = int(config_dictionary['batch_size'])
32
33
  self.thread = int(config_dictionary['thread'])
33
34
  self.header = dict()
35
+ self.is_barcode = False
36
+ if config_dictionary['barcoding'] == 'True':
37
+ self.is_barcode = True
34
38
  if 'quiet' not in config_dictionary or config_dictionary['quiet'].lower() != 'true':
35
39
  self.quiet = False
36
40
  else:
@@ -53,12 +57,24 @@ class uBAM_Extractor:
53
57
  :return: Panda's Dataframe object
54
58
  """
55
59
  start_time = time.time()
56
- self.dataframe_1d = self._load_uBAM_data()
57
- if self.dataframe_1d.empty:
60
+ self.dataframe = self._load_uBAM_file()
61
+ if self.dataframe.empty:
58
62
  raise pd.errors.EmptyDataError("Dataframe is empty")
59
63
  self.dataframe_dict = {}
64
+
65
+ # Add missing categories
66
+ if 'barcode_arrangement' in self.dataframe.columns:
67
+ self.dataframe['barcode_arrangement'] = self.dataframe['barcode_arrangement'].cat.add_categories([0,
68
+ 'other barcodes',
69
+ 'passes_filtering'])
70
+
71
+ # Replace all NaN values by 0 to avoid data manipulation errors when columns are not the same length
72
+ self.dataframe = self.dataframe.fillna(0)
73
+
74
+ self.barcode_selection = self.config_dictionary['barcode_selection']
75
+
60
76
  log_task(self.quiet,
61
- 'Load BAM file ({:,.2f} MB used)'.format(self.dataframe_1d.memory_usage(deep=True).sum()/1024/1024),
77
+ 'Load BAM file ({:,.2f} MB used)'.format(self.dataframe.memory_usage(deep=True).sum()/1024/1024),
62
78
  start_time,
63
79
  time.time())
64
80
 
@@ -70,7 +86,7 @@ class uBAM_Extractor:
70
86
  """
71
87
  check_result_values(self, result_dict)
72
88
  self.dataframe_dict.clear()
73
- self.dataframe_1d.iloc[0:0]
89
+ self.dataframe.iloc[0:0]
74
90
 
75
91
 
76
92
  @staticmethod
@@ -79,7 +95,7 @@ class uBAM_Extractor:
79
95
  Get the name of the extractor.
80
96
  :return: the name of the extractor
81
97
  """
82
- return 'ubam'
98
+ return 'uBAM'
83
99
 
84
100
 
85
101
  @staticmethod
@@ -100,14 +116,38 @@ class uBAM_Extractor:
100
116
 
101
117
  add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
102
118
  add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
103
- add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
119
+ add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe, self.images_directory))
104
120
  add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
105
121
  add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
106
- add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
122
+ add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe, self.images_directory))
107
123
  add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
108
124
  add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
109
125
  add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
110
126
  add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
127
+ if self.is_barcode:
128
+ if "barcode_alias" in self.config_dictionary:
129
+ barcode_alias = self.config_dictionary['barcode_alias']
130
+ else:
131
+ barcode_alias = None
132
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
133
+ self.barcode_selection,
134
+ self.images_directory,
135
+ barcode_alias))
136
+
137
+ read_fail = self.dataframe_dict["read.fail.barcoded"]
138
+ if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
139
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
140
+ self.barcode_selection,
141
+ self.images_directory,
142
+ barcode_alias))
143
+
144
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
145
+ self.images_directory,
146
+ barcode_alias))
147
+
148
+ add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
149
+ self.images_directory,
150
+ barcode_alias))
111
151
  return images
112
152
 
113
153
 
@@ -117,7 +157,7 @@ class uBAM_Extractor:
117
157
  :param result_dict:
118
158
  """
119
159
  start_time = time.time()
120
- fill_series_dict(self.dataframe_dict, self.dataframe_1d)
160
+ fill_series_dict(self.dataframe_dict, self.dataframe)
121
161
 
122
162
  set_result_dict_telemetry_value(result_dict, "run.id", self.header["run_id"])
123
163
  set_result_dict_telemetry_value(result_dict, "sample.id", self.header["sample_id"])
@@ -128,11 +168,11 @@ class uBAM_Extractor:
128
168
  set_result_dict_telemetry_value(result_dict, "basecalling.date", self.header["run_date"])
129
169
  set_result_dict_telemetry_value(result_dict, "pass.threshold.qscore", str(self.threshold_Qscore))
130
170
 
131
- set_result_value(self, result_dict, "read.count", len(self.dataframe_1d))
171
+ set_result_value(self, result_dict, "read.count", len(self.dataframe))
132
172
  set_result_value(self, result_dict, "read.pass.count",
133
- count_boolean_elements(self.dataframe_1d, 'passes_filtering', True))
173
+ count_boolean_elements(self.dataframe, 'passes_filtering', True))
134
174
  set_result_value(self, result_dict, "read.fail.count",
135
- count_boolean_elements(self.dataframe_1d, 'passes_filtering', False))
175
+ count_boolean_elements(self.dataframe, 'passes_filtering', False))
136
176
  total_reads = get_result_value(self, result_dict, "read.count")
137
177
 
138
178
  # Ratios
@@ -160,9 +200,9 @@ class uBAM_Extractor:
160
200
  set_result_value(self, result_dict, "n50", compute_NXX(self.dataframe_dict, 50))
161
201
  set_result_value(self, result_dict, "l50", compute_LXX(self.dataframe_dict, 50))
162
202
 
163
- set_result_value(self, result_dict, "run.time", max(self.dataframe_1d['start_time']))
203
+ set_result_value(self, result_dict, "run.time", max(self.dataframe['start_time']))
164
204
  # Get channel occupancy statistics and store each value into result_dict
165
- for index, value in occupancy_channel(self.dataframe_1d).items():
205
+ for index, value in occupancy_channel(self.dataframe).items():
166
206
  set_result_value(self,
167
207
  result_dict, "channel.occupancy.statistics." + index, value)
168
208
 
@@ -180,7 +220,7 @@ class uBAM_Extractor:
180
220
  "fail.reads.sequence.length")
181
221
 
182
222
  # Get Qscore statistics without count value and store them into result_dict
183
- qscore_statistics = self.dataframe_1d['mean_qscore'].describe().drop(
223
+ qscore_statistics = self.dataframe['mean_qscore'].describe().drop(
184
224
  "count")
185
225
 
186
226
  for index, value in qscore_statistics.items():
@@ -190,11 +230,16 @@ class uBAM_Extractor:
190
230
  # Add statistics (without count) about read pass/fail qscore in the result_dict
191
231
  describe_dict(self, result_dict, self.dataframe_dict["pass.reads.mean.qscore"], "pass.reads.mean.qscore")
192
232
  describe_dict(self, result_dict, self.dataframe_dict["fail.reads.mean.qscore"], "fail.reads.mean.qscore")
193
-
233
+ if self.is_barcode:
234
+ extract_barcode_info(self, result_dict,
235
+ self.barcode_selection,
236
+ self.dataframe_dict,
237
+ self.dataframe)
238
+
194
239
  log_task(self.quiet, 'Extract info from uBAM file', start_time, time.time())
195
240
 
196
241
 
197
- def _load_uBAM_data(self):
242
+ def _load_uBAM_file(self):
198
243
  """
199
244
  Load uBAM dataframe
200
245
  :return: a Pandas Dataframe object
@@ -205,23 +250,27 @@ class uBAM_Extractor:
205
250
  uBAM_chunks,
206
251
  n_process=self.thread,
207
252
  pbar_update=self.batch_size)
208
- uBAM_data = []
253
+ uBAM_df = []
209
254
 
210
255
  for _, f in enumerate(rst_futures):
211
- uBAM_data.extend(f.result())
256
+ uBAM_df.extend(f.result())
212
257
 
213
258
  columns = ['sequence_length', 'mean_qscore', 'passes_filtering', 'start_time', 'channel', 'duration']
214
-
215
- uBAM_data = pd.DataFrame(uBAM_data, columns=columns)
259
+ if self.is_barcode:
260
+ columns.append('barcode_arrangement')
216
261
 
217
- uBAM_data['sequence_length'] = uBAM_data['sequence_length'].astype(np.uint32)
218
- uBAM_data['mean_qscore'] = uBAM_data['mean_qscore'].astype(np.float32)
219
- uBAM_data['passes_filtering'] = uBAM_data['passes_filtering'].astype(np.bool_ if is_numpy_1_24 else np.bool)
220
- uBAM_data["start_time"] = uBAM_data["start_time"] - uBAM_data["start_time"].min()
221
- uBAM_data['channel'] = uBAM_data['channel'].astype(np.int16)
222
- uBAM_data['start_time'] = uBAM_data['start_time'].astype(np.float64)
223
- uBAM_data['duration'] = uBAM_data['duration'].astype(np.float32)
224
- return uBAM_data
262
+ uBAM_df = pd.DataFrame(uBAM_df, columns=columns)
263
+
264
+ uBAM_df['sequence_length'] = uBAM_df['sequence_length'].astype(np.uint32)
265
+ uBAM_df['mean_qscore'] = uBAM_df['mean_qscore'].astype(np.float32)
266
+ uBAM_df['passes_filtering'] = uBAM_df['passes_filtering'].astype(np.bool_ if is_numpy_1_24 else np.bool)
267
+ uBAM_df["start_time"] = uBAM_df["start_time"] - uBAM_df["start_time"].min()
268
+ uBAM_df['channel'] = uBAM_df['channel'].astype(np.int16)
269
+ uBAM_df['start_time'] = uBAM_df['start_time'].astype(np.float64)
270
+ uBAM_df['duration'] = uBAM_df['duration'].astype(np.float32)
271
+ if self.is_barcode:
272
+ uBAM_df['barcode_arrangement'] = uBAM_df['barcode_arrangement'].astype("category")
273
+ return uBAM_df
225
274
 
226
275
 
227
276
  def _uBAM_batch_reader(self, uBAM_chunk):
@@ -251,14 +300,6 @@ class uBAM_Extractor:
251
300
  yield batch
252
301
 
253
302
 
254
- def _timeISO_to_float(self, iso_datetime, format):
255
- """
256
- """
257
- dt = datetime.strptime(iso_datetime, format)
258
- unix_timestamp = dt.timestamp()
259
- return unix_timestamp
260
-
261
-
262
303
  def _get_header(self):
263
304
  sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
264
305
  header = sam_file.header.to_dict()
@@ -299,4 +340,6 @@ class uBAM_Extractor:
299
340
  attributes.get('ch', '1'), # Channel
300
341
  attributes.get('du', '1') # Duration
301
342
  ]
343
+ if self.is_barcode:
344
+ data.append(attributes.get('BC', 'unclassified'))
302
345
  return data
@@ -20,6 +20,8 @@
20
20
 
21
21
  import numpy as np
22
22
  from packaging import version
23
+ import glob
24
+ import os
23
25
 
24
26
  def is_numpy_1_24():
25
27
  """
@@ -27,6 +29,7 @@ def is_numpy_1_24():
27
29
  """
28
30
  return version.parse(np.__version__) >= version.parse("1.20")
29
31
 
32
+
30
33
  def format_duration(t):
31
34
  """
32
35
  Format a time duration
@@ -35,3 +38,28 @@ def format_duration(t):
35
38
  """
36
39
 
37
40
  return "{:,d}m{:2.2f}s".format(int(t // 60), t % 60)
41
+
42
+
43
+ def set_result_dict_value(result_dict, key, tracking_id_dict, dict_key):
44
+ """
45
+ Set metadata values from Fast5 or pod5 dict to result_dict
46
+ """
47
+ value = ''
48
+ if dict_key in tracking_id_dict:
49
+ value = tracking_id_dict[dict_key]
50
+
51
+ result_dict[key] = value
52
+
53
+
54
+ def find_file_in_directory(source_file, format):
55
+ """
56
+ Looking for a suitable Fast5 or Pod5 file in the source directory.
57
+ :return: The path to the first suitable file in the source directory
58
+ """
59
+ for ext in (format, 'tar.bz2', 'tar.gz'):
60
+ if glob.glob(source_file + '/*.' + ext):
61
+ files_found = os.listdir(source_file)
62
+ if len(files_found) > 0:
63
+ return source_file + files_found[0]
64
+
65
+ return None
@@ -164,16 +164,24 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
164
164
  if "unclassified" not in barcode_selection:
165
165
  barcode_selection.append("unclassified")
166
166
 
167
+
168
+ # If the barcode_arrangement column contains a barcode kit id
169
+ mask = df['barcode_arrangement'].str.startswith(('SQK', 'VQK'))
170
+
171
+ if mask.any():
172
+ df['barcode_arrangement'] = df['barcode_arrangement'].astype(str)
173
+ df.loc[mask, 'barcode_arrangement'] = df.loc[mask, 'barcode_arrangement'].str.extract(r'[SV]QK-.+_(.+)$')[0]
174
+
167
175
  # Create keys barcode.arrangement, and read.pass/fail.barcode in dataframe_dict with all values of
168
176
  # column barcode_arrangement when reads are passed/failed
169
- dataframe_dict["barcode.arrangement"] = df["barcode_arrangement"]
177
+ dataframe_dict["barcode.arrangement"] = df['barcode_arrangement']
178
+
170
179
 
171
180
  # Print warning message if a barcode is unknown
172
- barcodes_found = set(dataframe_dict["barcode.arrangement"].unique())
181
+ barcodes_found = set(df["barcode_arrangement"].unique())
173
182
  for element in barcode_selection:
174
183
  if element not in barcodes_found and element != 'other barcodes':
175
- sys.stderr.write("Warning: The barcode {} doesn't exist in input data\n".format(element))
176
-
184
+ sys.stderr.write("\033[93mWarning:\033[0m The barcode {} doesn't exist in input data\n".format(element))
177
185
 
178
186
  # Get barcodes frequency by Bases
179
187
  df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
@@ -218,6 +226,7 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
218
226
  (read_fail_barcoded_count / total_reads) * 100)
219
227
 
220
228
  # Replaces all rows with unused barcodes (ie not in barcode_selection) in column barcode_arrangement with the 'other' value
229
+
221
230
  df.loc[~df['barcode_arrangement'].isin(
222
231
  barcode_selection), 'barcode_arrangement'] = 'other barcodes'
223
232
 
@@ -447,6 +456,7 @@ def read_first_line_file(filename):
447
456
  except IOError:
448
457
  raise FileNotFoundError
449
458
 
459
+
450
460
  def set_result_dict_telemetry_value(result_dict, key, new_value):
451
461
  """
452
462
  """
@@ -461,4 +471,15 @@ def set_result_dict_telemetry_value(result_dict, key, new_value):
461
471
  if new_value is None:
462
472
  new_value = current_value
463
473
 
464
- result_dict[final_key] = new_value
474
+ result_dict[final_key] = new_value
475
+
476
+
477
+ def pd_read_sequencing_summary(file, cols, data_type):
478
+ try:
479
+ return pd.read_csv(file, sep="\t", usecols=cols,
480
+ dtype=data_type)
481
+ except:
482
+ del data_type['passes_filtering']
483
+ cols.remove('passes_filtering')
484
+ return pd.read_csv(file, sep="\t", usecols=cols,
485
+ dtype=data_type)