toulligqc 2.5.2__tar.gz → 2.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {toulligqc-2.5.2 → toulligqc-2.5.4}/PKG-INFO +3 -3
- {toulligqc-2.5.2 → toulligqc-2.5.4}/setup.py +5 -4
- {toulligqc-2.5.2 → toulligqc-2.5.4}/test/test_sequencing_summary_extractor.py +2 -1
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/bam_extractor.py +32 -21
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/fastq_bam_common.py +17 -2
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/fastq_extractor.py +2 -1
- toulligqc-2.5.4/toulligqc/version.py +1 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/PKG-INFO +3 -3
- toulligqc-2.5.4/toulligqc.egg-info/requires.txt +9 -0
- toulligqc-2.5.2/toulligqc/version.py +0 -1
- toulligqc-2.5.2/toulligqc.egg-info/requires.txt +0 -7
- {toulligqc-2.5.2 → toulligqc-2.5.4}/AUTHORS +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/LICENSE-CeCILL.txt +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/LICENSE.txt +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/MANIFEST.in +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/README.md +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/setup.cfg +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/__init__.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/common.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/common_statistics.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/configuration.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/extractor_common.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/fast5_extractor.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/html_report_generator.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/plotly_graph_common.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/plotly_graph_generator.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/plotly_graph_onedsquare_generator.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/report_data_file_generator.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/resources/plotly-latest.min.js +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/resources/toulligqc.css +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/resources/toulligqc.png +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/sequencing_summary_extractor.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/sequencing_summary_onedsquare_extractor.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/sequencing_telemetry_extractor.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/toulligqc.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc/toulligqc_info_extractor.py +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/SOURCES.txt +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/dependency_links.txt +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/entry_points.txt +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/not-zip-safe +0 -0
- {toulligqc-2.5.2 → toulligqc-2.5.4}/toulligqc.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.5.2
|
|
3
|
+
Version: 2.5.4
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
5
|
Home-page: https://github.com/GenomicParisCentre/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
@@ -15,8 +15,8 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
19
|
-
Requires-Python: >=3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
22
22
|
License-File: AUTHORS
|
|
@@ -34,7 +34,7 @@ setup(
|
|
|
34
34
|
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
|
35
35
|
'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)',
|
|
36
36
|
|
|
37
|
-
'Programming Language :: Python :: 3.
|
|
37
|
+
'Programming Language :: Python :: 3.11'
|
|
38
38
|
],
|
|
39
39
|
|
|
40
40
|
keywords='Nanopore MinION QC report',
|
|
@@ -45,9 +45,10 @@ setup(
|
|
|
45
45
|
zip_safe=False,
|
|
46
46
|
include_package_data=True,
|
|
47
47
|
|
|
48
|
-
python_requires='>=3.
|
|
49
|
-
install_requires=['matplotlib>=3.
|
|
50
|
-
'pandas>=
|
|
48
|
+
python_requires='>=3.11.0',
|
|
49
|
+
install_requires=['matplotlib>=3.6.3', 'plotly>=5.15.0', 'h5py>=3.7.0',
|
|
50
|
+
'pandas>=1.5.3', 'numpy>=1.24.2', 'scipy>=1.10.1',
|
|
51
|
+
'scikit-learn>=1.2.1', 'tqdm>=4.64.1', 'pysam>=0.21.0'],
|
|
51
52
|
|
|
52
53
|
entry_points={
|
|
53
54
|
'console_scripts': [
|
|
@@ -6,6 +6,7 @@ from unittest.mock import patch, Mock, MagicMock
|
|
|
6
6
|
import config as cfg
|
|
7
7
|
import pandas as pd
|
|
8
8
|
import pandas.util.testing as testing
|
|
9
|
+
from toulligqc.common import is_numpy_1_24
|
|
9
10
|
import numpy as np
|
|
10
11
|
from distutils import util
|
|
11
12
|
|
|
@@ -65,7 +66,7 @@ class TestSequencingSummaryExtractorWholeConfig (unittest.TestCase):
|
|
|
65
66
|
cls.expected_df = cls.expected_df.astype({
|
|
66
67
|
'channel': np.int16,
|
|
67
68
|
'start_time': np.float,
|
|
68
|
-
'passes_filtering': np.bool,
|
|
69
|
+
'passes_filtering': np.bool_ if is_numpy_1_24 else np.bool,
|
|
69
70
|
'sequence_length': np.uint32,
|
|
70
71
|
'mean_qscore_template': np.float,
|
|
71
72
|
'duration': np.float,
|
|
@@ -18,6 +18,7 @@ from toulligqc.extractor_common import timeISO_to_float
|
|
|
18
18
|
from toulligqc.common_statistics import compute_NXX, compute_LXX, occupancy_channel, avg_qual
|
|
19
19
|
from toulligqc.fastq_bam_common import multiprocessing_submit, extract_headerTag
|
|
20
20
|
from toulligqc.fastq_bam_common import batch_iterator
|
|
21
|
+
from toulligqc.common import is_numpy_1_24
|
|
21
22
|
from toulligqc import plotly_graph_generator as pgg
|
|
22
23
|
|
|
23
24
|
|
|
@@ -215,7 +216,7 @@ class uBAM_Extractor:
|
|
|
215
216
|
|
|
216
217
|
uBAM_data['sequence_length'] = uBAM_data['sequence_length'].astype(np.uint32)
|
|
217
218
|
uBAM_data['mean_qscore'] = uBAM_data['mean_qscore'].astype(np.float32)
|
|
218
|
-
uBAM_data['passes_filtering'] = uBAM_data['passes_filtering'].astype(np.bool)
|
|
219
|
+
uBAM_data['passes_filtering'] = uBAM_data['passes_filtering'].astype(np.bool_ if is_numpy_1_24 else np.bool)
|
|
219
220
|
uBAM_data["start_time"] = uBAM_data["start_time"] - uBAM_data["start_time"].min()
|
|
220
221
|
uBAM_data['channel'] = uBAM_data['channel'].astype(np.int16)
|
|
221
222
|
uBAM_data['start_time'] = uBAM_data['start_time'].astype(np.float64)
|
|
@@ -230,8 +231,10 @@ class uBAM_Extractor:
|
|
|
230
231
|
"""
|
|
231
232
|
#def process_bam_chunk(bam_chunk):
|
|
232
233
|
rec_data = []
|
|
234
|
+
record_count = 0
|
|
233
235
|
for rec in uBAM_chunk:
|
|
234
|
-
|
|
236
|
+
record_count += 1
|
|
237
|
+
rec_dict = self._process_record(rec, record_count)
|
|
235
238
|
rec_data.append(rec_dict)
|
|
236
239
|
return rec_data
|
|
237
240
|
|
|
@@ -257,35 +260,43 @@ class uBAM_Extractor:
|
|
|
257
260
|
|
|
258
261
|
|
|
259
262
|
def _get_header(self):
|
|
260
|
-
|
|
261
|
-
header =
|
|
262
|
-
run_id, model_version_id =
|
|
263
|
+
sam_file = pysam.AlignmentFile(self.ubam[0], "rb", check_sq=False)
|
|
264
|
+
header = sam_file.header.to_dict()
|
|
265
|
+
run_id, model_version_id = extract_headerTag(header, 'RG','ID',
|
|
266
|
+
'Unknown_Unknown').split('_', 1)
|
|
263
267
|
self.header = {
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
268
|
+
"run_id": run_id,
|
|
269
|
+
"run_date": extract_headerTag(header, 'RG', 'DT', 'Unknown'),
|
|
270
|
+
"sample_id": extract_headerTag(header, 'RG', 'SM', 'Unknown'),
|
|
271
|
+
"basecaller": extract_headerTag(header, 'PG', 'PN', 'Unknown'),
|
|
272
|
+
"basecaller_version": extract_headerTag(header, 'PG', 'VN', 'Unknown'),
|
|
273
|
+
"model_version_id": model_version_id,
|
|
274
|
+
"flow_cell_id": extract_headerTag(header, 'RG', 'PU', 'Unknown')
|
|
271
275
|
}
|
|
272
276
|
|
|
273
277
|
|
|
274
|
-
def _process_record(self, rec):
|
|
278
|
+
def _process_record(self, rec, record_count):
|
|
275
279
|
"""
|
|
276
280
|
extract QC info from BAM record
|
|
277
281
|
return : dict of QC info
|
|
278
282
|
"""
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
283
|
+
fields = rec.split("\t")
|
|
284
|
+
|
|
285
|
+
# Parse optional fields
|
|
286
|
+
attributes = {}
|
|
287
|
+
for t in fields[11:]:
|
|
288
|
+
k, t, v = t.split(':', 2)
|
|
289
|
+
attributes[k] = v
|
|
290
|
+
|
|
291
|
+
iso_start_time = attributes.get('st', None)
|
|
292
|
+
qual = avg_qual(fields[10])
|
|
282
293
|
passes_filtering = True if qual > self.threshold_Qscore else False
|
|
283
294
|
data = [
|
|
284
|
-
len(
|
|
295
|
+
len(fields[9]), # read length
|
|
285
296
|
qual, # AVG Qscore
|
|
286
297
|
passes_filtering, # Passing filter
|
|
287
|
-
timeISO_to_float(iso_start_time, '%Y-%m-%dT%H:%M:%S.%f%z'), # start time
|
|
288
|
-
|
|
289
|
-
|
|
298
|
+
float(record_count) if iso_start_time is None else timeISO_to_float(iso_start_time, '%Y-%m-%dT%H:%M:%S.%f%z'), # start time
|
|
299
|
+
attributes.get('ch', '1'), # Channel
|
|
300
|
+
attributes.get('du', '1') # Duration
|
|
290
301
|
]
|
|
291
|
-
return data
|
|
302
|
+
return data
|
|
@@ -2,8 +2,23 @@ import multiprocessing as mp
|
|
|
2
2
|
from tqdm import tqdm
|
|
3
3
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
4
4
|
|
|
5
|
-
def extract_headerTag(header, tagGroup, tag):
|
|
6
|
-
|
|
5
|
+
def extract_headerTag(header, tagGroup, tag, defaultValue = None):
|
|
6
|
+
|
|
7
|
+
if tagGroup not in header:
|
|
8
|
+
if defaultValue is not None:
|
|
9
|
+
return defaultValue
|
|
10
|
+
else:
|
|
11
|
+
raise KeyError(tagGroup)
|
|
12
|
+
|
|
13
|
+
first_entry = header[tagGroup][0]
|
|
14
|
+
|
|
15
|
+
if tag not in first_entry:
|
|
16
|
+
if defaultValue is not None:
|
|
17
|
+
return defaultValue
|
|
18
|
+
else:
|
|
19
|
+
raise KeyError(tag)
|
|
20
|
+
|
|
21
|
+
return first_entry[tag]
|
|
7
22
|
|
|
8
23
|
|
|
9
24
|
def batch_iterator(iterator, batch_size):
|
|
@@ -16,6 +16,7 @@ from toulligqc.extractor_common import set_result_dict_telemetry_value
|
|
|
16
16
|
from toulligqc.extractor_common import timeISO_to_float
|
|
17
17
|
from toulligqc.common_statistics import compute_NXX, compute_LXX, occupancy_channel, avg_qual
|
|
18
18
|
from toulligqc.fastq_bam_common import multiprocessing_submit
|
|
19
|
+
from toulligqc.common import is_numpy_1_24
|
|
19
20
|
from toulligqc import plotly_graph_generator as pgg
|
|
20
21
|
|
|
21
22
|
|
|
@@ -226,7 +227,7 @@ class fastqExtractor:
|
|
|
226
227
|
|
|
227
228
|
fq_data['sequence_length'] = fq_data['sequence_length'].astype(np.uint32)
|
|
228
229
|
fq_data['mean_qscore'] = fq_data['mean_qscore'].astype(np.float32)
|
|
229
|
-
fq_data['passes_filtering'] = fq_data['passes_filtering'].astype(np.bool)
|
|
230
|
+
fq_data['passes_filtering'] = fq_data['passes_filtering'].astype(np.bool_ if is_numpy_1_24 else np.bool)
|
|
230
231
|
|
|
231
232
|
if self.rich:
|
|
232
233
|
fq_data["start_time"] = fq_data["start_time"] - fq_data["start_time"].min()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '2.5.4'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: toulligqc
|
|
3
|
-
Version: 2.5.2
|
|
3
|
+
Version: 2.5.4
|
|
4
4
|
Summary: A post sequencing QC tool for Oxford Nanopore sequencers
|
|
5
5
|
Home-page: https://github.com/GenomicParisCentre/toulligQC
|
|
6
6
|
Author: Genomic Paris Centre team
|
|
@@ -15,8 +15,8 @@ Classifier: Intended Audience :: Science/Research
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
16
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
17
17
|
Classifier: License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.
|
|
19
|
-
Requires-Python: >=3.
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Requires-Python: >=3.11.0
|
|
20
20
|
License-File: LICENSE-CeCILL.txt
|
|
21
21
|
License-File: LICENSE.txt
|
|
22
22
|
License-File: AUTHORS
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = '2.5.2'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|