toulligqc 2.2.3__tar.gz → 2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {toulligqc-2.2.3/toulligqc.egg-info → toulligqc-2.3}/PKG-INFO +1 -1
  2. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/common.py +8 -0
  3. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/plotly_graph_common.py +235 -46
  4. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/plotly_graph_generator.py +10 -8
  5. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/plotly_graph_onedsquare_generator.py +9 -7
  6. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/sequencing_summary_common.py +74 -0
  7. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/sequencing_summary_extractor.py +3 -4
  8. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/sequencing_summary_onedsquare_extractor.py +4 -3
  9. toulligqc-2.3/toulligqc/version.py +1 -0
  10. {toulligqc-2.2.3 → toulligqc-2.3/toulligqc.egg-info}/PKG-INFO +1 -1
  11. toulligqc-2.2.3/toulligqc/version.py +0 -1
  12. {toulligqc-2.2.3 → toulligqc-2.3}/MANIFEST.in +0 -0
  13. {toulligqc-2.2.3 → toulligqc-2.3}/README.md +0 -0
  14. {toulligqc-2.2.3 → toulligqc-2.3}/setup.cfg +0 -0
  15. {toulligqc-2.2.3 → toulligqc-2.3}/setup.py +0 -0
  16. {toulligqc-2.2.3 → toulligqc-2.3}/test/test_sequencing_summary_extractor.py +0 -0
  17. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/__init__.py +0 -0
  18. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/configuration.py +0 -0
  19. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/fast5_extractor.py +0 -0
  20. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/html_report_generator.py +0 -0
  21. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/report_data_file_generator.py +0 -0
  22. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/resources/plotly-latest.min.js +0 -0
  23. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/resources/toulligqc.css +0 -0
  24. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/resources/toulligqc.png +0 -0
  25. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/sequencing_telemetry_extractor.py +0 -0
  26. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/toulligqc.py +0 -0
  27. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc/toulligqc_info_extractor.py +0 -0
  28. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/SOURCES.txt +0 -0
  29. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/dependency_links.txt +0 -0
  30. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/entry_points.txt +0 -0
  31. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/not-zip-safe +0 -0
  32. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/requires.txt +0 -0
  33. {toulligqc-2.2.3 → toulligqc-2.3}/toulligqc.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.2
2
2
  Name: toulligqc
3
- Version: 2.2.3
3
+ Version: 2.3
4
4
  Summary: A post sequencing QC tool for Oxford Nanopore sequencers
5
5
  Home-page: https://github.com/GenomicParisCentre/toulligQC
6
6
  Author: Genomic Paris Centre team
@@ -18,6 +18,14 @@
18
18
  # Maintainer: Laurent Jourdren
19
19
  # Since version 2.2
20
20
 
21
+ import numpy as np
22
+ from packaging import version
23
+
24
def is_numpy_1_24() -> bool:
    """
    Tell whether the installed NumPy version is 1.20 or later.

    Despite the "1_24" in the function name, the threshold compared against is 1.20.
    NOTE(review): presumably 1.20 is used because that release deprecated the ``np.bool``
    alias (which NumPy 1.24 later removed) -- confirm and consider renaming.

    This is a function and must be called (``is_numpy_1_24()``): the bare function
    object is always truthy, whatever the installed NumPy version is.

    :return: True if ``np.__version__`` is greater than or equal to 1.20, False otherwise
    """
    return version.parse(np.__version__) >= version.parse("1.20")
21
29
 
22
30
  def format_duration(t):
23
31
  """
@@ -535,7 +535,9 @@ def _barcode_boxplot_graph(graph_name, df, barcode_selection, pass_color, fail_c
535
535
 
536
536
 
537
537
  def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, result_directory):
538
- labels = count_sorted.index.values.tolist()
538
+ read_count_sorted = count_sorted[0]
539
+ base_count_sorted = count_sorted[1]
540
+ labels = read_count_sorted.index.values.tolist()
539
541
 
540
542
  fig = go.Figure()
541
543
 
@@ -546,9 +548,9 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
546
548
  pie_marker = dict(line=dict(width=line_width, color='#808080'))
547
549
  bar_colors = color_palette[0]
548
550
 
549
- # Pie chart
551
+ # reads Pie chart
550
552
  fig.add_trace(go.Pie(labels=labels,
551
- values=count_sorted,
553
+ values=read_count_sorted,
552
554
  hoverinfo='label+percent',
553
555
  textinfo='percent',
554
556
  textfont_size=14,
@@ -557,9 +559,29 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
557
559
  hovertemplate='<b>%{label}</b><br>%{percent:.1%} (%{value:,})<extra></extra>',
558
560
  visible=True
559
561
  ))
560
- # Histogram
562
+ # Bases Pie chart
563
+ fig.add_trace(go.Pie(labels=labels,
564
+ values=base_count_sorted,
565
+ hoverinfo='label+percent',
566
+ textinfo='percent',
567
+ textfont_size=14,
568
+ marker=pie_marker,
569
+ textposition='inside',
570
+ hovertemplate='<b>%{label}</b><br>%{percent:.1%} (%{value:,})<extra></extra>',
571
+ visible=True
572
+ ))
573
+ # Reads Histogram
574
+ fig.add_trace(go.Bar(x=labels,
575
+ y=read_count_sorted,
576
+ marker_color=bar_colors,
577
+ marker_line_color='gray',
578
+ marker_line_width=line_width,
579
+ hovertemplate='<b>%{x}</b><br>%{y:,}<extra></extra>',
580
+ visible=False
581
+ ))
582
+ # Bases Histogram
561
583
  fig.add_trace(go.Bar(x=labels,
562
- y=count_sorted,
584
+ y=base_count_sorted,
563
585
  marker_color=bar_colors,
564
586
  marker_line_color='gray',
565
587
  marker_line_width=line_width,
@@ -571,7 +593,7 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
571
593
  fig.update_layout(
572
594
  **_title(graph_name),
573
595
  **default_graph_layout,
574
- **_legend('Barcodes'),
596
+ **_legend('Barcodes',args=dict(y=0.75)),
575
597
  uniformtext_minsize=12,
576
598
  uniformtext_mode='hide',
577
599
  xaxis={'visible': False},
@@ -584,32 +606,48 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
584
606
  updatemenus=[
585
607
  dict(
586
608
  type="buttons",
587
- direction="left",
609
+ direction="down",
588
610
  buttons=list([
589
611
  dict(
590
- args=[{'visible': [True, False]},
612
+ args=[{'visible': [False, True, False, False]},
591
613
  {'xaxis': {'visible': False},
592
614
  'yaxis': {'visible': False},
593
615
  'plot_bgcolor': 'white'}],
594
- label="Pie chart",
616
+ label="Reads Pie chart",
595
617
  method="update"
596
618
  ),
597
619
  dict(
598
- args=[{'visible': [False, True]},
620
+ args=[{'visible': [False, False, False, True]},
621
+ {**_xaxis('Barcodes', dict(visible=True)),
622
+ **_yaxis('Base count', dict(visible=True)),
623
+ 'plot_bgcolor': plotly_background_color}],
624
+ label="Reads Histogram",
625
+ method="update"
626
+ ),
627
+ dict(
628
+ args=[{'visible': [True, False, False, False]},
629
+ {'xaxis': {'visible': False},
630
+ 'yaxis': {'visible': False},
631
+ 'plot_bgcolor': 'white'}],
632
+ label="Bases Pie chart",
633
+ method="update"
634
+ ),
635
+ dict(
636
+ args=[{'visible': [False, False, True, False]},
599
637
  {**_xaxis('Barcodes', dict(visible=True)),
600
638
  **_yaxis('Read count', dict(visible=True)),
601
639
  'plot_bgcolor': plotly_background_color}],
602
- label="Histogram",
640
+ label="Bases Histogram",
603
641
  method="update"
604
642
  )
605
643
  ]),
606
- pad={"r": 20, "t": 20, "l": 20, "b": 20},
644
+ pad={"r": 20, "t": 20, "l": 40, "b": 20},
607
645
  showactive=True,
608
646
  x=1.0,
609
647
  xanchor="left",
610
648
  y=1.25,
611
649
  yanchor="top"
612
- ),
650
+ )
613
651
  ]
614
652
  )
615
653
 
@@ -618,11 +656,13 @@ def _pie_chart_graph(graph_name, count_sorted, color_palette, one_d_square, resu
618
656
  else:
619
657
  count_col_name = 'Read count'
620
658
 
621
- barcode_table = pd.DataFrame({"Barcode arrangement (%)": count_sorted / sum(count_sorted) * 100,
622
- count_col_name: count_sorted})
659
+ barcode_table = pd.DataFrame({"Barcode arrangement (%)": read_count_sorted / sum(read_count_sorted) * 100,
660
+ count_col_name: read_count_sorted,
661
+ "Base count": base_count_sorted})
623
662
  barcode_table.sort_index(inplace=True)
624
663
  pd.options.display.float_format = percent_format_str.format
625
664
  barcode_table[count_col_name] = barcode_table[count_col_name].astype(int).apply(lambda x: _format_int(x))
665
+ barcode_table["Base count"] = barcode_table["Base count"].astype(int).apply(lambda x: _format_int(x))
626
666
  table_html = _dataFrame_to_html(barcode_table)
627
667
 
628
668
  div, output_file = _create_and_save_div(fig, result_directory, graph_name)
@@ -987,53 +1027,202 @@ def _quality_multiboxplot(graph_name, result_directory, df, onedsquare=False):
987
1027
  return graph_name, output_file, table_html, div
988
1028
 
989
1029
 
990
- def _scatterplot(graph_name, dataframe_dict, result_directory, onedsquare=False):
1030
def _twod_density_char(graph_name, dataframe_dict, result_directory, onedsquare=False):
    """
    Plot the 2D density (contour) of the mean PHRED score against the sequence length,
    with a marginal histogram on each axis. Three views (all, pass and fail reads) are
    built; only the "all reads" traces are visible at first and buttons switch views.
    :param graph_name: title of the graph (prefixed with '1D² ' when onedsquare is True)
    :param dataframe_dict: dictionary providing the '{all,pass,fail}.reads.sequence.length'
        and '{all,pass,fail}.reads.mean.qscore' entries (indexable by label, with an .index)
    :param result_directory: directory handed to _create_and_save_div
    :param onedsquare: True when plotting 1D² data
    :return: tuple (graph_name, output_file, table_html, div); table_html is always None
    """
    read_pass_length = dataframe_dict["pass.reads.sequence.length"]
    read_pass_qscore = dataframe_dict["pass.reads.mean.qscore"]
    read_fail_length = dataframe_dict["fail.reads.sequence.length"]
    read_fail_qscore = dataframe_dict["fail.reads.mean.qscore"]

    all_length = dataframe_dict['all.reads.sequence.length']
    all_qscore = dataframe_dict['all.reads.mean.qscore']

    prefix = '1D² ' if onedsquare else ''
    graph_name = prefix + graph_name

    # Maximum number of reads drawn for each read category
    npoint = 50000

    def _sample_index(series):
        # Random subset (without replacement) of the index labels, capped at npoint
        return np.random.choice(series.index, min(npoint, len(series)), replace=False)

    def _contour(lengths, qscores, idx, mid_color, end_color, **extra):
        # 2D density trace on the main (x, y) axes
        return go.Histogram2dContour(
            x=lengths[idx],
            y=qscores[idx],
            colorscale=[[0, 'white'], [0.5, mid_color], [1.0, end_color]],
            reversescale=False,
            xaxis='x',
            yaxis='y',
            colorbar=dict(title='<b>Legend</b>', len=0.4),
            **extra
        )

    def _marginal(color, **extra):
        # Marginal histogram; the caller selects the secondary axis (x2 or y2)
        return go.Histogram(marker=dict(color=color), **extra)

    def _upper_bound(series_pair, percentile):
        # Highest percentile value among the non-empty series, None when all are empty
        # (np.percentile cannot be computed on an empty series)
        bounds = [np.percentile(series, percentile) for series in series_pair if len(series) > 0]
        return max(bounds) if bounds else None

    fig = go.Figure()

    idx_pass = _sample_index(read_pass_length)
    idx_fail = _sample_index(read_fail_length)
    idx_all = _sample_index(all_length)

    pass_color = toulligqc_colors['pass']
    fail_color = toulligqc_colors['fail']
    all_color = toulligqc_colors['all']

    # Traces 0-2: density contours (all, pass, fail)
    fig.add_trace(_contour(all_length, all_qscore, idx_all, 'khaki', all_color))
    fig.add_trace(_contour(read_pass_length, read_pass_qscore, idx_pass, 'honeydew', pass_color,
                           visible=False))
    fig.add_trace(_contour(read_fail_length, read_fail_qscore, idx_fail, 'coral', fail_color,
                           visible=False))

    # Trim the axes to the 99th (length) and 99.8th (qscore) percentiles of pass/fail reads
    max_x_range = _upper_bound((read_pass_length, read_fail_length), 99)
    max_y_range = _upper_bound((read_pass_qscore, read_fail_qscore), 99.8)
    if max_x_range is not None:
        fig.update_xaxes(range=[0, max_x_range])
    if max_y_range is not None:
        fig.update_yaxes(range=[0, max_y_range])

    # Traces 3-8: marginal histograms, (qscore, length) pairs for all, pass, fail
    fig.add_trace(_marginal(all_color, y=all_qscore[idx_all], xaxis='x2', opacity=0.5))
    fig.add_trace(_marginal(all_color, x=all_length[idx_all], yaxis='y2', opacity=0.5))
    fig.add_trace(_marginal(pass_color, y=read_pass_qscore[idx_pass], xaxis='x2', visible=False))
    fig.add_trace(_marginal(pass_color, x=read_pass_length[idx_pass], yaxis='y2', visible=False))
    fig.add_trace(_marginal(fail_color, y=read_fail_qscore[idx_fail], xaxis='x2', visible=False))
    fig.add_trace(_marginal(fail_color, x=read_fail_length[idx_fail], yaxis='y2', visible=False))

    main_domain = [0, 0.85]
    margin_domain = [0.85, 1]

    fig.update_layout(
        autosize=False,
        xaxis=dict(zeroline=False, domain=main_domain, showgrid=False,
                   title='<b>Sequence length</b>'),
        yaxis=dict(zeroline=False, domain=main_domain, showgrid=False,
                   title='<b>PHRED score</b>'),
        xaxis2=dict(zeroline=False, domain=margin_domain, showgrid=False, visible=False),
        yaxis2=dict(zeroline=False, domain=margin_domain, showgrid=False, visible=False),
        height=600,
        width=1000,
        bargap=0,
        paper_bgcolor="#FFFFFF",
        plot_bgcolor="#FFFFFF",
        hovermode='closest',
        showlegend=False,
        **_title(graph_name),
    )

    # Add buttons. Each button carries a trace update (visibility) AND a layout update
    # (hovermode), which is the argument shape of the "update" method; with "restyle"
    # the second argument is read as trace indices and the layout change is never applied.
    views = [
        ("all reads", [True, False, False, True, True, False, False, False, False], False),
        ("Pass reads", [False, True, False, False, False, True, True, False, False], 'x'),
        ("Fail reads", [False, False, True, False, False, False, False, True, True], False),
    ]
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="down",
                buttons=[
                    dict(
                        args=[{'visible': visibility}, {'hovermode': hovermode}],
                        label=label,
                        method="update"
                    )
                    for label, visibility, hovermode in views
                ],
                pad={"r": 20, "t": 20, "l": 20, "b": 20},
                showactive=True,
                x=1.0,
                xanchor="left",
                y=1.25,
                yanchor="top"
            ),
        ]
    )

    table_html = None
    div, output_file = _create_and_save_div(fig, result_directory, graph_name)

    return graph_name, output_file, table_html, div
1038
1227
 
1039
1228
 
@@ -36,7 +36,7 @@ from toulligqc.plotly_graph_common import _phred_score_density
36
36
  from toulligqc.plotly_graph_common import _pie_chart_graph
37
37
  from toulligqc.plotly_graph_common import _quality_multiboxplot
38
38
  from toulligqc.plotly_graph_common import _read_length_distribution
39
- from toulligqc.plotly_graph_common import _scatterplot
39
+ from toulligqc.plotly_graph_common import _twod_density_char
40
40
  from toulligqc.plotly_graph_common import _smooth_data
41
41
  from toulligqc.plotly_graph_common import _title
42
42
  from toulligqc.plotly_graph_common import _transparent_colors
@@ -366,14 +366,14 @@ def allphred_score_frequency(dataframe_dict, result_directory):
366
366
  result_directory=result_directory)
367
367
 
368
368
 
369
- def all_scatterplot(dataframe_dict, result_directory):
369
def twod_density(dataframe_dict, result_directory):
    """
    Plot the 2D density graph representing the relation between the phred score and the sequence length
    :param dataframe_dict: dictionary of read data, passed unchanged to _twod_density_char
    :param result_directory: output directory, passed unchanged to _twod_density_char
    :return: the value returned by _twod_density_char
    """

    graph_name = "Correlation between read length and PHRED score"

    return _twod_density_char(graph_name, dataframe_dict, result_directory)
377
377
 
378
378
 
379
379
  def _compute_channel_map(df):
@@ -604,10 +604,11 @@ def barcode_percentage_pie_chart_pass(dataframe_dict, barcode_selection, result_
604
604
 
605
605
  graph_name = "Pass barcoded reads distribution"
606
606
 
607
- count_sorted = dataframe_dict["read.pass.barcoded"]
607
+ read_count_sorted = dataframe_dict["read.pass.barcoded"]
608
+ base_count_sorted = dataframe_dict["base.pass.barcoded"]
608
609
 
609
610
  return _pie_chart_graph(graph_name=graph_name,
610
- count_sorted=count_sorted,
611
+ count_sorted=[read_count_sorted, base_count_sorted],
611
612
  color_palette=toulligqc_colors['pie_chart_palette'],
612
613
  one_d_square=False,
613
614
  result_directory=result_directory)
@@ -621,10 +622,11 @@ def barcode_percentage_pie_chart_fail(dataframe_dict, barcode_selection, result_
621
622
 
622
623
  graph_name = "Fail barcoded reads distribution"
623
624
 
624
- count_sorted = dataframe_dict["read.fail.barcoded"]
625
+ read_count_sorted = dataframe_dict["read.fail.barcoded"]
626
+ base_count_sorted = dataframe_dict["base.fail.barcoded"]
625
627
 
626
628
  return _pie_chart_graph(graph_name=graph_name,
627
- count_sorted=count_sorted,
629
+ count_sorted=[read_count_sorted, base_count_sorted],
628
630
  color_palette=toulligqc_colors['pie_chart_palette'],
629
631
  one_d_square=False,
630
632
  result_directory=result_directory)
@@ -34,7 +34,7 @@ from toulligqc.plotly_graph_common import _phred_score_density
34
34
  from toulligqc.plotly_graph_common import _pie_chart_graph
35
35
  from toulligqc.plotly_graph_common import _quality_multiboxplot
36
36
  from toulligqc.plotly_graph_common import _read_length_distribution
37
- from toulligqc.plotly_graph_common import _scatterplot
37
+ from toulligqc.plotly_graph_common import _twod_density_char
38
38
  from toulligqc.plotly_graph_common import _title
39
39
  from toulligqc.plotly_graph_common import _transparent_colors
40
40
  from toulligqc.plotly_graph_common import _xaxis
@@ -207,14 +207,14 @@ def dsqr_allphred_score_frequency(result_dict, dataframe_dict_1dsqr, result_dire
207
207
  result_directory=result_directory)
208
208
 
209
209
 
210
- def scatterplot_1dsqr(dataframe_dict_1dsqr, result_directory):
210
def twod_density(dataframe_dict, result_directory):
    """
    Plot the 2D density graph representing the relation between the phred score and the sequence length
    of 1D² reads
    :param dataframe_dict: dictionary of 1D² read data, passed unchanged to _twod_density_char
    :param result_directory: output directory, passed unchanged to _twod_density_char
    :return: the value returned by _twod_density_char
    """

    # NOTE(review): the name below already contains "1D²"; if _twod_density_char also
    # prepends a "1D² " prefix when onedsquare is True, the title shows it twice -- confirm.
    graph_name = "Correlation between 1D² read length and PHRED score"

    return _twod_density_char(graph_name, dataframe_dict, result_directory, onedsquare=True)
218
218
 
219
219
 
220
220
  #
@@ -228,10 +228,11 @@ def barcode_percentage_pie_chart_1dsqr_pass(dataframe_dict_1dsqr, barcode_select
228
228
 
229
229
  graph_name = "1D² read pass barcode distribution"
230
230
 
231
- count_sorted = dataframe_dict_1dsqr["read.pass.barcoded"]
231
+ read_count_sorted = dataframe_dict_1dsqr["read.pass.barcoded"]
232
+ base_count_sorted = dataframe_dict_1dsqr["base.pass.barcoded"]
232
233
 
233
234
  return _pie_chart_graph(graph_name=graph_name,
234
- count_sorted=count_sorted,
235
+ count_sorted=[read_count_sorted, base_count_sorted],
235
236
  color_palette=toulligqc_colors['pie_chart_palette'],
236
237
  one_d_square=True,
237
238
  result_directory=result_directory)
@@ -245,10 +246,11 @@ def barcode_percentage_pie_chart_1dsqr_fail(dataframe_dict_1dsqr, barcode_select
245
246
 
246
247
  graph_name = "1D² read fail barcode distribution"
247
248
 
248
- count_sorted = dataframe_dict_1dsqr["read.fail.barcoded"]
249
+ read_count_sorted = dataframe_dict_1dsqr["read.fail.barcoded"]
250
+ base_count_sorted = dataframe_dict_1dsqr["base.fail.barcoded"]
249
251
 
250
252
  return _pie_chart_graph(graph_name=graph_name,
251
- count_sorted=count_sorted,
253
+ count_sorted=[read_count_sorted, base_count_sorted],
252
254
  color_palette=toulligqc_colors['pie_chart_palette'],
253
255
  one_d_square=True,
254
256
  result_directory=result_directory)
@@ -98,6 +98,17 @@ def series_cols_boolean_elements(dataframe, column_name1: str, column_name2: str
98
98
  return dataframe[column_name1].loc[dataframe[column_name2] == bool(boolean)]
99
99
 
100
100
 
101
def df_cols_boolean_elements(dataframe, column_name1: list, column_name2: str, boolean: bool) -> pd.DataFrame:
    """
    Returns a Panda's DataFrame restricted to some columns and to the rows where another column equals a boolean
    :param dataframe: dataframe to filter
    :param column_name1: list of the names of the columns to keep
    :param column_name2: name of the column compared with the boolean
    :param boolean: value (converted with bool()) that column_name2 must equal for a row to be kept
    :return: DataFrame with the column_name1 columns of the matching rows
    """
    row_mask = dataframe[column_name2] == bool(boolean)
    return dataframe[column_name1].loc[row_mask]
110
+
111
+
101
112
  def sorted_series_boolean_elements_divided(dataframe, column_name1: str, column_name2: str, boolean: bool,
102
113
  denominator: int):
103
114
  """
@@ -130,6 +141,22 @@ def extract_barcode_info(extractor, result_dict, barcode_selection, dataframe_di
130
141
  if element not in barcodes_found and element != 'other barcodes':
131
142
  sys.stderr.write("Warning: The barcode {} doesn't exist in input data\n".format(element))
132
143
 
144
+
145
+ # Get barcodes frequency by Bases
146
+ df_base_pass_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
147
+ "passes_filtering", True)
148
+
149
+ dataframe_dict["base.pass.barcoded"] = _barcode_bases(extractor, barcode_selection, result_dict,
150
+ "base.pass.barcoded",
151
+ df_base_pass_barcode)
152
+
153
+ df_base_fail_barcode = series_cols_boolean_elements(df, ["barcode_arrangement", "sequence_length"],
154
+ "passes_filtering", False)
155
+
156
+ dataframe_dict["base.fail.barcoded"] = _barcode_bases(extractor, barcode_selection, result_dict,
157
+ "base.fail.barcoded",
158
+ df_base_fail_barcode)
159
+
133
160
  # Get barcodes frequency by read type
134
161
  series_read_pass_barcode = series_cols_boolean_elements(df, "barcode_arrangement",
135
162
  "passes_filtering", True)
@@ -294,6 +321,53 @@ def _barcode_frequency(extractor, barcode_selection, result_dict, entry: str, df
294
321
  return count_sorted
295
322
 
296
323
 
324
def _barcode_bases(extractor, barcode_selection, result_dict, entry: str, df_filtered) -> pd.Series:
    """
    Sum the bases (sequence lengths) by barcode for the barcodes of barcode_selection,
    regroup the bases of every other barcode under the index "other barcodes",
    and store the counts and the frequency of each barcode in the result dictionary.
    Stored keys: entry + '.count' (selected barcodes without 'unclassified'),
    entry with '.barcoded' replaced by '.non.used.barcodes.count', and, for each barcode,
    entry with '.barcoded' replaced by '.' followed by barcode + '.frequency'.
    :param extractor: extractor passed to set_result_value
    :param barcode_selection: barcodes to report individually
    :param result_dict: result dictionary with statistics
    :param entry: entry about barcoded counts (i.e. base.pass.barcoded)
    :param df_filtered: dataframe with the 'barcode_arrangement' and 'sequence_length' columns
    :return: Series of base counts indexed by barcode (selected barcodes and "other barcodes")
    """
    # Regroup all barcoded and sum all read lengths in df
    all_barcode_count = df_filtered.groupby('barcode_arrangement')['sequence_length'].sum()

    # Retain only existing barcodes from barcode_selection list
    barcodes_found = set(df_filtered['barcode_arrangement'].unique())
    barcode_selection_existing = [x for x in barcode_selection if x in barcodes_found]

    # Sort by list of barcode_selection
    count_sorted = all_barcode_count.sort_index()[barcode_selection_existing]
    # Replace all NaN values to zero (no in-place update and no deprecated 'downcast' argument)
    count_sorted = count_sorted.fillna(0)

    # Compute sum of all used barcodes without barcode 'unclassified'
    # ('unclassified' may be absent from this subset of the data, so do not fail on it)
    set_result_value(extractor, result_dict, entry + '.count',
                     sum(count_sorted.drop("unclassified", errors="ignore")))

    # Replace entry name ie base.pass/fail.barcoded with base.pass/fail.non.used.barcodes.count
    non_used_barcodes_count_key = entry.replace(".barcoded", ".non.used.barcodes.count")

    # Compute all bases of barcodes that are not in the barcode_selection list
    other_barcode_count = sum(all_barcode_count) - sum(count_sorted)
    set_result_value(extractor, result_dict, non_used_barcodes_count_key, other_barcode_count)

    # Create Series for all non-used barcode counts and rename index array with "other"
    other_all_barcode_count = pd.Series(other_barcode_count, index=['other barcodes'])

    # Append Series of non-used barcode counts to the Series of barcode_selection counts
    # (pd.concat replaces Series.append, which no longer exists in pandas 2)
    count_sorted = pd.concat([count_sorted, other_all_barcode_count]).sort_index()

    # Compute frequency for all barcode counts and save into result_dict
    total_count = sum(count_sorted)
    frequency_key_prefix = entry.replace(".barcoded", ".")
    for barcode, base_count in count_sorted.items():
        frequency_value = base_count * 100 / total_count
        set_result_value(extractor, result_dict, frequency_key_prefix + barcode + ".frequency",
                         frequency_value)

    return count_sorted
369
+
370
+
297
371
  def log_task(quiet, msg, start_time, end_time):
298
372
  if not quiet:
299
373
  delta = end_time - start_time
@@ -42,7 +42,7 @@ from toulligqc.sequencing_summary_common import set_result_value
42
42
  from toulligqc.sequencing_summary_common import log_task
43
43
  from toulligqc.sequencing_summary_common import add_image_to_result
44
44
  from toulligqc.sequencing_summary_common import read_first_line_file
45
-
45
+ from toulligqc.common import is_numpy_1_24
46
46
 
47
47
  class SequencingSummaryExtractor:
48
48
  """
@@ -280,8 +280,7 @@ class SequencingSummaryExtractor:
280
280
  add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
281
281
  add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
282
282
  add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
283
-
284
- add_image_to_result(self.quiet, images, time.time(), pgg.all_scatterplot(self.dataframe_dict, self.images_directory))
283
+ add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
285
284
  add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
286
285
  add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
287
286
  add_image_to_result(self.quiet, images, time.time(), pgg.speed_over_time(self.dataframe_dict, self.images_directory))
@@ -345,7 +344,7 @@ class SequencingSummaryExtractor:
345
344
  sequencing_summary_datatypes = {
346
345
  'channel': np.int16,
347
346
  'start_time': np.float64,
348
- 'passes_filtering': np.bool,
347
+ 'passes_filtering': np.bool_ if is_numpy_1_24 else np.bool,
349
348
  'sequence_length_template': np.uint32,
350
349
  'mean_qscore_template': np.float32,
351
350
  'duration': np.float32}
@@ -44,6 +44,7 @@ from toulligqc.sequencing_summary_common import log_task
44
44
  from toulligqc.sequencing_summary_common import add_image_to_result
45
45
  from toulligqc.sequencing_summary_common import read_first_line_file
46
46
  from toulligqc.sequencing_summary_extractor import SequencingSummaryExtractor as SSE
47
+ from toulligqc.common import is_numpy_1_24
47
48
 
48
49
 
49
50
  class OneDSquareSequencingSummaryExtractor(SSE):
@@ -285,8 +286,8 @@ class OneDSquareSequencingSummaryExtractor(SSE):
285
286
  add_image_to_result(self.quiet, images, time.time(), pgg2.dsqr_read_quality_multiboxplot(result_dict, self.dataframe_dict_1dsqr, self.images_directory))
286
287
  add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
287
288
  add_image_to_result(self.quiet, images, time.time(), pgg2.dsqr_allphred_score_frequency(result_dict, self.dataframe_dict_1dsqr, self.images_directory))
288
- add_image_to_result(self.quiet, images, time.time(), pgg.all_scatterplot(self.dataframe_dict, self.images_directory))
289
- add_image_to_result(self.quiet, images, time.time(), pgg2.scatterplot_1dsqr(self.dataframe_dict_1dsqr, self.images_directory))
289
+ add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
290
+ add_image_to_result(self.quiet, images, time.time(), pgg2.twod_density(self.dataframe_dict_1dsqr, self.images_directory))
290
291
  add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.sse.dataframe_1d, self.images_directory))
291
292
  add_image_to_result(self.quiet, images, time.time(), pgg2.sequence_length_over_time_dsqr(self.dataframe_dict_1dsqr, self.images_directory))
292
293
  add_image_to_result(self.quiet, images, time.time(), pgg2.phred_score_over_time_dsqr(result_dict, self.dataframe_dict_1dsqr, self.images_directory))
@@ -346,7 +347,7 @@ class OneDSquareSequencingSummaryExtractor(SSE):
346
347
  ]
347
348
 
348
349
  sequencing_summary_datatypes = {
349
- 'passes_filtering': np.bool,
350
+ 'passes_filtering': np.bool_ if is_numpy_1_24 else np.bool,
350
351
  'sequence_length': np.uint32,
351
352
  'mean_qscore': np.float32,
352
353
  'start_time1': np.float64,
@@ -0,0 +1 @@
1
+ __version__ = '2.3'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 1.2
2
2
  Name: toulligqc
3
- Version: 2.2.3
3
+ Version: 2.3
4
4
  Summary: A post sequencing QC tool for Oxford Nanopore sequencers
5
5
  Home-page: https://github.com/GenomicParisCentre/toulligQC
6
6
  Author: Genomic Paris Centre team
@@ -1 +0,0 @@
1
- __version__ = '2.2.3'
File without changes
File without changes
File without changes
File without changes
File without changes