uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uht_tooling/cli.py +153 -4
- uht_tooling/config.py +137 -0
- uht_tooling/tools.py +143 -0
- uht_tooling/workflows/gui.py +19 -0
- uht_tooling/workflows/mut_rate.py +484 -124
- uht_tooling/workflows/mutation_caller.py +11 -2
- uht_tooling/workflows/umi_hunter.py +9 -4
- {uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/METADATA +123 -5
- uht_tooling-0.3.0.dist-info/RECORD +20 -0
- uht_tooling-0.1.9.dist-info/RECORD +0 -18
- {uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/WHEEL +0 -0
- {uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/entry_points.txt +0 -0
- {uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -16,6 +16,7 @@ import math
|
|
|
16
16
|
import tempfile
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
|
19
|
+
from tqdm import tqdm
|
|
19
20
|
|
|
20
21
|
# Use a built-in Matplotlib style ("ggplot") for consistency
|
|
21
22
|
plt.style.use("ggplot")
|
|
@@ -219,7 +220,12 @@ def compute_mismatch_stats_sam(sam_file, refs_dict):
|
|
|
219
220
|
|
|
220
221
|
logging.info(f"Computing mismatch stats for {sam_file}")
|
|
221
222
|
samfile = pysam.AlignmentFile(sam_file, "r")
|
|
222
|
-
for
|
|
223
|
+
# Count total aligned reads for progress bar
|
|
224
|
+
total_reads = sum(1 for _ in samfile.fetch())
|
|
225
|
+
samfile.close()
|
|
226
|
+
|
|
227
|
+
samfile = pysam.AlignmentFile(sam_file, "r")
|
|
228
|
+
for read in tqdm(samfile.fetch(), desc="Computing mismatch stats", total=total_reads, unit="read"):
|
|
223
229
|
if read.is_unmapped or read.query_sequence is None:
|
|
224
230
|
continue
|
|
225
231
|
ref_name = samfile.get_reference_name(read.reference_id)
|
|
@@ -743,40 +749,50 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
|
|
|
743
749
|
|
|
744
750
|
plt.tight_layout()
|
|
745
751
|
|
|
746
|
-
#
|
|
752
|
+
# Create detailed/qc_plots/ subdirectory for QC plots
|
|
753
|
+
detailed_dir = os.path.join(results_dir, "detailed")
|
|
754
|
+
qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
|
|
755
|
+
os.makedirs(qc_plots_dir, exist_ok=True)
|
|
756
|
+
|
|
757
|
+
# Save the plot to detailed/qc_plots/
|
|
747
758
|
project_name = os.path.basename(results_dir)
|
|
748
|
-
qc_plot_path = os.path.join(
|
|
759
|
+
qc_plot_path = os.path.join(qc_plots_dir, f"qc_plot_{project_name}.png")
|
|
749
760
|
fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
|
|
750
761
|
plt.close(fig)
|
|
751
|
-
|
|
762
|
+
|
|
752
763
|
logging.info(f"QC plot saved to: {qc_plot_path}")
|
|
753
|
-
|
|
754
|
-
# Save data as CSV
|
|
755
|
-
qc_data_path = os.path.join(
|
|
764
|
+
|
|
765
|
+
# Save data as CSV to detailed/
|
|
766
|
+
qc_data_path = os.path.join(detailed_dir, "simple_qc_data.csv")
|
|
756
767
|
with open(qc_data_path, 'w') as f:
|
|
757
768
|
f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
|
|
758
769
|
f.write("total_mappable_bases,n_segments\n")
|
|
759
|
-
|
|
770
|
+
|
|
760
771
|
for q, r in zip(quality_thresholds, qc_results):
|
|
761
772
|
f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
|
|
762
773
|
f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
|
|
763
774
|
f.write(f"{r['total_mappable_bases']},{r['n_segments']}\n")
|
|
764
|
-
|
|
775
|
+
|
|
765
776
|
logging.info(f"Simple QC data saved to: {qc_data_path}")
|
|
766
|
-
|
|
777
|
+
|
|
767
778
|
except Exception as e:
|
|
768
779
|
logging.error(f"Error creating simple QC plots: {e}")
|
|
780
|
+
|
|
781
|
+
def create_comprehensive_qc_plots(quality_thresholds, qc_results, results_dir):
|
|
769
782
|
"""
|
|
770
783
|
Create comprehensive QC plots with error bars and uncertainty quantification.
|
|
771
|
-
|
|
784
|
+
|
|
772
785
|
Args:
|
|
773
786
|
quality_thresholds: List of quality score thresholds
|
|
774
787
|
qc_results: List of comprehensive analysis results
|
|
775
788
|
results_dir: Directory to save the plots
|
|
776
|
-
optimal_qscore: Optimal Q-score threshold (optional)
|
|
777
|
-
optimal_result: Optimal result data (optional)
|
|
778
789
|
"""
|
|
779
790
|
try:
|
|
791
|
+
# Create detailed/qc_plots/ subdirectory
|
|
792
|
+
detailed_dir = os.path.join(results_dir, "detailed")
|
|
793
|
+
qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
|
|
794
|
+
os.makedirs(qc_plots_dir, exist_ok=True)
|
|
795
|
+
|
|
780
796
|
# Extract data for plotting
|
|
781
797
|
aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
|
|
782
798
|
aa_errors = [r['std_aa_mutations'] for r in qc_results]
|
|
@@ -785,46 +801,46 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
|
|
|
785
801
|
mappable_bases = [r['mappable_bases'] for r in qc_results]
|
|
786
802
|
net_rates = [r['net_rate'] for r in qc_results]
|
|
787
803
|
net_rate_errors = [r['net_rate_error'] for r in qc_results]
|
|
788
|
-
|
|
804
|
+
|
|
789
805
|
# Create main QC plot with error bars
|
|
790
806
|
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))
|
|
791
|
-
|
|
807
|
+
|
|
792
808
|
# Top plot: AA mutations per gene with error bars
|
|
793
809
|
color1 = '#2E8B57'
|
|
794
|
-
ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
|
|
795
|
-
fmt='o', capsize=5, capthick=2, markersize=8,
|
|
810
|
+
ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
|
|
811
|
+
fmt='o', capsize=5, capthick=2, markersize=8,
|
|
796
812
|
color=color1, ecolor=color1, alpha=0.8, label='Mean ± Std')
|
|
797
|
-
|
|
813
|
+
|
|
798
814
|
# Add confidence intervals as shaded area
|
|
799
|
-
ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
|
|
815
|
+
ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
|
|
800
816
|
alpha=0.3, color=color1, label='95% Confidence Interval')
|
|
801
|
-
|
|
817
|
+
|
|
802
818
|
ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
|
|
803
819
|
ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
|
|
804
820
|
ax1.tick_params(axis='y', labelcolor=color1)
|
|
805
|
-
ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
|
|
821
|
+
ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
|
|
806
822
|
fontsize=14, fontweight='bold')
|
|
807
823
|
ax1.grid(True, alpha=0.3)
|
|
808
824
|
ax1.legend(frameon=False, fontsize=10)
|
|
809
|
-
|
|
825
|
+
|
|
810
826
|
# Add data point labels
|
|
811
827
|
for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
|
|
812
|
-
ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
|
|
813
|
-
(q, aa_mut), xytext=(5, 5),
|
|
828
|
+
ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
|
|
829
|
+
(q, aa_mut), xytext=(5, 5),
|
|
814
830
|
textcoords='offset points', fontsize=8, alpha=0.8, color=color1)
|
|
815
|
-
|
|
831
|
+
|
|
816
832
|
# Bottom plot: Mappable bases and AA mutations per gene
|
|
817
833
|
color2 = '#FF6B6B'
|
|
818
834
|
color3 = '#4169E1'
|
|
819
|
-
|
|
835
|
+
|
|
820
836
|
# Mappable bases (left y-axis)
|
|
821
837
|
ax2_twin = ax2.twinx()
|
|
822
|
-
ax2_twin.scatter(quality_thresholds, mappable_bases,
|
|
823
|
-
s=100, alpha=0.7, color=color2, edgecolors='black',
|
|
838
|
+
ax2_twin.scatter(quality_thresholds, mappable_bases,
|
|
839
|
+
s=100, alpha=0.7, color=color2, edgecolors='black',
|
|
824
840
|
linewidth=1, marker='s', label='Mappable Bases')
|
|
825
841
|
ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
|
|
826
842
|
ax2_twin.tick_params(axis='y', labelcolor=color2)
|
|
827
|
-
|
|
843
|
+
|
|
828
844
|
# AA mutations per gene with error bars (right y-axis)
|
|
829
845
|
ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
|
|
830
846
|
fmt='^', capsize=5, capthick=2, markersize=8,
|
|
@@ -832,34 +848,34 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
|
|
|
832
848
|
ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
|
|
833
849
|
ax2.tick_params(axis='y', labelcolor=color3)
|
|
834
850
|
ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
|
|
835
|
-
ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
|
|
851
|
+
ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
|
|
836
852
|
fontsize=14, fontweight='bold')
|
|
837
853
|
ax2.grid(True, alpha=0.3)
|
|
838
|
-
|
|
854
|
+
|
|
839
855
|
# Add legends
|
|
840
856
|
lines1, labels1 = ax2.get_legend_handles_labels()
|
|
841
857
|
lines2, labels2 = ax2_twin.get_legend_handles_labels()
|
|
842
858
|
ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
|
|
843
|
-
|
|
859
|
+
|
|
844
860
|
# Add data point labels for mappable bases
|
|
845
861
|
for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
|
|
846
|
-
ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
|
|
862
|
+
ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
|
|
847
863
|
textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
|
|
848
|
-
|
|
864
|
+
|
|
849
865
|
plt.tight_layout()
|
|
850
|
-
|
|
851
|
-
# Save the comprehensive plot
|
|
852
|
-
qc_plot_path = os.path.join(
|
|
866
|
+
|
|
867
|
+
# Save the comprehensive plot to detailed/qc_plots/
|
|
868
|
+
qc_plot_path = os.path.join(qc_plots_dir, "comprehensive_qc_analysis.png")
|
|
853
869
|
fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
|
|
854
870
|
plt.close(fig)
|
|
855
|
-
|
|
871
|
+
|
|
856
872
|
logging.info(f"Comprehensive QC plot saved to: {qc_plot_path}")
|
|
857
|
-
|
|
873
|
+
|
|
858
874
|
# Create error analysis plot
|
|
859
875
|
create_error_analysis_plot(quality_thresholds, qc_results, results_dir)
|
|
860
|
-
|
|
861
|
-
# Save comprehensive data as CSV
|
|
862
|
-
qc_data_path = os.path.join(
|
|
876
|
+
|
|
877
|
+
# Save comprehensive data as CSV to detailed/
|
|
878
|
+
qc_data_path = os.path.join(detailed_dir, "comprehensive_qc_data.csv")
|
|
863
879
|
with open(qc_data_path, 'w') as f:
|
|
864
880
|
f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
|
|
865
881
|
f.write("mappable_bases,hit_rate,hit_rate_ci_lower,hit_rate_ci_upper,")
|
|
@@ -869,7 +885,7 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
|
|
|
869
885
|
f.write("bg_qscore_mean,bg_qscore_std,bg_qscore_uncertainty,")
|
|
870
886
|
f.write("hit_weighted_rate,hit_weighted_error,bg_weighted_rate,bg_weighted_error,")
|
|
871
887
|
f.write("net_weighted_rate,net_weighted_error,lambda_bp_weighted,lambda_error_weighted\n")
|
|
872
|
-
|
|
888
|
+
|
|
873
889
|
for q, r in zip(quality_thresholds, qc_results):
|
|
874
890
|
f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
|
|
875
891
|
f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
|
|
@@ -878,93 +894,98 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
|
|
|
878
894
|
f.write(f"{r['bg_rate']:.6f},{r['bg_rate_ci'][0]:.6f},{r['bg_rate_ci'][1]:.6f},")
|
|
879
895
|
f.write(f"{r['net_rate']:.6f},{r['net_rate_error']:.6f},")
|
|
880
896
|
f.write(f"{r['lambda_bp']:.6f},{r['lambda_error']:.6f},{r['alignment_error']:.6f},")
|
|
881
|
-
|
|
897
|
+
|
|
882
898
|
# Q-score information
|
|
883
899
|
hit_qscore_mean = r['hit_qscore_stats']['mean_qscore'] if r['hit_qscore_stats'] else 0.0
|
|
884
900
|
hit_qscore_std = r['hit_qscore_stats']['std_qscore'] if r['hit_qscore_stats'] else 0.0
|
|
885
901
|
bg_qscore_mean = r['bg_qscore_stats']['mean_qscore'] if r['bg_qscore_stats'] else 0.0
|
|
886
902
|
bg_qscore_std = r['bg_qscore_stats']['std_qscore'] if r['bg_qscore_stats'] else 0.0
|
|
887
|
-
|
|
903
|
+
|
|
888
904
|
f.write(f"{hit_qscore_mean:.2f},{hit_qscore_std:.2f},{r['hit_qscore_uncertainty']:.6f},")
|
|
889
905
|
f.write(f"{bg_qscore_mean:.2f},{bg_qscore_std:.2f},{r['bg_qscore_uncertainty']:.6f},")
|
|
890
906
|
f.write(f"{r.get('hit_weighted_rate', 0.0):.6f},{r.get('hit_weighted_error', 0.0):.6f},")
|
|
891
907
|
f.write(f"{r.get('bg_weighted_rate', 0.0):.6f},{r.get('bg_weighted_error', 0.0):.6f},")
|
|
892
908
|
f.write(f"{r.get('net_weighted_rate', 0.0):.6f},{r.get('net_weighted_error', 0.0):.6f},")
|
|
893
909
|
f.write(f"{r.get('lambda_bp_weighted', 0.0):.6f},{r.get('lambda_error_weighted', 0.0):.6f}\n")
|
|
894
|
-
|
|
910
|
+
|
|
895
911
|
logging.info(f"Comprehensive QC data saved to: {qc_data_path}")
|
|
896
|
-
|
|
912
|
+
|
|
897
913
|
except Exception as e:
|
|
898
914
|
logging.error(f"Error creating comprehensive QC plots: {e}")
|
|
899
915
|
|
|
900
916
|
def create_error_analysis_plot(quality_thresholds, qc_results, results_dir):
    """
    Create a detailed error analysis plot showing different sources of uncertainty.

    Draws a 2x2 grid of panels and saves it as
    ``<results_dir>/detailed/qc_plots/error_analysis.png`` (the directory is
    created if needed).

    Args:
        quality_thresholds: List of quality score thresholds; used as the
            x-axis of the first three panels and as point labels in the fourth.
        qc_results: List of comprehensive analysis results, one per threshold.
            Each entry is read for 'std_aa_mutations', 'net_rate_error',
            'lambda_error', 'alignment_error' and 'mappable_bases'.
        results_dir: Directory to save the plot

    Returns:
        None. Any exception is logged with ``logging.error`` instead of being
        raised, so a plotting failure does not abort the caller.
    """
    fig = None
    try:
        # Create detailed/qc_plots/ subdirectory
        detailed_dir = os.path.join(results_dir, "detailed")
        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
        os.makedirs(qc_plots_dir, exist_ok=True)

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

        # Extract error components
        aa_std = [r['std_aa_mutations'] for r in qc_results]
        net_rate_errors = [r['net_rate_error'] for r in qc_results]
        lambda_errors = [r['lambda_error'] for r in qc_results]
        alignment_errors = [r['alignment_error'] for r in qc_results]
        mappable_bases = [r['mappable_bases'] for r in qc_results]

        # Plots 1-3 share the same layout: one error component vs quality threshold
        line_panels = (
            (ax1, aa_std, 'o-', '#2E8B57',
             'AA Mutation Standard Deviation', 'AA Mutation Uncertainty vs Quality Filter'),
            (ax2, net_rate_errors, 's-', '#FF6B6B',
             'Net Mutation Rate Error', 'Net Rate Error vs Quality Filter'),
            (ax3, lambda_errors, '^-', '#4169E1',
             'Lambda Error (mutations per copy)', 'Lambda Error vs Quality Filter'),
        )
        for ax, values, fmt, color, ylabel, title in line_panels:
            ax.plot(quality_thresholds, values, fmt, color=color, linewidth=2, markersize=6)
            ax.set_xlabel('Quality Score Threshold')
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.grid(True, alpha=0.3)

        # Plot 4: Alignment error vs mappable bases
        # NOTE(review): the x-axis plots mappable bases while the title and
        # y-label refer to read count -- confirm which quantity is intended.
        ax4.scatter(mappable_bases, alignment_errors, s=100, alpha=0.7, color='#FF8C00')
        ax4.set_xlabel('Mappable Bases')
        ax4.set_ylabel('Alignment Error (1/sqrt(reads))')
        ax4.set_title('Alignment Error vs Read Count')
        ax4.grid(True, alpha=0.3)

        # Add quality threshold labels to scatter plot
        for q, bases, align_err in zip(quality_thresholds, mappable_bases, alignment_errors):
            ax4.annotate(f'Q{q}', (bases, align_err),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        plt.tight_layout()

        # Save error analysis plot to detailed/qc_plots/
        error_plot_path = os.path.join(qc_plots_dir, "error_analysis.png")
        fig.savefig(error_plot_path, dpi=300, bbox_inches='tight')

        logging.info(f"Error analysis plot saved to: {error_plot_path}")

    except Exception as e:
        logging.error(f"Error creating error analysis plot: {e}")
    finally:
        # Close the figure on every path; previously a failure after
        # plt.subplots() left the figure open in pyplot's registry.
        if fig is not None:
            plt.close(fig)
|
|
963
984
|
|
|
964
985
|
def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir):
    """
    Create a dual-axis plot showing quality score threshold vs AA mutations per gene and mappable bases.

    Writes ``qc_mutation_rate_vs_quality.png`` and a CSV with the same data
    (``qc_mutation_rate_vs_quality.csv``) into
    ``<results_dir>/detailed/qc_plots/``, creating the directory if needed.

    Args:
        quality_thresholds: List of quality score thresholds
        aa_mutations: List of corresponding AA mutations per gene
        mappable_bases: List of corresponding mappable base counts
        results_dir: Directory to save the plot

    Returns:
        None. Any exception is logged with ``logging.error`` instead of being
        raised.
    """
    fig = None
    try:
        # Create detailed/qc_plots/ subdirectory
        detailed_dir = os.path.join(results_dir, "detailed")
        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
        os.makedirs(qc_plots_dir, exist_ok=True)

        # Create the plot with dual y-axes
        fig, ax1 = plt.subplots(figsize=(12, 8))

        # Left y-axis: AA mutations per gene
        color1 = '#2E8B57'
        ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
        ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
        ax1.scatter(quality_thresholds, aa_mutations,
                    s=100, alpha=0.7, color=color1, edgecolors='black', linewidth=1, label='AA Mutations per Gene')
        ax1.tick_params(axis='y', labelcolor=color1)

        # Right y-axis: Mappable bases
        ax2 = ax1.twinx()
        color2 = '#FF6B6B'
        ax2.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
        ax2.scatter(quality_thresholds, mappable_bases,
                    s=100, alpha=0.7, color=color2, edgecolors='black', linewidth=1, marker='s', label='Mappable Bases')
        ax2.tick_params(axis='y', labelcolor=color2)

        # Customize the plot
        ax1.set_title('AA Mutations per Gene and Mappable Bases vs Quality Score Filter', fontsize=14, fontweight='bold')

        # Add grid for better readability
        ax1.grid(True, alpha=0.3)

        # Customize ticks and spines
        ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
        ax1.tick_params(axis='both', which='minor', direction='in', length=3)
        ax1.spines['top'].set_visible(False)
        ax1.spines['right'].set_visible(False)

        # Add data point labels for AA mutations
        for q, aa_mut in zip(quality_thresholds, aa_mutations):
            ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=9, alpha=0.8, color=color1)

        # Add data point labels for mappable bases
        for q, bases in zip(quality_thresholds, mappable_bases):
            ax2.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color2)

        # Add legend (merge the handles of both y-axes into one box)
        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)

        # Save the plot to detailed/qc_plots/
        qc_plot_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.png")
        fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')

        logging.info(f"QC plot saved to: {qc_plot_path}")

        # Also save data as CSV to detailed/qc_plots/
        qc_data_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.csv")
        with open(qc_data_path, 'w') as f:
            f.write("quality_threshold,aa_mutations_per_gene,mappable_bases\n")
            for q, aa_mut, bases in zip(quality_thresholds, aa_mutations, mappable_bases):
                f.write(f"{q},{aa_mut:.6f},{bases}\n")

        logging.info(f"QC data saved to: {qc_data_path}")

    except Exception as e:
        logging.error(f"Error creating QC plot: {e}")
    finally:
        # Close the figure on every path; previously a failure between
        # plt.subplots() and plt.close() left the figure open.
        if fig is not None:
            plt.close(fig)
|
|
1039
1065
|
|
|
@@ -1330,74 +1356,74 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
|
|
|
1330
1356
|
except Exception as e:
|
|
1331
1357
|
logging.error(f"Error in segmented analysis: {e}")
|
|
1332
1358
|
return None
|
|
1359
|
+
|
|
1360
|
+
def calculate_qscore_weighted_mismatches(sam_file, ref_seq, qscore_stats):
    """
    Calculate mismatches weighted by Q-score uncertainty with proper sampling error.

    Every aligned (read base, reference base) pair contributes a weight of
    ``1.0 - qscore_uncertainty_factor(q)``, where ``q`` is the average Q-score
    recorded for that reference position (falling back to the overall mean
    Q-score when the position has no entry).

    Args:
        sam_file: Path to SAM file
        ref_seq: Reference sequence
        qscore_stats: Q-score statistics from extract_qscores_from_sam; read
            for 'position_avg_qscores' (looked up by reference position) and
            'mean_qscore'.

    Returns:
        tuple: (weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes)
        On any exception the error is logged and ``(0.0, 0.0, 0, 0, [], [])``
        is returned.
    """
    try:
        weighted_mismatches = 0.0
        total_weighted_coverage = 0.0
        raw_mismatches = 0
        raw_coverage = 0

        # Store position-level data for proper sampling error calculation
        position_weights = []
        position_outcomes = []

        position_qscores = qscore_stats['position_avg_qscores']
        ref_len = len(ref_seq)

        with pysam.AlignmentFile(sam_file, "r") as samfile:
            for read in samfile:
                if read.is_unmapped:
                    continue

                # A mapped record can carry no bases (pysam then reports
                # query_sequence as None). Skip it, as
                # compute_mismatch_stats_sam does; indexing None would raise and
                # the handler below would throw away every count gathered so far.
                query_seq = read.query_sequence
                if query_seq is None:
                    continue

                # Get aligned pairs (read_pos, ref_pos)
                for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
                    # Insertions/deletions have no counterpart on one side
                    if ref_pos is None or read_pos is None:
                        continue

                    if ref_pos >= ref_len:
                        continue

                    # Get base calls
                    read_base = query_seq[read_pos].upper()
                    ref_base = ref_seq[ref_pos].upper()

                    # Skip if either base is N
                    if read_base == 'N' or ref_base == 'N':
                        continue

                    # Get Q-score for this position
                    qscore = position_qscores.get(ref_pos, qscore_stats['mean_qscore'])
                    uncertainty_factor = qscore_uncertainty_factor(qscore)

                    # Weight by uncertainty (lower Q-score = higher uncertainty = lower weight)
                    weight = 1.0 - uncertainty_factor

                    is_mismatch = read_base != ref_base

                    # Store position-level data
                    position_weights.append(weight)
                    position_outcomes.append(1 if is_mismatch else 0)

                    # Count coverage
                    total_weighted_coverage += weight
                    raw_coverage += 1

                    # Count mismatches
                    if is_mismatch:
                        weighted_mismatches += weight
                        raw_mismatches += 1

        return weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes

    except Exception as e:
        logging.error(f"Error calculating Q-score weighted mismatches: {e}")
        return 0.0, 0.0, 0, 0, [], []
|
|
@@ -1827,6 +1853,289 @@ def simulate_aa_distribution(lambda_bp, cds_seq, n_trials=1000):
|
|
|
1827
1853
|
|
|
1828
1854
|
return aa_diffs
|
|
1829
1855
|
|
|
1856
|
+
def create_output_directories(results_dir):
    """
    Ensure the ``detailed/`` and ``detailed/qc_plots/`` folders exist under the results directory.

    Args:
        results_dir: Base results directory path

    Returns:
        dict: ``Path`` objects under the keys 'results_dir', 'detailed_dir'
        and 'qc_plots_dir'
    """
    base = Path(results_dir)
    layout = {
        'results_dir': base,
        'detailed_dir': base / "detailed",
        'qc_plots_dir': base / "detailed" / "qc_plots",
    }

    # Parent first, then child; both calls tolerate pre-existing directories.
    for key in ('detailed_dir', 'qc_plots_dir'):
        layout[key].mkdir(parents=True, exist_ok=True)

    logging.info(f"Created output directories: {layout['detailed_dir']}, {layout['qc_plots_dir']}")

    return layout
|
|
1880
|
+
|
|
1881
|
+
def write_key_findings(results_dir, consensus_info, simple_lambda, simple_aa_mean, is_protein, hit_seq):
    """
    Write the plain-language executive summary ``KEY_FINDINGS.txt`` into the results directory.

    The headline AA-mutation figure comes from ``consensus_info`` when it
    carries a non-None 'consensus_mean'; otherwise ``simple_aa_mean`` is used
    (reported with an error of 0.0); otherwise no figure is reported.

    Args:
        results_dir: Results directory path
        consensus_info: Consensus AA mutation estimate from QC analysis; read
            for 'consensus_mean', 'consensus_std', 'thresholds_used' and 'note'
        simple_lambda: Simple lambda (bp mutations per copy) from main analysis
            (accepted but not used in the summary text)
        simple_aa_mean: Simple AA mutation mean from Monte Carlo simulation
        is_protein: Whether the region is protein-coding
        hit_seq: The hit sequence (accepted but not used in the summary text)
    """
    key_findings_path = Path(results_dir) / "KEY_FINDINGS.txt"
    fallback_note = "FELL_BACK_TO_MAX_COVERAGE"

    with open(key_findings_path, "w") as f:
        emit = f.write

        def section(title, underline="-", width=45):
            # Heading line followed by a rule of `underline` characters
            emit(title + "\n")
            emit(underline * width + "\n")

        emit("=" * 60 + "\n")
        emit("EP LIBRARY PROFILER - KEY FINDINGS\n")
        emit("=" * 60 + "\n\n")

        # Determine which value to use as the "headline" number
        headline_aa, headline_std, method_note = None, 0.0, "N/A"
        if consensus_info and consensus_info.get("consensus_mean") is not None:
            headline_aa = consensus_info["consensus_mean"]
            headline_std = consensus_info.get("consensus_std", 0.0)
            method_note = "consensus (precision-weighted average across Q-score thresholds)"
        elif simple_aa_mean is not None:
            # Simple method doesn't provide an error, so the std stays 0.0
            headline_aa = simple_aa_mean
            method_note = "Monte Carlo simulation (single Q-score)"

        section("EXPECTED AMINO ACID MUTATIONS PER GENE COPY")
        if not is_protein:
            emit(" Region is not protein-coding; AA mutation estimate not applicable.\n\n")
        elif headline_aa is None:
            emit(" AA mutation estimate could not be calculated.\n\n")
        else:
            emit(f" {headline_aa:.2f} +/- {headline_std:.2f} AA mutations per gene copy\n")
            emit(f" (Method: {method_note})\n\n")

            # Plain-language interpretation using Poisson distribution
            section("WHAT THIS MEANS (Poisson distribution):")
            if headline_aa > 0:
                decay = np.exp(-headline_aa)
                p_wildtype = decay * 100                 # P(k=0) = e^(-lambda)
                p_one_mut = headline_aa * decay * 100    # P(k=1) = lambda * e^(-lambda)
                p_two_plus = 100 - p_wildtype - p_one_mut  # P(k>=2) = 1 - P(0) - P(1)

                emit(f" ~{p_wildtype:.1f}% of gene copies are wild-type (0 AA mutations)\n")
                emit(f" ~{p_one_mut:.1f}% have exactly 1 AA mutation\n")
                emit(f" ~{p_two_plus:.1f}% have 2 or more AA mutations\n\n")
            else:
                emit(" Nearly all gene copies are expected to be wild-type.\n\n")

        # Quality assessment
        section("QUALITY ASSESSMENT")
        if not consensus_info:
            emit(" UNKNOWN - Consensus analysis not available\n")
        else:
            n_thresholds = len(consensus_info.get("thresholds_used", []))
            fell_back = consensus_info.get("note", "") == fallback_note

            if n_thresholds >= 3 and not fell_back:
                emit(" GOOD - Multiple Q-score thresholds contributed to consensus\n")
            elif n_thresholds >= 1:
                emit(" ACCEPTABLE - Limited Q-score thresholds available\n")
            else:
                emit(" LOW COVERAGE - Results may be unreliable\n")

            if fell_back:
                emit(" WARNING: Fell back to max-coverage threshold due to low coverage\n")

        emit("\n")
        section("FOR DETAILED TECHNICAL INFORMATION")
        emit(" See the detailed/ folder for:\n")
        emit(" - methodology_notes.txt: Full explanation of calculations\n")
        emit(" - lambda_comparison.csv: Side-by-side lambda estimates\n")
        emit(" - comprehensive_qc_data.csv: All Q-score threshold results\n")
        emit("\n")

    logging.info(f"Wrote KEY_FINDINGS.txt to: {key_findings_path}")
|
|
1972
|
+
|
|
1973
|
+
def write_lambda_comparison(detailed_dir, simple_lambda, simple_aa_mean, consensus_info, hit_seq_length):
    """
    Write CSV comparing all lambda estimates side-by-side.

    The file always contains a header plus exactly two data rows ("simple" and
    "consensus_weighted"), each with the same six columns:
    method, lambda_bp, lambda_error, aa_estimate, aa_error, notes.

    Args:
        detailed_dir: Path to detailed/ directory (must already exist)
        simple_lambda: Simple lambda (bp mutations per copy)
        simple_aa_mean: Simple AA mutation mean from Monte Carlo, or None
        consensus_info: Consensus info from QC analysis (dict or None). Keys read
            here: "consensus_mean", "consensus_std", "thresholds_used".
        hit_seq_length: Length of the hit sequence (accepted for interface
            compatibility; not used by this function)
    """
    # Function-scope import keeps this block self-contained.
    import csv

    lambda_csv_path = Path(detailed_dir) / "lambda_comparison.csv"
    not_available = "N/A"

    # Simple method (from main analysis). It computes no error estimate, so both
    # error columns are N/A. The formula lives in the notes column so that the
    # row has the same six fields as the header.
    simple_row = [
        "simple",
        f"{simple_lambda:.6f}",
        not_available,
        f"{simple_aa_mean:.4f}" if simple_aa_mean is not None else not_available,
        not_available,
        "Formula: (hit_rate - bg_rate) * seq_len. "
        "Used for KDE plot and Monte Carlo simulation",
    ]

    # Consensus method (from QC analysis)
    if consensus_info and consensus_info.get("consensus_mean") is not None:
        consensus_mean = consensus_info["consensus_mean"]
        # A missing or None std is reported as 0.0 instead of failing to format.
        consensus_std = consensus_info.get("consensus_std") or 0.0
        thresholds = consensus_info.get("thresholds_used", [])
        # Consensus is in AA mutations, back-calculate approximate lambda
        # Rough approximation: lambda_bp ~ 3 * aa_mutations
        consensus_row = [
            "consensus_weighted",
            f"{consensus_mean * 3.0:.6f}",
            f"{consensus_std * 3.0:.6f}",
            f"{consensus_mean:.4f}",
            f"{consensus_std:.4f}",
            # The list repr contains commas; csv.writer quotes this field.
            f"Precision-weighted across Q-scores: {thresholds}",
        ]
    else:
        consensus_row = [
            "consensus_weighted",
            not_available,
            not_available,
            not_available,
            not_available,
            "Not computed or insufficient data",
        ]

    # newline="" plus lineterminator="\n" keeps plain "\n" line endings while
    # letting the csv module handle quoting.
    with open(lambda_csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(
            ["method", "lambda_bp", "lambda_error", "aa_estimate", "aa_error", "notes"]
        )
        writer.writerow(simple_row)
        writer.writerow(consensus_row)

    logging.info(f"Wrote lambda_comparison.csv to: {lambda_csv_path}")
|
|
2015
|
+
|
|
2016
|
+
def write_methodology_notes(detailed_dir):
    """
    Write detailed methodology documentation explaining each lambda calculation method.

    Creates (or overwrites) ``methodology_notes.txt`` inside ``detailed_dir``
    with a fixed, static description of the simple, Q-score weighted and
    consensus estimates and of the output files. The text does not depend on
    any analysis results.

    Args:
        detailed_dir: Path to detailed/ directory (must already exist)
    """
    methodology_path = Path(detailed_dir) / "methodology_notes.txt"

    content = """EP LIBRARY PROFILER - METHODOLOGY NOTES
=======================================

This document explains the different mutation rate estimates produced by the
EP library profiler and which outputs use which estimates.


LAMBDA CALCULATION METHODS
--------------------------

1. SIMPLE LAMBDA (used for KDE plot and Monte Carlo simulation)

Formula: lambda_bp = (hit_rate - bg_rate) * sequence_length

Where:
- hit_rate = total_mismatches / total_covered_bases (in target region)
- bg_rate = total_mismatches / total_covered_bases (in plasmid excluding target)
- sequence_length = length of target CDS in base pairs

This method:
- Does NOT include error propagation
- Does NOT weight by Q-score
- Is fast and provides a point estimate

Used in:
- summary_panels.png/pdf (Panel 4: KDE of AA mutations)
- summary.txt
- aa_mutation_distribution.csv


2. Q-SCORE WEIGHTED LAMBDA (used in comprehensive QC analysis)

Formula: lambda_bp_weighted = net_weighted_rate * sequence_length

Where:
- net_weighted_rate = hit_weighted_rate - bg_weighted_rate
- Weighted rates account for per-base Q-score uncertainty
- Weights = 1 - sqrt(10^(-Q/10)) for each position

This method:
- DOES include error propagation
- DOES weight by Q-score (higher Q-score = higher weight)
- Provides confidence intervals

Used in:
- comprehensive_qc_data.csv
- error_analysis.png


3. CONSENSUS LAMBDA (recommended for reporting)

Formula: Precision-weighted average across Q-score thresholds

weights[i] = 1 / std_aa_mutations[i]
consensus_mean = sum(weights * means) / sum(weights)

This method:
- Aggregates estimates from multiple Q-score filtering thresholds
- Weights by precision (lower uncertainty = higher weight)
- Requires minimum coverage threshold (default 1000 mappable bases)
- Provides the most robust estimate when multiple thresholds pass QC

Used in:
- aa_mutation_consensus.txt
- KEY_FINDINGS.txt
- QC plots (red dashed line)


WHICH VALUE SHOULD I USE?
-------------------------

For publication/reporting:
Use the CONSENSUS value from aa_mutation_consensus.txt or KEY_FINDINGS.txt
This is the most statistically robust estimate.

For understanding the distribution shape:
Use the KDE plot in summary_panels.png
Note: This uses the SIMPLE lambda, not the consensus.

For detailed error analysis:
Use comprehensive_qc_data.csv in the detailed/ folder
This contains per-Q-score estimates with full error propagation.


OUTPUT FILE REFERENCE
---------------------

Root folder:
- KEY_FINDINGS.txt: Executive summary with consensus AA mutations
- summary_panels.png/pdf: Main visualization (uses simple lambda for KDE)
- aa_mutation_consensus.txt: Consensus estimate details

detailed/ folder:
- methodology_notes.txt: This file
- lambda_comparison.csv: Side-by-side comparison of all methods
- comprehensive_qc_data.csv: Full QC data with error estimates
- simple_qc_data.csv: Simplified QC data
- gene_mismatch_rates.csv: Per-position mismatch rates
- base_distribution.csv: Base counts at each position
- aa_substitutions.csv: Amino acid substitution data
- plasmid_coverage.csv: Coverage across plasmid
- aa_mutation_distribution.csv: Monte Carlo AA mutation trials

detailed/qc_plots/ folder:
- qc_plot_*.png: Q-score threshold analysis plot
- comprehensive_qc_analysis.png: Detailed QC visualization
- error_analysis.png: Error component breakdown
"""

    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding, matching how the master summary is written in this module.
    with open(methodology_path, "w", encoding="utf-8") as f:
        f.write(content)

    logging.info(f"Wrote methodology_notes.txt to: {methodology_path}")
|
|
2138
|
+
|
|
1830
2139
|
def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, work_dir, results_dir,
|
|
1831
2140
|
chunks, ref_hit_fasta, plasmid_fasta, hit_seq, hit_id, plasmid_seq, idx):
|
|
1832
2141
|
"""
|
|
@@ -1854,13 +2163,18 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
1854
2163
|
|
|
1855
2164
|
# Ensure work directory exists
|
|
1856
2165
|
os.makedirs(work_dir, exist_ok=True)
|
|
1857
|
-
|
|
2166
|
+
|
|
1858
2167
|
# Create subdirectory for this Q-score analysis
|
|
1859
2168
|
qscore_results_dir = results_dir
|
|
1860
2169
|
if qscore is not None:
|
|
1861
2170
|
qscore_results_dir = os.path.join(results_dir, f"q{qscore}_analysis")
|
|
1862
2171
|
os.makedirs(qscore_results_dir, exist_ok=True)
|
|
1863
|
-
|
|
2172
|
+
|
|
2173
|
+
# Create output directory structure (detailed/ and detailed/qc_plots/)
|
|
2174
|
+
output_dirs = create_output_directories(qscore_results_dir)
|
|
2175
|
+
detailed_dir = output_dirs['detailed_dir']
|
|
2176
|
+
qc_plots_dir = output_dirs['qc_plots_dir']
|
|
2177
|
+
|
|
1864
2178
|
# Write chunks FASTA & align to background‐chunks
|
|
1865
2179
|
chunks_fasta = create_multi_fasta(chunks, work_dir)
|
|
1866
2180
|
sam_chunks = run_minimap2(fastq_path, chunks_fasta, "plasmid_chunks_alignment", work_dir)
|
|
@@ -1976,9 +2290,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
1976
2290
|
qscore_info = f" ({qscore_desc})" if qscore_desc != "unfiltered" else ""
|
|
1977
2291
|
|
|
1978
2292
|
# ----------------------------
|
|
1979
|
-
# SAVE CSV FOR MUTATION RATES (PANEL 1)
|
|
2293
|
+
# SAVE CSV FOR MUTATION RATES (PANEL 1) - to detailed/
|
|
1980
2294
|
# ----------------------------
|
|
1981
|
-
gene_mismatch_csv = os.path.join(
|
|
2295
|
+
gene_mismatch_csv = os.path.join(detailed_dir, "gene_mismatch_rates.csv")
|
|
1982
2296
|
with open(gene_mismatch_csv, "w", newline="") as csvfile:
|
|
1983
2297
|
csvfile.write(f"# gene_id: {hit_id}\n")
|
|
1984
2298
|
csvfile.write(f"# background_rate_per_kb: {bg_rate_per_kb:.6f}\n")
|
|
@@ -1988,9 +2302,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
1988
2302
|
logging.info(f"Saved CSV for gene mismatch rates: {gene_mismatch_csv}")
|
|
1989
2303
|
|
|
1990
2304
|
# ----------------------------
|
|
1991
|
-
# SAVE CSV FOR BASE DISTRIBUTION (PANEL 2)
|
|
2305
|
+
# SAVE CSV FOR BASE DISTRIBUTION (PANEL 2) - to detailed/
|
|
1992
2306
|
# ----------------------------
|
|
1993
|
-
base_dist_csv = os.path.join(
|
|
2307
|
+
base_dist_csv = os.path.join(detailed_dir, "base_distribution.csv")
|
|
1994
2308
|
with open(base_dist_csv, "w", newline="") as csvfile:
|
|
1995
2309
|
csvfile.write(f"# gene_id: {hit_id}\n")
|
|
1996
2310
|
csvfile.write("position_1based,ref_base,A_count,C_count,G_count,T_count,N_count\n")
|
|
@@ -2000,10 +2314,10 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2000
2314
|
logging.info(f"Saved CSV for base distribution: {base_dist_csv}")
|
|
2001
2315
|
|
|
2002
2316
|
# ----------------------------
|
|
2003
|
-
# SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - only if protein
|
|
2317
|
+
# SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - to detailed/ - only if protein
|
|
2004
2318
|
# ----------------------------
|
|
2005
2319
|
if is_protein:
|
|
2006
|
-
aa_subst_csv = os.path.join(
|
|
2320
|
+
aa_subst_csv = os.path.join(detailed_dir, "aa_substitutions.csv")
|
|
2007
2321
|
with open(aa_subst_csv, "w", newline="") as csvfile:
|
|
2008
2322
|
csvfile.write(f"# gene_id: {hit_id}\n")
|
|
2009
2323
|
csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
|
|
@@ -2013,9 +2327,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2013
2327
|
logging.info(f"Saved CSV for AA substitutions: {aa_subst_csv}")
|
|
2014
2328
|
|
|
2015
2329
|
# ----------------------------
|
|
2016
|
-
# SAVE CSV FOR PLASMID COVERAGE (PANEL 4)
|
|
2330
|
+
# SAVE CSV FOR PLASMID COVERAGE (PANEL 4) - to detailed/
|
|
2017
2331
|
# ----------------------------
|
|
2018
|
-
plasmid_cov_csv = os.path.join(
|
|
2332
|
+
plasmid_cov_csv = os.path.join(detailed_dir, "plasmid_coverage.csv")
|
|
2019
2333
|
with open(plasmid_cov_csv, "w", newline="") as csvfile:
|
|
2020
2334
|
csvfile.write("position_1based,coverage\n")
|
|
2021
2335
|
for pos0, cov in enumerate(plasmid_cov):
|
|
@@ -2023,9 +2337,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2023
2337
|
logging.info(f"Saved CSV for plasmid coverage: {plasmid_cov_csv}")
|
|
2024
2338
|
|
|
2025
2339
|
# ----------------------------
|
|
2026
|
-
# SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3)
|
|
2340
|
+
# SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3) - to detailed/
|
|
2027
2341
|
# ----------------------------
|
|
2028
|
-
aa_dist_csv = os.path.join(
|
|
2342
|
+
aa_dist_csv = os.path.join(detailed_dir, "aa_mutation_distribution.csv")
|
|
2029
2343
|
with open(aa_dist_csv, "w", newline="") as csvfile:
|
|
2030
2344
|
csvfile.write(f"# gene_id: {hit_id}\n")
|
|
2031
2345
|
csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
|
|
@@ -2135,7 +2449,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2135
2449
|
if is_protein and aa_diffs and len(aa_diffs) > 0:
|
|
2136
2450
|
x_vals = np.array(aa_diffs)
|
|
2137
2451
|
unique_vals = np.unique(x_vals)
|
|
2138
|
-
|
|
2452
|
+
|
|
2139
2453
|
if len(unique_vals) > 1:
|
|
2140
2454
|
# Multiple unique values - use KDE or histogram
|
|
2141
2455
|
if HAVE_SCIPY:
|
|
@@ -2149,15 +2463,23 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2149
2463
|
ax3.set_ylim(bottom=0)
|
|
2150
2464
|
except Exception as e:
|
|
2151
2465
|
logging.warning(f"KDE failed: {e}, falling back to histogram")
|
|
2152
|
-
ax3.hist(x_vals, bins=min(20, len(unique_vals)),
|
|
2466
|
+
ax3.hist(x_vals, bins=min(20, len(unique_vals)),
|
|
2153
2467
|
color="#C44E52", alpha=0.7, density=True, edgecolor='black')
|
|
2154
2468
|
else:
|
|
2155
|
-
ax3.hist(x_vals, bins=min(20, len(unique_vals)),
|
|
2469
|
+
ax3.hist(x_vals, bins=min(20, len(unique_vals)),
|
|
2156
2470
|
color="#C44E52", alpha=0.7, density=True, edgecolor='black')
|
|
2157
2471
|
else:
|
|
2158
2472
|
# Single unique value - just show a bar
|
|
2159
2473
|
ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
|
|
2160
2474
|
ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
|
|
2475
|
+
|
|
2476
|
+
# Set title with lambda value for protein-coding sequences
|
|
2477
|
+
ax3.set_title(f"AA Mutation Distribution (Monte Carlo, \u03bb={est_mut_per_copy:.2f}){qscore_info}",
|
|
2478
|
+
fontsize=14, fontweight='bold')
|
|
2479
|
+
ax3.set_xlabel("Number of AA Mutations", fontsize=12)
|
|
2480
|
+
ax3.set_ylabel("Density", fontsize=12)
|
|
2481
|
+
ax3.spines['top'].set_visible(False)
|
|
2482
|
+
ax3.spines['right'].set_visible(False)
|
|
2161
2483
|
else:
|
|
2162
2484
|
# Not protein or no AA differences — display an informative message
|
|
2163
2485
|
ax3.text(
|
|
@@ -2170,7 +2492,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2170
2492
|
color="gray",
|
|
2171
2493
|
transform=ax3.transAxes,
|
|
2172
2494
|
)
|
|
2173
|
-
|
|
2495
|
+
|
|
2174
2496
|
ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
|
|
2175
2497
|
ax3.set_xlabel("Number of AA Mutations", fontsize=12)
|
|
2176
2498
|
ax3.set_ylabel("Density", fontsize=12)
|
|
@@ -2231,9 +2553,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2231
2553
|
sample_percent[cat] = 0.0
|
|
2232
2554
|
|
|
2233
2555
|
# ----------------------------
|
|
2234
|
-
# GENERATE PDF TABLE (MUTATION SPECTRUM)
|
|
2556
|
+
# GENERATE PDF TABLE (MUTATION SPECTRUM) - to detailed/
|
|
2235
2557
|
# ----------------------------
|
|
2236
|
-
pdf_path = os.path.join(
|
|
2558
|
+
pdf_path = os.path.join(detailed_dir, f"{sample_name}_mutation_spectrum.pdf")
|
|
2237
2559
|
# Prepare table data
|
|
2238
2560
|
table_rows = []
|
|
2239
2561
|
for cat in categories:
|
|
@@ -2341,9 +2663,6 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
|
|
|
2341
2663
|
}
|
|
2342
2664
|
|
|
2343
2665
|
|
|
2344
|
-
|
|
2345
|
-
main()
|
|
2346
|
-
|
|
2347
2666
|
def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
|
|
2348
2667
|
paths: List[Path] = []
|
|
2349
2668
|
for item in inputs:
|
|
@@ -2396,7 +2715,7 @@ def run_ep_library_profile(
|
|
|
2396
2715
|
master_summary_path.write_text(header + "\n", encoding="utf-8")
|
|
2397
2716
|
|
|
2398
2717
|
sample_results: List[Dict[str, object]] = []
|
|
2399
|
-
for fastq in fastq_paths:
|
|
2718
|
+
for fastq in tqdm(fastq_paths, desc="Processing FASTQ files", unit="file"):
|
|
2400
2719
|
result = process_single_fastq(
|
|
2401
2720
|
fastq,
|
|
2402
2721
|
region_fasta,
|
|
@@ -2503,6 +2822,7 @@ def process_single_fastq(
|
|
|
2503
2822
|
|
|
2504
2823
|
logging.info("Running QC analysis to get Q-score results...")
|
|
2505
2824
|
qc_results = None
|
|
2825
|
+
consensus_info = None
|
|
2506
2826
|
try:
|
|
2507
2827
|
qc_results, consensus_info = run_qc_analysis(
|
|
2508
2828
|
str(fastq_path),
|
|
@@ -2563,6 +2883,45 @@ def process_single_fastq(
|
|
|
2563
2883
|
)
|
|
2564
2884
|
analysis_results.append(result)
|
|
2565
2885
|
|
|
2886
|
+
# Generate unified summary files in the sample's root results directory
|
|
2887
|
+
# Get simple lambda from the unfiltered analysis (first result)
|
|
2888
|
+
simple_lambda = 0.0
|
|
2889
|
+
simple_aa_mean = None
|
|
2890
|
+
is_protein = False
|
|
2891
|
+
unfiltered_result = analysis_results[0] if analysis_results else None
|
|
2892
|
+
if unfiltered_result:
|
|
2893
|
+
simple_lambda = unfiltered_result.get('est_mut_per_copy', 0.0)
|
|
2894
|
+
simple_aa_mean = unfiltered_result.get('avg_aa_mutations')
|
|
2895
|
+
is_protein = unfiltered_result.get('is_protein', False)
|
|
2896
|
+
|
|
2897
|
+
# Create output directories and generate summary files
|
|
2898
|
+
output_dirs = create_output_directories(results_dir)
|
|
2899
|
+
detailed_dir = output_dirs['detailed_dir']
|
|
2900
|
+
|
|
2901
|
+
# Write KEY_FINDINGS.txt (lay-user summary)
|
|
2902
|
+
write_key_findings(
|
|
2903
|
+
results_dir,
|
|
2904
|
+
consensus_info,
|
|
2905
|
+
simple_lambda,
|
|
2906
|
+
simple_aa_mean,
|
|
2907
|
+
is_protein,
|
|
2908
|
+
hit_seq,
|
|
2909
|
+
)
|
|
2910
|
+
|
|
2911
|
+
# Write lambda_comparison.csv
|
|
2912
|
+
write_lambda_comparison(
|
|
2913
|
+
detailed_dir,
|
|
2914
|
+
simple_lambda,
|
|
2915
|
+
simple_aa_mean,
|
|
2916
|
+
consensus_info,
|
|
2917
|
+
len(hit_seq),
|
|
2918
|
+
)
|
|
2919
|
+
|
|
2920
|
+
# Write methodology_notes.txt
|
|
2921
|
+
write_methodology_notes(detailed_dir)
|
|
2922
|
+
|
|
2923
|
+
logging.info("Generated unified summary files: KEY_FINDINGS.txt, lambda_comparison.csv, methodology_notes.txt")
|
|
2924
|
+
|
|
2566
2925
|
if work_dir.exists():
|
|
2567
2926
|
shutil.rmtree(work_dir)
|
|
2568
2927
|
logging.info("Removed temporary work directory: %s", work_dir)
|
|
@@ -2573,5 +2932,6 @@ def process_single_fastq(
|
|
|
2573
2932
|
"sample": sample_name,
|
|
2574
2933
|
"results_dir": results_dir,
|
|
2575
2934
|
"analysis_results": analysis_results,
|
|
2935
|
+
"consensus_info": consensus_info,
|
|
2576
2936
|
}
|
|
2577
2937
|
|