uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ import math
16
16
  import tempfile
17
17
  from pathlib import Path
18
18
  from typing import Dict, Iterable, List, Optional, Sequence, Tuple
19
+ from tqdm import tqdm
19
20
 
20
21
  # Use a built-in Matplotlib style ("ggplot") for consistency
21
22
  plt.style.use("ggplot")
@@ -219,7 +220,12 @@ def compute_mismatch_stats_sam(sam_file, refs_dict):
219
220
 
220
221
  logging.info(f"Computing mismatch stats for {sam_file}")
221
222
  samfile = pysam.AlignmentFile(sam_file, "r")
222
- for read in samfile.fetch():
223
+ # Count total aligned reads for progress bar
224
+ total_reads = sum(1 for _ in samfile.fetch())
225
+ samfile.close()
226
+
227
+ samfile = pysam.AlignmentFile(sam_file, "r")
228
+ for read in tqdm(samfile.fetch(), desc="Computing mismatch stats", total=total_reads, unit="read"):
223
229
  if read.is_unmapped or read.query_sequence is None:
224
230
  continue
225
231
  ref_name = samfile.get_reference_name(read.reference_id)
@@ -743,40 +749,50 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
743
749
 
744
750
  plt.tight_layout()
745
751
 
746
- # Save the plot
752
+ # Create detailed/qc_plots/ subdirectory for QC plots
753
+ detailed_dir = os.path.join(results_dir, "detailed")
754
+ qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
755
+ os.makedirs(qc_plots_dir, exist_ok=True)
756
+
757
+ # Save the plot to detailed/qc_plots/
747
758
  project_name = os.path.basename(results_dir)
748
- qc_plot_path = os.path.join(results_dir, f"qc_plot_{project_name}.png")
759
+ qc_plot_path = os.path.join(qc_plots_dir, f"qc_plot_{project_name}.png")
749
760
  fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
750
761
  plt.close(fig)
751
-
762
+
752
763
  logging.info(f"QC plot saved to: {qc_plot_path}")
753
-
754
- # Save data as CSV
755
- qc_data_path = os.path.join(results_dir, "simple_qc_data.csv")
764
+
765
+ # Save data as CSV to detailed/
766
+ qc_data_path = os.path.join(detailed_dir, "simple_qc_data.csv")
756
767
  with open(qc_data_path, 'w') as f:
757
768
  f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
758
769
  f.write("total_mappable_bases,n_segments\n")
759
-
770
+
760
771
  for q, r in zip(quality_thresholds, qc_results):
761
772
  f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
762
773
  f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
763
774
  f.write(f"{r['total_mappable_bases']},{r['n_segments']}\n")
764
-
775
+
765
776
  logging.info(f"Simple QC data saved to: {qc_data_path}")
766
-
777
+
767
778
  except Exception as e:
768
779
  logging.error(f"Error creating simple QC plots: {e}")
780
+
781
+ def create_comprehensive_qc_plots(quality_thresholds, qc_results, results_dir):
769
782
  """
770
783
  Create comprehensive QC plots with error bars and uncertainty quantification.
771
-
784
+
772
785
  Args:
773
786
  quality_thresholds: List of quality score thresholds
774
787
  qc_results: List of comprehensive analysis results
775
788
  results_dir: Directory to save the plots
776
- optimal_qscore: Optimal Q-score threshold (optional)
777
- optimal_result: Optimal result data (optional)
778
789
  """
779
790
  try:
791
+ # Create detailed/qc_plots/ subdirectory
792
+ detailed_dir = os.path.join(results_dir, "detailed")
793
+ qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
794
+ os.makedirs(qc_plots_dir, exist_ok=True)
795
+
780
796
  # Extract data for plotting
781
797
  aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
782
798
  aa_errors = [r['std_aa_mutations'] for r in qc_results]
@@ -785,46 +801,46 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
785
801
  mappable_bases = [r['mappable_bases'] for r in qc_results]
786
802
  net_rates = [r['net_rate'] for r in qc_results]
787
803
  net_rate_errors = [r['net_rate_error'] for r in qc_results]
788
-
804
+
789
805
  # Create main QC plot with error bars
790
806
  fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))
791
-
807
+
792
808
  # Top plot: AA mutations per gene with error bars
793
809
  color1 = '#2E8B57'
794
- ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
795
- fmt='o', capsize=5, capthick=2, markersize=8,
810
+ ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
811
+ fmt='o', capsize=5, capthick=2, markersize=8,
796
812
  color=color1, ecolor=color1, alpha=0.8, label='Mean ± Std')
797
-
813
+
798
814
  # Add confidence intervals as shaded area
799
- ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
815
+ ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
800
816
  alpha=0.3, color=color1, label='95% Confidence Interval')
801
-
817
+
802
818
  ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
803
819
  ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
804
820
  ax1.tick_params(axis='y', labelcolor=color1)
805
- ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
821
+ ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
806
822
  fontsize=14, fontweight='bold')
807
823
  ax1.grid(True, alpha=0.3)
808
824
  ax1.legend(frameon=False, fontsize=10)
809
-
825
+
810
826
  # Add data point labels
811
827
  for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
812
- ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
813
- (q, aa_mut), xytext=(5, 5),
828
+ ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
829
+ (q, aa_mut), xytext=(5, 5),
814
830
  textcoords='offset points', fontsize=8, alpha=0.8, color=color1)
815
-
831
+
816
832
  # Bottom plot: Mappable bases and AA mutations per gene
817
833
  color2 = '#FF6B6B'
818
834
  color3 = '#4169E1'
819
-
835
+
820
836
  # Mappable bases (left y-axis)
821
837
  ax2_twin = ax2.twinx()
822
- ax2_twin.scatter(quality_thresholds, mappable_bases,
823
- s=100, alpha=0.7, color=color2, edgecolors='black',
838
+ ax2_twin.scatter(quality_thresholds, mappable_bases,
839
+ s=100, alpha=0.7, color=color2, edgecolors='black',
824
840
  linewidth=1, marker='s', label='Mappable Bases')
825
841
  ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
826
842
  ax2_twin.tick_params(axis='y', labelcolor=color2)
827
-
843
+
828
844
  # AA mutations per gene with error bars (right y-axis)
829
845
  ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
830
846
  fmt='^', capsize=5, capthick=2, markersize=8,
@@ -832,34 +848,34 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
832
848
  ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
833
849
  ax2.tick_params(axis='y', labelcolor=color3)
834
850
  ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
835
- ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
851
+ ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
836
852
  fontsize=14, fontweight='bold')
837
853
  ax2.grid(True, alpha=0.3)
838
-
854
+
839
855
  # Add legends
840
856
  lines1, labels1 = ax2.get_legend_handles_labels()
841
857
  lines2, labels2 = ax2_twin.get_legend_handles_labels()
842
858
  ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
843
-
859
+
844
860
  # Add data point labels for mappable bases
845
861
  for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
846
- ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
862
+ ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
847
863
  textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
848
-
864
+
849
865
  plt.tight_layout()
850
-
851
- # Save the comprehensive plot
852
- qc_plot_path = os.path.join(results_dir, "comprehensive_qc_analysis.png")
866
+
867
+ # Save the comprehensive plot to detailed/qc_plots/
868
+ qc_plot_path = os.path.join(qc_plots_dir, "comprehensive_qc_analysis.png")
853
869
  fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
854
870
  plt.close(fig)
855
-
871
+
856
872
  logging.info(f"Comprehensive QC plot saved to: {qc_plot_path}")
857
-
873
+
858
874
  # Create error analysis plot
859
875
  create_error_analysis_plot(quality_thresholds, qc_results, results_dir)
860
-
861
- # Save comprehensive data as CSV
862
- qc_data_path = os.path.join(results_dir, "comprehensive_qc_data.csv")
876
+
877
+ # Save comprehensive data as CSV to detailed/
878
+ qc_data_path = os.path.join(detailed_dir, "comprehensive_qc_data.csv")
863
879
  with open(qc_data_path, 'w') as f:
864
880
  f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
865
881
  f.write("mappable_bases,hit_rate,hit_rate_ci_lower,hit_rate_ci_upper,")
@@ -869,7 +885,7 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
869
885
  f.write("bg_qscore_mean,bg_qscore_std,bg_qscore_uncertainty,")
870
886
  f.write("hit_weighted_rate,hit_weighted_error,bg_weighted_rate,bg_weighted_error,")
871
887
  f.write("net_weighted_rate,net_weighted_error,lambda_bp_weighted,lambda_error_weighted\n")
872
-
888
+
873
889
  for q, r in zip(quality_thresholds, qc_results):
874
890
  f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
875
891
  f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
@@ -878,93 +894,98 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
878
894
  f.write(f"{r['bg_rate']:.6f},{r['bg_rate_ci'][0]:.6f},{r['bg_rate_ci'][1]:.6f},")
879
895
  f.write(f"{r['net_rate']:.6f},{r['net_rate_error']:.6f},")
880
896
  f.write(f"{r['lambda_bp']:.6f},{r['lambda_error']:.6f},{r['alignment_error']:.6f},")
881
-
897
+
882
898
  # Q-score information
883
899
  hit_qscore_mean = r['hit_qscore_stats']['mean_qscore'] if r['hit_qscore_stats'] else 0.0
884
900
  hit_qscore_std = r['hit_qscore_stats']['std_qscore'] if r['hit_qscore_stats'] else 0.0
885
901
  bg_qscore_mean = r['bg_qscore_stats']['mean_qscore'] if r['bg_qscore_stats'] else 0.0
886
902
  bg_qscore_std = r['bg_qscore_stats']['std_qscore'] if r['bg_qscore_stats'] else 0.0
887
-
903
+
888
904
  f.write(f"{hit_qscore_mean:.2f},{hit_qscore_std:.2f},{r['hit_qscore_uncertainty']:.6f},")
889
905
  f.write(f"{bg_qscore_mean:.2f},{bg_qscore_std:.2f},{r['bg_qscore_uncertainty']:.6f},")
890
906
  f.write(f"{r.get('hit_weighted_rate', 0.0):.6f},{r.get('hit_weighted_error', 0.0):.6f},")
891
907
  f.write(f"{r.get('bg_weighted_rate', 0.0):.6f},{r.get('bg_weighted_error', 0.0):.6f},")
892
908
  f.write(f"{r.get('net_weighted_rate', 0.0):.6f},{r.get('net_weighted_error', 0.0):.6f},")
893
909
  f.write(f"{r.get('lambda_bp_weighted', 0.0):.6f},{r.get('lambda_error_weighted', 0.0):.6f}\n")
894
-
910
+
895
911
  logging.info(f"Comprehensive QC data saved to: {qc_data_path}")
896
-
912
+
897
913
  except Exception as e:
898
914
  logging.error(f"Error creating comprehensive QC plots: {e}")
899
915
 
900
916
def create_error_analysis_plot(quality_thresholds, qc_results, results_dir):
    """
    Create a detailed error analysis plot showing different sources of uncertainty.

    The 2x2 figure is saved as <results_dir>/detailed/qc_plots/error_analysis.png
    (the directories are created if missing). Any failure is logged and
    swallowed, so this function never raises to its caller for plotting errors.

    Args:
        quality_thresholds: List of quality score thresholds
        qc_results: List of comprehensive analysis results; each entry is read
            via the keys 'std_aa_mutations', 'net_rate_error', 'lambda_error',
            'alignment_error' and 'mappable_bases'
        results_dir: Directory to save the plot
    """
    fig = None
    try:
        # Create detailed/qc_plots/ subdirectory
        detailed_dir = os.path.join(results_dir, "detailed")
        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
        os.makedirs(qc_plots_dir, exist_ok=True)

        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

        # Extract error components
        aa_std = [r['std_aa_mutations'] for r in qc_results]
        net_rate_errors = [r['net_rate_error'] for r in qc_results]
        lambda_errors = [r['lambda_error'] for r in qc_results]
        alignment_errors = [r['alignment_error'] for r in qc_results]
        mappable_bases = [r['mappable_bases'] for r in qc_results]

        # Plot 1: AA mutation uncertainty vs quality threshold
        ax1.plot(quality_thresholds, aa_std, 'o-', color='#2E8B57', linewidth=2, markersize=6)
        ax1.set_xlabel('Quality Score Threshold')
        ax1.set_ylabel('AA Mutation Standard Deviation')
        ax1.set_title('AA Mutation Uncertainty vs Quality Filter')
        ax1.grid(True, alpha=0.3)

        # Plot 2: Net rate error vs quality threshold
        ax2.plot(quality_thresholds, net_rate_errors, 's-', color='#FF6B6B', linewidth=2, markersize=6)
        ax2.set_xlabel('Quality Score Threshold')
        ax2.set_ylabel('Net Mutation Rate Error')
        ax2.set_title('Net Rate Error vs Quality Filter')
        ax2.grid(True, alpha=0.3)

        # Plot 3: Lambda error vs quality threshold
        ax3.plot(quality_thresholds, lambda_errors, '^-', color='#4169E1', linewidth=2, markersize=6)
        ax3.set_xlabel('Quality Score Threshold')
        ax3.set_ylabel('Lambda Error (mutations per copy)')
        ax3.set_title('Lambda Error vs Quality Filter')
        ax3.grid(True, alpha=0.3)

        # Plot 4: Alignment error vs mappable bases
        ax4.scatter(mappable_bases, alignment_errors, s=100, alpha=0.7, color='#FF8C00')
        ax4.set_xlabel('Mappable Bases')
        ax4.set_ylabel('Alignment Error (1/sqrt(reads))')
        ax4.set_title('Alignment Error vs Read Count')
        ax4.grid(True, alpha=0.3)

        # Add quality threshold labels to scatter plot
        for q, bases, align_err in zip(quality_thresholds, mappable_bases, alignment_errors):
            ax4.annotate(f'Q{q}', (bases, align_err),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        plt.tight_layout()

        # Save error analysis plot to detailed/qc_plots/
        error_plot_path = os.path.join(qc_plots_dir, "error_analysis.png")
        fig.savefig(error_plot_path, dpi=300, bbox_inches='tight')

        logging.info(f"Error analysis plot saved to: {error_plot_path}")

    except Exception as e:
        logging.error(f"Error creating error analysis plot: {e}")
    finally:
        # Release the figure on the failure path too; previously it was only
        # closed after a successful save, so a failed run left it open.
        if fig is not None:
            plt.close(fig)
963
984
 
964
985
  def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir):
965
986
  """
966
987
  Create a dual-axis plot showing quality score threshold vs AA mutations per gene and mappable bases.
967
-
988
+
968
989
  Args:
969
990
  quality_thresholds: List of quality score thresholds
970
991
  aa_mutations: List of corresponding AA mutations per gene
@@ -972,68 +993,73 @@ def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir
972
993
  results_dir: Directory to save the plot
973
994
  """
974
995
  try:
996
+ # Create detailed/qc_plots/ subdirectory
997
+ detailed_dir = os.path.join(results_dir, "detailed")
998
+ qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
999
+ os.makedirs(qc_plots_dir, exist_ok=True)
1000
+
975
1001
  # Create the plot with dual y-axes
976
1002
  fig, ax1 = plt.subplots(figsize=(12, 8))
977
-
1003
+
978
1004
  # Left y-axis: AA mutations per gene
979
1005
  color1 = '#2E8B57'
980
1006
  ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
981
1007
  ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
982
- ax1.scatter(quality_thresholds, aa_mutations,
1008
+ ax1.scatter(quality_thresholds, aa_mutations,
983
1009
  s=100, alpha=0.7, color=color1, edgecolors='black', linewidth=1, label='AA Mutations per Gene')
984
1010
  ax1.tick_params(axis='y', labelcolor=color1)
985
-
1011
+
986
1012
  # Right y-axis: Mappable bases
987
1013
  ax2 = ax1.twinx()
988
1014
  color2 = '#FF6B6B'
989
1015
  ax2.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
990
- ax2.scatter(quality_thresholds, mappable_bases,
1016
+ ax2.scatter(quality_thresholds, mappable_bases,
991
1017
  s=100, alpha=0.7, color=color2, edgecolors='black', linewidth=1, marker='s', label='Mappable Bases')
992
1018
  ax2.tick_params(axis='y', labelcolor=color2)
993
-
1019
+
994
1020
  # Customize the plot
995
1021
  ax1.set_title('AA Mutations per Gene and Mappable Bases vs Quality Score Filter', fontsize=14, fontweight='bold')
996
-
1022
+
997
1023
  # Add grid for better readability
998
1024
  ax1.grid(True, alpha=0.3)
999
-
1025
+
1000
1026
  # Customize ticks and spines
1001
1027
  ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
1002
1028
  ax1.tick_params(axis='both', which='minor', direction='in', length=3)
1003
1029
  ax1.spines['top'].set_visible(False)
1004
1030
  ax1.spines['right'].set_visible(False)
1005
-
1031
+
1006
1032
  # Add data point labels for AA mutations
1007
1033
  for i, (q, aa_mut) in enumerate(zip(quality_thresholds, aa_mutations)):
1008
- ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
1034
+ ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
1009
1035
  textcoords='offset points', fontsize=9, alpha=0.8, color=color1)
1010
-
1036
+
1011
1037
  # Add data point labels for mappable bases
1012
1038
  for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
1013
- ax2.annotate(f'{reads}', (q, reads), xytext=(5, -15),
1039
+ ax2.annotate(f'{bases}', (q, bases), xytext=(5, -15),
1014
1040
  textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
1015
-
1041
+
1016
1042
  # Add legend
1017
1043
  lines1, labels1 = ax1.get_legend_handles_labels()
1018
1044
  lines2, labels2 = ax2.get_legend_handles_labels()
1019
1045
  ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
1020
-
1021
- # Save the plot
1022
- qc_plot_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.png")
1046
+
1047
+ # Save the plot to detailed/qc_plots/
1048
+ qc_plot_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.png")
1023
1049
  fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
1024
1050
  plt.close(fig)
1025
-
1051
+
1026
1052
  logging.info(f"QC plot saved to: {qc_plot_path}")
1027
-
1028
- # Also save data as CSV for reference
1029
- qc_data_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.csv")
1053
+
1054
+ # Also save data as CSV to detailed/qc_plots/
1055
+ qc_data_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.csv")
1030
1056
  with open(qc_data_path, 'w') as f:
1031
1057
  f.write("quality_threshold,aa_mutations_per_gene,mappable_bases\n")
1032
1058
  for q, aa_mut, bases in zip(quality_thresholds, aa_mutations, mappable_bases):
1033
1059
  f.write(f"{q},{aa_mut:.6f},{bases}\n")
1034
-
1060
+
1035
1061
  logging.info(f"QC data saved to: {qc_data_path}")
1036
-
1062
+
1037
1063
  except Exception as e:
1038
1064
  logging.error(f"Error creating QC plot: {e}")
1039
1065
 
@@ -1330,74 +1356,74 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
1330
1356
  except Exception as e:
1331
1357
  logging.error(f"Error in segmented analysis: {e}")
1332
1358
  return None
1359
+
1360
def calculate_qscore_weighted_mismatches(sam_file, ref_seq, qscore_stats):
    """
    Calculate mismatches weighted by Q-score uncertainty with proper sampling error.

    Every aligned (read base, reference base) pair is given the weight
    1 - qscore_uncertainty_factor(Q), where Q is the average Q-score recorded
    for that reference position (falling back to the overall mean Q-score).

    Args:
        sam_file: Path to SAM file
        ref_seq: Reference sequence
        qscore_stats: Q-score statistics from extract_qscores_from_sam; read via
            the keys 'position_avg_qscores' (mapping ref_pos -> Q-score) and
            'mean_qscore'

    Returns:
        tuple: (weighted_mismatches, total_weighted_coverage, raw_mismatches,
        raw_coverage, position_weights, position_outcomes). On any error the
        failure is logged and (0.0, 0.0, 0, 0, [], []) is returned.
    """
    try:
        weighted_mismatches = 0.0
        total_weighted_coverage = 0.0
        raw_mismatches = 0
        raw_coverage = 0

        # Store position-level data for proper sampling error calculation
        position_weights = []
        position_outcomes = []

        position_qscores = qscore_stats['position_avg_qscores']

        with pysam.AlignmentFile(sam_file, "r") as samfile:
            for read in samfile:
                # A mapped record can still carry no sequence; indexing None
                # would raise and the outer handler would then throw away the
                # statistics of every other read. Skip such records instead,
                # matching the guard used in compute_mismatch_stats_sam.
                if read.is_unmapped or read.query_sequence is None:
                    continue

                # Get aligned pairs (read_pos, ref_pos)
                for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
                    # Insertions/deletions have one side missing
                    if ref_pos is None or read_pos is None:
                        continue

                    if ref_pos >= len(ref_seq):
                        continue

                    # Get base calls
                    read_base = read.query_sequence[read_pos].upper()
                    ref_base = ref_seq[ref_pos].upper()

                    # Skip if either base is N
                    if read_base == 'N' or ref_base == 'N':
                        continue

                    # Get Q-score for this position
                    qscore = position_qscores.get(ref_pos, qscore_stats['mean_qscore'])
                    uncertainty_factor = qscore_uncertainty_factor(qscore)

                    # Weight by uncertainty (lower Q-score = higher uncertainty = lower weight)
                    weight = 1.0 - uncertainty_factor
                    is_mismatch = read_base != ref_base

                    # Store position-level data
                    position_weights.append(weight)
                    position_outcomes.append(1 if is_mismatch else 0)

                    # Count coverage
                    total_weighted_coverage += weight
                    raw_coverage += 1

                    # Count mismatches
                    if is_mismatch:
                        weighted_mismatches += weight
                        raw_mismatches += 1

        return weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes

    except Exception as e:
        logging.error(f"Error calculating Q-score weighted mismatches: {e}")
        return 0.0, 0.0, 0, 0, [], []
@@ -1827,6 +1853,289 @@ def simulate_aa_distribution(lambda_bp, cds_seq, n_trials=1000):
1827
1853
 
1828
1854
  return aa_diffs
1829
1855
 
1856
def create_output_directories(results_dir):
    """
    Build the results tree: <results_dir>/detailed/ and <results_dir>/detailed/qc_plots/.

    Args:
        results_dir: Base results directory path (string or path-like)

    Returns:
        dict: Path objects stored under 'results_dir', 'detailed_dir' and 'qc_plots_dir'
    """
    base = Path(results_dir)
    detailed = base / "detailed"
    qc_plots = detailed / "qc_plots"

    # Existing directories are accepted; missing parents are created.
    for folder in (detailed, qc_plots):
        folder.mkdir(parents=True, exist_ok=True)

    logging.info(f"Created output directories: {detailed}, {qc_plots}")

    return dict(results_dir=base, detailed_dir=detailed, qc_plots_dir=qc_plots)
1880
+
1881
def write_key_findings(results_dir, consensus_info, simple_lambda, simple_aa_mean, is_protein, hit_seq):
    """
    Generate lay-user executive summary KEY_FINDINGS.txt.

    The headline number is the consensus mean when consensus_info provides one,
    otherwise simple_aa_mean; if neither is available the report says the
    estimate could not be calculated.

    Args:
        results_dir: Results directory path; KEY_FINDINGS.txt is written directly inside it
        consensus_info: Consensus AA mutation estimate from QC analysis. Read via the
            keys "consensus_mean", "consensus_std", "thresholds_used" and "note";
            may be None/empty when the consensus analysis is unavailable
        simple_lambda: Simple lambda (bp mutations per copy) from main analysis
            (accepted for interface compatibility; not used in the report)
        simple_aa_mean: Simple AA mutation mean from Monte Carlo simulation
        is_protein: Whether the region is protein-coding
        hit_seq: The hit sequence (accepted for interface compatibility; not used in the report)
    """
    key_findings_path = Path(results_dir) / "KEY_FINDINGS.txt"
    section_rule = "-" * 45 + "\n"

    # Determine which value to use as the "headline" number
    if consensus_info and consensus_info.get("consensus_mean") is not None:
        headline_aa = consensus_info["consensus_mean"]
        headline_std = consensus_info.get("consensus_std")
        method_note = "consensus (precision-weighted average across Q-score thresholds)"
    elif simple_aa_mean is not None:
        headline_aa = simple_aa_mean
        headline_std = 0.0  # Simple method doesn't provide error
        method_note = "Monte Carlo simulation (single Q-score)"
    else:
        headline_aa = None
        headline_std = 0.0
        method_note = "N/A"

    # A "consensus_std" key that is present but None would crash the
    # fixed-point formatting below; report it as zero uncertainty instead.
    if headline_std is None:
        headline_std = 0.0

    with open(key_findings_path, "w", encoding="utf-8") as f:
        f.write("=" * 60 + "\n")
        f.write("EP LIBRARY PROFILER - KEY FINDINGS\n")
        f.write("=" * 60 + "\n\n")

        f.write("EXPECTED AMINO ACID MUTATIONS PER GENE COPY\n")
        f.write(section_rule)
        if is_protein and headline_aa is not None:
            f.write(f" {headline_aa:.2f} +/- {headline_std:.2f} AA mutations per gene copy\n")
            f.write(f" (Method: {method_note})\n\n")

            # Plain-language interpretation using Poisson distribution
            f.write("WHAT THIS MEANS (Poisson distribution):\n")
            f.write(section_rule)
            if headline_aa > 0:
                # P(k=0) = e^(-lambda)
                p_wildtype = np.exp(-headline_aa) * 100
                # P(k=1) = lambda * e^(-lambda)
                p_one_mut = headline_aa * np.exp(-headline_aa) * 100
                # P(k>=2) = 1 - P(0) - P(1); clamp because floating-point
                # round-off can push the difference slightly below zero for
                # very small lambda, which would print as "-0.0%".
                p_two_plus = max(0.0, 100 - p_wildtype - p_one_mut)

                f.write(f" ~{p_wildtype:.1f}% of gene copies are wild-type (0 AA mutations)\n")
                f.write(f" ~{p_one_mut:.1f}% have exactly 1 AA mutation\n")
                f.write(f" ~{p_two_plus:.1f}% have 2 or more AA mutations\n\n")
            else:
                f.write(" Nearly all gene copies are expected to be wild-type.\n\n")
        elif not is_protein:
            f.write(" Region is not protein-coding; AA mutation estimate not applicable.\n\n")
        else:
            f.write(" AA mutation estimate could not be calculated.\n\n")

        # Quality assessment
        f.write("QUALITY ASSESSMENT\n")
        f.write(section_rule)
        if consensus_info:
            n_thresholds = len(consensus_info.get("thresholds_used", []))
            note = consensus_info.get("note", "")
            fell_back = note == "FELL_BACK_TO_MAX_COVERAGE"

            if n_thresholds >= 3 and not fell_back:
                f.write(" GOOD - Multiple Q-score thresholds contributed to consensus\n")
            elif n_thresholds >= 1:
                f.write(" ACCEPTABLE - Limited Q-score thresholds available\n")
            else:
                f.write(" LOW COVERAGE - Results may be unreliable\n")

            if fell_back:
                f.write(" WARNING: Fell back to max-coverage threshold due to low coverage\n")
        else:
            f.write(" UNKNOWN - Consensus analysis not available\n")

        f.write("\n")
        f.write("FOR DETAILED TECHNICAL INFORMATION\n")
        f.write(section_rule)
        f.write(" See the detailed/ folder for:\n")
        f.write(" - methodology_notes.txt: Full explanation of calculations\n")
        f.write(" - lambda_comparison.csv: Side-by-side lambda estimates\n")
        f.write(" - comprehensive_qc_data.csv: All Q-score threshold results\n")
        f.write("\n")

    logging.info(f"Wrote KEY_FINDINGS.txt to: {key_findings_path}")
1972
+
1973
def write_lambda_comparison(detailed_dir, simple_lambda, simple_aa_mean, consensus_info, hit_seq_length):
    """
    Write CSV comparing all lambda estimates side-by-side.

    The file lambda_comparison.csv always has the six columns
    method, lambda_bp, lambda_error, aa_estimate, aa_error, notes, and one row
    each for the "simple" and "consensus_weighted" methods. Values that are not
    available are written as "N/A". Fields are written through the csv module
    so notes containing commas (e.g. the list of thresholds) stay in one column.

    Args:
        detailed_dir: Path to detailed/ directory
        simple_lambda: Simple lambda (bp mutations per copy); may be None
        simple_aa_mean: Simple AA mutation mean from Monte Carlo; may be None
        consensus_info: Consensus info from QC analysis. Read via the keys
            "consensus_mean", "consensus_std" and "thresholds_used"; may be None/empty
        hit_seq_length: Length of the hit sequence (accepted for interface
            compatibility; not used in the output)
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import csv

    lambda_csv_path = Path(detailed_dir) / "lambda_comparison.csv"

    def _fmt(value, spec):
        """Format a number with the given spec, or 'N/A' when it is missing."""
        return "N/A" if value is None else format(value, spec)

    rows = [["method", "lambda_bp", "lambda_error", "aa_estimate", "aa_error", "notes"]]

    # Simple method (from main analysis). It computes no error estimates.
    # The formula text used to be written as an extra field ahead of the
    # lambda value, shifting every later value one column to the right of its
    # header; it now lives in the notes column.
    rows.append([
        "simple",
        _fmt(simple_lambda, ".6f"),
        "N/A",
        _fmt(simple_aa_mean, ".4f"),
        "N/A",
        "lambda_bp = (hit_rate - bg_rate) * seq_len; "
        "Used for KDE plot and Monte Carlo simulation",
    ])

    # Consensus method (from QC analysis)
    if consensus_info and consensus_info.get("consensus_mean") is not None:
        consensus_mean = consensus_info["consensus_mean"]
        consensus_std = consensus_info.get("consensus_std")
        if consensus_std is None:
            consensus_std = 0.0
        thresholds = consensus_info.get("thresholds_used", [])
        # Consensus is in AA mutations, back-calculate approximate lambda
        # Rough approximation: lambda_bp ~ 3 * aa_mutations
        rows.append([
            "consensus_weighted",
            f"{consensus_mean * 3.0:.6f}",
            f"{consensus_std * 3.0:.6f}",
            f"{consensus_mean:.4f}",
            f"{consensus_std:.4f}",
            f"Precision-weighted across Q-scores: {thresholds}",
        ])
    else:
        rows.append(["consensus_weighted", "N/A", "N/A", "N/A", "N/A",
                     "Not computed or insufficient data"])

    # newline="" plus an explicit "\n" terminator keeps the same line endings
    # the hand-written version produced while letting csv quote where needed.
    with open(lambda_csv_path, "w", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(rows)

    logging.info(f"Wrote lambda_comparison.csv to: {lambda_csv_path}")
2015
+
2016
def write_methodology_notes(detailed_dir):
    """
    Write methodology_notes.txt, the static document describing each lambda
    calculation method and which output files use which estimate.

    Args:
        detailed_dir: Path to detailed/ directory
    """
    notes_text = """EP LIBRARY PROFILER - METHODOLOGY NOTES
=======================================

This document explains the different mutation rate estimates produced by the
EP library profiler and which outputs use which estimates.


LAMBDA CALCULATION METHODS
--------------------------

1. SIMPLE LAMBDA (used for KDE plot and Monte Carlo simulation)

Formula: lambda_bp = (hit_rate - bg_rate) * sequence_length

Where:
- hit_rate = total_mismatches / total_covered_bases (in target region)
- bg_rate = total_mismatches / total_covered_bases (in plasmid excluding target)
- sequence_length = length of target CDS in base pairs

This method:
- Does NOT include error propagation
- Does NOT weight by Q-score
- Is fast and provides a point estimate

Used in:
- summary_panels.png/pdf (Panel 4: KDE of AA mutations)
- summary.txt
- aa_mutation_distribution.csv


2. Q-SCORE WEIGHTED LAMBDA (used in comprehensive QC analysis)

Formula: lambda_bp_weighted = net_weighted_rate * sequence_length

Where:
- net_weighted_rate = hit_weighted_rate - bg_weighted_rate
- Weighted rates account for per-base Q-score uncertainty
- Weights = 1 - sqrt(10^(-Q/10)) for each position

This method:
- DOES include error propagation
- DOES weight by Q-score (higher Q-score = higher weight)
- Provides confidence intervals

Used in:
- comprehensive_qc_data.csv
- error_analysis.png


3. CONSENSUS LAMBDA (recommended for reporting)

Formula: Precision-weighted average across Q-score thresholds

weights[i] = 1 / std_aa_mutations[i]
consensus_mean = sum(weights * means) / sum(weights)

This method:
- Aggregates estimates from multiple Q-score filtering thresholds
- Weights by precision (lower uncertainty = higher weight)
- Requires minimum coverage threshold (default 1000 mappable bases)
- Provides the most robust estimate when multiple thresholds pass QC

Used in:
- aa_mutation_consensus.txt
- KEY_FINDINGS.txt
- QC plots (red dashed line)


WHICH VALUE SHOULD I USE?
-------------------------

For publication/reporting:
Use the CONSENSUS value from aa_mutation_consensus.txt or KEY_FINDINGS.txt
This is the most statistically robust estimate.

For understanding the distribution shape:
Use the KDE plot in summary_panels.png
Note: This uses the SIMPLE lambda, not the consensus.

For detailed error analysis:
Use comprehensive_qc_data.csv in the detailed/ folder
This contains per-Q-score estimates with full error propagation.


OUTPUT FILE REFERENCE
---------------------

Root folder:
- KEY_FINDINGS.txt: Executive summary with consensus AA mutations
- summary_panels.png/pdf: Main visualization (uses simple lambda for KDE)
- aa_mutation_consensus.txt: Consensus estimate details

detailed/ folder:
- methodology_notes.txt: This file
- lambda_comparison.csv: Side-by-side comparison of all methods
- comprehensive_qc_data.csv: Full QC data with error estimates
- simple_qc_data.csv: Simplified QC data
- gene_mismatch_rates.csv: Per-position mismatch rates
- base_distribution.csv: Base counts at each position
- aa_substitutions.csv: Amino acid substitution data
- plasmid_coverage.csv: Coverage across plasmid
- aa_mutation_distribution.csv: Monte Carlo AA mutation trials

detailed/qc_plots/ folder:
- qc_plot_*.png: Q-score threshold analysis plot
- comprehensive_qc_analysis.png: Detailed QC visualization
- error_analysis.png: Error component breakdown
"""

    target = Path(detailed_dir) / "methodology_notes.txt"
    # Text-mode write with the platform default encoding, replacing any existing file.
    target.write_text(notes_text)

    logging.info(f"Wrote methodology_notes.txt to: {target}")
2138
+
1830
2139
  def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, work_dir, results_dir,
1831
2140
  chunks, ref_hit_fasta, plasmid_fasta, hit_seq, hit_id, plasmid_seq, idx):
1832
2141
  """
@@ -1854,13 +2163,18 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
1854
2163
 
1855
2164
  # Ensure work directory exists
1856
2165
  os.makedirs(work_dir, exist_ok=True)
1857
-
2166
+
1858
2167
  # Create subdirectory for this Q-score analysis
1859
2168
  qscore_results_dir = results_dir
1860
2169
  if qscore is not None:
1861
2170
  qscore_results_dir = os.path.join(results_dir, f"q{qscore}_analysis")
1862
2171
  os.makedirs(qscore_results_dir, exist_ok=True)
1863
-
2172
+
2173
+ # Create output directory structure (detailed/ and detailed/qc_plots/)
2174
+ output_dirs = create_output_directories(qscore_results_dir)
2175
+ detailed_dir = output_dirs['detailed_dir']
2176
+ qc_plots_dir = output_dirs['qc_plots_dir']
2177
+
1864
2178
  # Write chunks FASTA & align to background‐chunks
1865
2179
  chunks_fasta = create_multi_fasta(chunks, work_dir)
1866
2180
  sam_chunks = run_minimap2(fastq_path, chunks_fasta, "plasmid_chunks_alignment", work_dir)
@@ -1976,9 +2290,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
1976
2290
  qscore_info = f" ({qscore_desc})" if qscore_desc != "unfiltered" else ""
1977
2291
 
1978
2292
  # ----------------------------
1979
- # SAVE CSV FOR MUTATION RATES (PANEL 1)
2293
+ # SAVE CSV FOR MUTATION RATES (PANEL 1) - to detailed/
1980
2294
  # ----------------------------
1981
- gene_mismatch_csv = os.path.join(qscore_results_dir, "gene_mismatch_rates.csv")
2295
+ gene_mismatch_csv = os.path.join(detailed_dir, "gene_mismatch_rates.csv")
1982
2296
  with open(gene_mismatch_csv, "w", newline="") as csvfile:
1983
2297
  csvfile.write(f"# gene_id: {hit_id}\n")
1984
2298
  csvfile.write(f"# background_rate_per_kb: {bg_rate_per_kb:.6f}\n")
@@ -1988,9 +2302,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
1988
2302
  logging.info(f"Saved CSV for gene mismatch rates: {gene_mismatch_csv}")
1989
2303
 
1990
2304
  # ----------------------------
1991
- # SAVE CSV FOR BASE DISTRIBUTION (PANEL 2)
2305
+ # SAVE CSV FOR BASE DISTRIBUTION (PANEL 2) - to detailed/
1992
2306
  # ----------------------------
1993
- base_dist_csv = os.path.join(qscore_results_dir, "base_distribution.csv")
2307
+ base_dist_csv = os.path.join(detailed_dir, "base_distribution.csv")
1994
2308
  with open(base_dist_csv, "w", newline="") as csvfile:
1995
2309
  csvfile.write(f"# gene_id: {hit_id}\n")
1996
2310
  csvfile.write("position_1based,ref_base,A_count,C_count,G_count,T_count,N_count\n")
@@ -2000,10 +2314,10 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2000
2314
  logging.info(f"Saved CSV for base distribution: {base_dist_csv}")
2001
2315
 
2002
2316
  # ----------------------------
2003
- # SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - only if protein
2317
+ # SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - to detailed/ - only if protein
2004
2318
  # ----------------------------
2005
2319
  if is_protein:
2006
- aa_subst_csv = os.path.join(qscore_results_dir, "aa_substitutions.csv")
2320
+ aa_subst_csv = os.path.join(detailed_dir, "aa_substitutions.csv")
2007
2321
  with open(aa_subst_csv, "w", newline="") as csvfile:
2008
2322
  csvfile.write(f"# gene_id: {hit_id}\n")
2009
2323
  csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
@@ -2013,9 +2327,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2013
2327
  logging.info(f"Saved CSV for AA substitutions: {aa_subst_csv}")
2014
2328
 
2015
2329
  # ----------------------------
2016
- # SAVE CSV FOR PLASMID COVERAGE (PANEL 4)
2330
+ # SAVE CSV FOR PLASMID COVERAGE (PANEL 4) - to detailed/
2017
2331
  # ----------------------------
2018
- plasmid_cov_csv = os.path.join(qscore_results_dir, "plasmid_coverage.csv")
2332
+ plasmid_cov_csv = os.path.join(detailed_dir, "plasmid_coverage.csv")
2019
2333
  with open(plasmid_cov_csv, "w", newline="") as csvfile:
2020
2334
  csvfile.write("position_1based,coverage\n")
2021
2335
  for pos0, cov in enumerate(plasmid_cov):
@@ -2023,9 +2337,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2023
2337
  logging.info(f"Saved CSV for plasmid coverage: {plasmid_cov_csv}")
2024
2338
 
2025
2339
  # ----------------------------
2026
- # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3)
2340
+ # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3) - to detailed/
2027
2341
  # ----------------------------
2028
- aa_dist_csv = os.path.join(qscore_results_dir, "aa_mutation_distribution.csv")
2342
+ aa_dist_csv = os.path.join(detailed_dir, "aa_mutation_distribution.csv")
2029
2343
  with open(aa_dist_csv, "w", newline="") as csvfile:
2030
2344
  csvfile.write(f"# gene_id: {hit_id}\n")
2031
2345
  csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
@@ -2135,7 +2449,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2135
2449
  if is_protein and aa_diffs and len(aa_diffs) > 0:
2136
2450
  x_vals = np.array(aa_diffs)
2137
2451
  unique_vals = np.unique(x_vals)
2138
-
2452
+
2139
2453
  if len(unique_vals) > 1:
2140
2454
  # Multiple unique values - use KDE or histogram
2141
2455
  if HAVE_SCIPY:
@@ -2149,15 +2463,23 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2149
2463
  ax3.set_ylim(bottom=0)
2150
2464
  except Exception as e:
2151
2465
  logging.warning(f"KDE failed: {e}, falling back to histogram")
2152
- ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2466
+ ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2153
2467
  color="#C44E52", alpha=0.7, density=True, edgecolor='black')
2154
2468
  else:
2155
- ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2469
+ ax3.hist(x_vals, bins=min(20, len(unique_vals)),
2156
2470
  color="#C44E52", alpha=0.7, density=True, edgecolor='black')
2157
2471
  else:
2158
2472
  # Single unique value - just show a bar
2159
2473
  ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
2160
2474
  ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
2475
+
2476
+ # Set title with lambda value for protein-coding sequences
2477
+ ax3.set_title(f"AA Mutation Distribution (Monte Carlo, \u03bb={est_mut_per_copy:.2f}){qscore_info}",
2478
+ fontsize=14, fontweight='bold')
2479
+ ax3.set_xlabel("Number of AA Mutations", fontsize=12)
2480
+ ax3.set_ylabel("Density", fontsize=12)
2481
+ ax3.spines['top'].set_visible(False)
2482
+ ax3.spines['right'].set_visible(False)
2161
2483
  else:
2162
2484
  # Not protein or no AA differences — display an informative message
2163
2485
  ax3.text(
@@ -2170,7 +2492,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2170
2492
  color="gray",
2171
2493
  transform=ax3.transAxes,
2172
2494
  )
2173
-
2495
+
2174
2496
  ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
2175
2497
  ax3.set_xlabel("Number of AA Mutations", fontsize=12)
2176
2498
  ax3.set_ylabel("Density", fontsize=12)
@@ -2231,9 +2553,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2231
2553
  sample_percent[cat] = 0.0
2232
2554
 
2233
2555
  # ----------------------------
2234
- # GENERATE PDF TABLE (MUTATION SPECTRUM)
2556
+ # GENERATE PDF TABLE (MUTATION SPECTRUM) - to detailed/
2235
2557
  # ----------------------------
2236
- pdf_path = os.path.join(qscore_results_dir, f"{sample_name}_mutation_spectrum.pdf")
2558
+ pdf_path = os.path.join(detailed_dir, f"{sample_name}_mutation_spectrum.pdf")
2237
2559
  # Prepare table data
2238
2560
  table_rows = []
2239
2561
  for cat in categories:
@@ -2341,9 +2663,6 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
2341
2663
  }
2342
2664
 
2343
2665
 
2344
-
2345
- main()
2346
-
2347
2666
  def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
2348
2667
  paths: List[Path] = []
2349
2668
  for item in inputs:
@@ -2396,7 +2715,7 @@ def run_ep_library_profile(
2396
2715
  master_summary_path.write_text(header + "\n", encoding="utf-8")
2397
2716
 
2398
2717
  sample_results: List[Dict[str, object]] = []
2399
- for fastq in fastq_paths:
2718
+ for fastq in tqdm(fastq_paths, desc="Processing FASTQ files", unit="file"):
2400
2719
  result = process_single_fastq(
2401
2720
  fastq,
2402
2721
  region_fasta,
@@ -2503,6 +2822,7 @@ def process_single_fastq(
2503
2822
 
2504
2823
  logging.info("Running QC analysis to get Q-score results...")
2505
2824
  qc_results = None
2825
+ consensus_info = None
2506
2826
  try:
2507
2827
  qc_results, consensus_info = run_qc_analysis(
2508
2828
  str(fastq_path),
@@ -2563,6 +2883,45 @@ def process_single_fastq(
2563
2883
  )
2564
2884
  analysis_results.append(result)
2565
2885
 
2886
+ # Generate unified summary files in the sample's root results directory
2887
+ # Get simple lambda from the unfiltered analysis (first result)
2888
+ simple_lambda = 0.0
2889
+ simple_aa_mean = None
2890
+ is_protein = False
2891
+ unfiltered_result = analysis_results[0] if analysis_results else None
2892
+ if unfiltered_result:
2893
+ simple_lambda = unfiltered_result.get('est_mut_per_copy', 0.0)
2894
+ simple_aa_mean = unfiltered_result.get('avg_aa_mutations')
2895
+ is_protein = unfiltered_result.get('is_protein', False)
2896
+
2897
+ # Create output directories and generate summary files
2898
+ output_dirs = create_output_directories(results_dir)
2899
+ detailed_dir = output_dirs['detailed_dir']
2900
+
2901
+ # Write KEY_FINDINGS.txt (lay-user summary)
2902
+ write_key_findings(
2903
+ results_dir,
2904
+ consensus_info,
2905
+ simple_lambda,
2906
+ simple_aa_mean,
2907
+ is_protein,
2908
+ hit_seq,
2909
+ )
2910
+
2911
+ # Write lambda_comparison.csv
2912
+ write_lambda_comparison(
2913
+ detailed_dir,
2914
+ simple_lambda,
2915
+ simple_aa_mean,
2916
+ consensus_info,
2917
+ len(hit_seq),
2918
+ )
2919
+
2920
+ # Write methodology_notes.txt
2921
+ write_methodology_notes(detailed_dir)
2922
+
2923
+ logging.info("Generated unified summary files: KEY_FINDINGS.txt, lambda_comparison.csv, methodology_notes.txt")
2924
+
2566
2925
  if work_dir.exists():
2567
2926
  shutil.rmtree(work_dir)
2568
2927
  logging.info("Removed temporary work directory: %s", work_dir)
@@ -2573,5 +2932,6 @@ def process_single_fastq(
2573
2932
  "sample": sample_name,
2574
2933
  "results_dir": results_dir,
2575
2934
  "analysis_results": analysis_results,
2935
+ "consensus_info": consensus_info,
2576
2936
  }
2577
2937