PyPI - uht-tooling - Versions diffs - 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

uht-tooling 0.1.9-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

uht_tooling/cli.py +153 -4
uht_tooling/config.py +137 -0
uht_tooling/tools.py +143 -0
uht_tooling/workflows/gui.py +19 -0
uht_tooling/workflows/mut_rate.py +484 -124
uht_tooling/workflows/mutation_caller.py +11 -2
uht_tooling/workflows/umi_hunter.py +9 -4
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/METADATA +123 -5
uht_tooling-0.3.0.dist-info/RECORD +20 -0
uht_tooling-0.1.9.dist-info/RECORD +0 -18
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/WHEEL +0 -0
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/entry_points.txt +0 -0
{uht_tooling-0.1.9.dist-info → uht_tooling-0.3.0.dist-info}/top_level.txt +0 -0

uht_tooling/workflows/mut_rate.py CHANGED Viewed

@@ -16,6 +16,7 @@ import math
 import tempfile
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+from tqdm import tqdm
 # Use a built-in Matplotlib style ("ggplot") for consistency
 plt.style.use("ggplot")
@@ -219,7 +220,12 @@ def compute_mismatch_stats_sam(sam_file, refs_dict):
     logging.info(f"Computing mismatch stats for {sam_file}")
     samfile = pysam.AlignmentFile(sam_file, "r")
-    for read in samfile.fetch():
+    # Count total aligned reads for progress bar
+    total_reads = sum(1 for _ in samfile.fetch())
+    samfile.close()
+    samfile = pysam.AlignmentFile(sam_file, "r")
+    for read in tqdm(samfile.fetch(), desc="Computing mismatch stats", total=total_reads, unit="read"):
         if read.is_unmapped or read.query_sequence is None:
             continue
         ref_name = samfile.get_reference_name(read.reference_id)
@@ -743,40 +749,50 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
         plt.tight_layout()
-        # Save the plot
+        # Create detailed/qc_plots/ subdirectory for QC plots
+        detailed_dir = os.path.join(results_dir, "detailed")
+        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
+        os.makedirs(qc_plots_dir, exist_ok=True)
+        # Save the plot to detailed/qc_plots/
         project_name = os.path.basename(results_dir)
-        qc_plot_path = os.path.join(results_dir, f"qc_plot_{project_name}.png")
+        qc_plot_path = os.path.join(qc_plots_dir, f"qc_plot_{project_name}.png")
         fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
         plt.close(fig)
         logging.info(f"QC plot saved to: {qc_plot_path}")
-        # Save data as CSV
-        qc_data_path = os.path.join(results_dir, "simple_qc_data.csv")
+        # Save data as CSV to detailed/
+        qc_data_path = os.path.join(detailed_dir, "simple_qc_data.csv")
         with open(qc_data_path, 'w') as f:
             f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
             f.write("total_mappable_bases,n_segments\n")
             for q, r in zip(quality_thresholds, qc_results):
                 f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
                 f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
                 f.write(f"{r['total_mappable_bases']},{r['n_segments']}\n")
         logging.info(f"Simple QC data saved to: {qc_data_path}")
     except Exception as e:
         logging.error(f"Error creating simple QC plots: {e}")
+def create_comprehensive_qc_plots(quality_thresholds, qc_results, results_dir):
     """
     Create comprehensive QC plots with error bars and uncertainty quantification.
     Args:
         quality_thresholds: List of quality score thresholds
         qc_results: List of comprehensive analysis results
         results_dir: Directory to save the plots
-        optimal_qscore: Optimal Q-score threshold (optional)
-        optimal_result: Optimal result data (optional)
     """
     try:
+        # Create detailed/qc_plots/ subdirectory
+        detailed_dir = os.path.join(results_dir, "detailed")
+        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
+        os.makedirs(qc_plots_dir, exist_ok=True)
         # Extract data for plotting
         aa_mutations = [r['mean_aa_mutations'] for r in qc_results]
         aa_errors = [r['std_aa_mutations'] for r in qc_results]
@@ -785,46 +801,46 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
         mappable_bases = [r['mappable_bases'] for r in qc_results]
         net_rates = [r['net_rate'] for r in qc_results]
         net_rate_errors = [r['net_rate_error'] for r in qc_results]
         # Create main QC plot with error bars
         fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12))
         # Top plot: AA mutations per gene with error bars
         color1 = '#2E8B57'
-        ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
-                    fmt='o', capsize=5, capthick=2, markersize=8,
+        ax1.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
+                    fmt='o', capsize=5, capthick=2, markersize=8,
                     color=color1, ecolor=color1, alpha=0.8, label='Mean ± Std')
         # Add confidence intervals as shaded area
-        ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
+        ax1.fill_between(quality_thresholds, aa_ci_lower, aa_ci_upper,
                         alpha=0.3, color=color1, label='95% Confidence Interval')
         ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
         ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
         ax1.tick_params(axis='y', labelcolor=color1)
-        ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
+        ax1.set_title('AA Mutations per Gene vs Quality Score Filter (with Error Propagation)',
                      fontsize=14, fontweight='bold')
         ax1.grid(True, alpha=0.3)
         ax1.legend(frameon=False, fontsize=10)
         # Add data point labels
         for i, (q, aa_mut, aa_err) in enumerate(zip(quality_thresholds, aa_mutations, aa_errors)):
-            ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
-                        (q, aa_mut), xytext=(5, 5),
+            ax1.annotate(f'Q{q}\n{aa_mut:.3f}±{aa_err:.3f}',
+                        (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color1)
         # Bottom plot: Mappable bases and AA mutations per gene
         color2 = '#FF6B6B'
         color3 = '#4169E1'
         # Mappable bases (left y-axis)
         ax2_twin = ax2.twinx()
-        ax2_twin.scatter(quality_thresholds, mappable_bases,
-                        s=100, alpha=0.7, color=color2, edgecolors='black',
+        ax2_twin.scatter(quality_thresholds, mappable_bases,
+                        s=100, alpha=0.7, color=color2, edgecolors='black',
                         linewidth=1, marker='s', label='Mappable Bases')
         ax2_twin.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
         ax2_twin.tick_params(axis='y', labelcolor=color2)
         # AA mutations per gene with error bars (right y-axis)
         ax2.errorbar(quality_thresholds, aa_mutations, yerr=aa_errors,
                     fmt='^', capsize=5, capthick=2, markersize=8,
@@ -832,34 +848,34 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
         ax2.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color3)
         ax2.tick_params(axis='y', labelcolor=color3)
         ax2.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
-        ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
+        ax2.set_title('Mappable Bases and AA Mutations per Gene vs Quality Score Filter',
                      fontsize=14, fontweight='bold')
         ax2.grid(True, alpha=0.3)
         # Add legends
         lines1, labels1 = ax2.get_legend_handles_labels()
         lines2, labels2 = ax2_twin.get_legend_handles_labels()
         ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
         # Add data point labels for mappable bases
         for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
-            ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
+            ax2_twin.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                              textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
         plt.tight_layout()
-        # Save the comprehensive plot
-        qc_plot_path = os.path.join(results_dir, "comprehensive_qc_analysis.png")
+        # Save the comprehensive plot to detailed/qc_plots/
+        qc_plot_path = os.path.join(qc_plots_dir, "comprehensive_qc_analysis.png")
         fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
         plt.close(fig)
         logging.info(f"Comprehensive QC plot saved to: {qc_plot_path}")
         # Create error analysis plot
         create_error_analysis_plot(quality_thresholds, qc_results, results_dir)
-        # Save comprehensive data as CSV
-        qc_data_path = os.path.join(results_dir, "comprehensive_qc_data.csv")
+        # Save comprehensive data as CSV to detailed/
+        qc_data_path = os.path.join(detailed_dir, "comprehensive_qc_data.csv")
         with open(qc_data_path, 'w') as f:
             f.write("quality_threshold,mean_aa_mutations,std_aa_mutations,ci_lower,ci_upper,")
             f.write("mappable_bases,hit_rate,hit_rate_ci_lower,hit_rate_ci_upper,")
@@ -869,7 +885,7 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
             f.write("bg_qscore_mean,bg_qscore_std,bg_qscore_uncertainty,")
             f.write("hit_weighted_rate,hit_weighted_error,bg_weighted_rate,bg_weighted_error,")
             f.write("net_weighted_rate,net_weighted_error,lambda_bp_weighted,lambda_error_weighted\n")
             for q, r in zip(quality_thresholds, qc_results):
                 f.write(f"{q},{r['mean_aa_mutations']:.6f},{r['std_aa_mutations']:.6f},")
                 f.write(f"{r['ci_lower']:.6f},{r['ci_upper']:.6f},")
@@ -878,93 +894,98 @@ def create_simple_qc_plots(quality_thresholds, qc_results, results_dir, consensu
                 f.write(f"{r['bg_rate']:.6f},{r['bg_rate_ci'][0]:.6f},{r['bg_rate_ci'][1]:.6f},")
                 f.write(f"{r['net_rate']:.6f},{r['net_rate_error']:.6f},")
                 f.write(f"{r['lambda_bp']:.6f},{r['lambda_error']:.6f},{r['alignment_error']:.6f},")
                 # Q-score information
                 hit_qscore_mean = r['hit_qscore_stats']['mean_qscore'] if r['hit_qscore_stats'] else 0.0
                 hit_qscore_std = r['hit_qscore_stats']['std_qscore'] if r['hit_qscore_stats'] else 0.0
                 bg_qscore_mean = r['bg_qscore_stats']['mean_qscore'] if r['bg_qscore_stats'] else 0.0
                 bg_qscore_std = r['bg_qscore_stats']['std_qscore'] if r['bg_qscore_stats'] else 0.0
                 f.write(f"{hit_qscore_mean:.2f},{hit_qscore_std:.2f},{r['hit_qscore_uncertainty']:.6f},")
                 f.write(f"{bg_qscore_mean:.2f},{bg_qscore_std:.2f},{r['bg_qscore_uncertainty']:.6f},")
                 f.write(f"{r.get('hit_weighted_rate', 0.0):.6f},{r.get('hit_weighted_error', 0.0):.6f},")
                 f.write(f"{r.get('bg_weighted_rate', 0.0):.6f},{r.get('bg_weighted_error', 0.0):.6f},")
                 f.write(f"{r.get('net_weighted_rate', 0.0):.6f},{r.get('net_weighted_error', 0.0):.6f},")
                 f.write(f"{r.get('lambda_bp_weighted', 0.0):.6f},{r.get('lambda_error_weighted', 0.0):.6f}\n")
         logging.info(f"Comprehensive QC data saved to: {qc_data_path}")
     except Exception as e:
         logging.error(f"Error creating comprehensive QC plots: {e}")
 def create_error_analysis_plot(quality_thresholds, qc_results, results_dir):
     """
     Create a detailed error analysis plot showing different sources of uncertainty.
     Args:
         quality_thresholds: List of quality score thresholds
         qc_results: List of comprehensive analysis results
         results_dir: Directory to save the plot
     """
     try:
+        # Create detailed/qc_plots/ subdirectory
+        detailed_dir = os.path.join(results_dir, "detailed")
+        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
+        os.makedirs(qc_plots_dir, exist_ok=True)
         fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
         # Extract error components
         aa_std = [r['std_aa_mutations'] for r in qc_results]
         net_rate_errors = [r['net_rate_error'] for r in qc_results]
         lambda_errors = [r['lambda_error'] for r in qc_results]
         alignment_errors = [r['alignment_error'] for r in qc_results]
         mappable_bases = [r['mappable_bases'] for r in qc_results]
         # Plot 1: AA mutation uncertainty vs quality threshold
         ax1.plot(quality_thresholds, aa_std, 'o-', color='#2E8B57', linewidth=2, markersize=6)
         ax1.set_xlabel('Quality Score Threshold')
         ax1.set_ylabel('AA Mutation Standard Deviation')
         ax1.set_title('AA Mutation Uncertainty vs Quality Filter')
         ax1.grid(True, alpha=0.3)
         # Plot 2: Net rate error vs quality threshold
         ax2.plot(quality_thresholds, net_rate_errors, 's-', color='#FF6B6B', linewidth=2, markersize=6)
         ax2.set_xlabel('Quality Score Threshold')
         ax2.set_ylabel('Net Mutation Rate Error')
         ax2.set_title('Net Rate Error vs Quality Filter')
         ax2.grid(True, alpha=0.3)
         # Plot 3: Lambda error vs quality threshold
         ax3.plot(quality_thresholds, lambda_errors, '^-', color='#4169E1', linewidth=2, markersize=6)
         ax3.set_xlabel('Quality Score Threshold')
         ax3.set_ylabel('Lambda Error (mutations per copy)')
         ax3.set_title('Lambda Error vs Quality Filter')
         ax3.grid(True, alpha=0.3)
         # Plot 4: Alignment error vs mappable bases
         ax4.scatter(mappable_bases, alignment_errors, s=100, alpha=0.7, color='#FF8C00')
         ax4.set_xlabel('Mappable Bases')
-        ax4.set_ylabel('Alignment Error (1/√reads)')
+        ax4.set_ylabel('Alignment Error (1/sqrt(reads))')
         ax4.set_title('Alignment Error vs Read Count')
         ax4.grid(True, alpha=0.3)
         # Add quality threshold labels to scatter plot
         for i, q in enumerate(quality_thresholds):
-            ax4.annotate(f'Q{q}', (mappable_bases[i], alignment_errors[i]),
+            ax4.annotate(f'Q{q}', (mappable_bases[i], alignment_errors[i]),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)
         plt.tight_layout()
-        # Save error analysis plot
-        error_plot_path = os.path.join(results_dir, "error_analysis.png")
+        # Save error analysis plot to detailed/qc_plots/
+        error_plot_path = os.path.join(qc_plots_dir, "error_analysis.png")
         fig.savefig(error_plot_path, dpi=300, bbox_inches='tight')
         plt.close(fig)
         logging.info(f"Error analysis plot saved to: {error_plot_path}")
     except Exception as e:
         logging.error(f"Error creating error analysis plot: {e}")
 def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir):
     """
     Create a dual-axis plot showing quality score threshold vs AA mutations per gene and mappable bases.
     Args:
         quality_thresholds: List of quality score thresholds
         aa_mutations: List of corresponding AA mutations per gene
@@ -972,68 +993,73 @@ def create_qc_plot(quality_thresholds, aa_mutations, mappable_bases, results_dir
         results_dir: Directory to save the plot
     """
     try:
+        # Create detailed/qc_plots/ subdirectory
+        detailed_dir = os.path.join(results_dir, "detailed")
+        qc_plots_dir = os.path.join(detailed_dir, "qc_plots")
+        os.makedirs(qc_plots_dir, exist_ok=True)
         # Create the plot with dual y-axes
         fig, ax1 = plt.subplots(figsize=(12, 8))
         # Left y-axis: AA mutations per gene
         color1 = '#2E8B57'
         ax1.set_xlabel('Quality Score Threshold', fontsize=12, fontweight='bold')
         ax1.set_ylabel('Estimated AA Mutations per Gene', fontsize=12, fontweight='bold', color=color1)
-        ax1.scatter(quality_thresholds, aa_mutations,
+        ax1.scatter(quality_thresholds, aa_mutations,
                    s=100, alpha=0.7, color=color1, edgecolors='black', linewidth=1, label='AA Mutations per Gene')
         ax1.tick_params(axis='y', labelcolor=color1)
         # Right y-axis: Mappable bases
         ax2 = ax1.twinx()
         color2 = '#FF6B6B'
         ax2.set_ylabel('Number of Mappable Bases', fontsize=12, fontweight='bold', color=color2)
-        ax2.scatter(quality_thresholds, mappable_bases,
+        ax2.scatter(quality_thresholds, mappable_bases,
                    s=100, alpha=0.7, color=color2, edgecolors='black', linewidth=1, marker='s', label='Mappable Bases')
         ax2.tick_params(axis='y', labelcolor=color2)
         # Customize the plot
         ax1.set_title('AA Mutations per Gene and Mappable Bases vs Quality Score Filter', fontsize=14, fontweight='bold')
         # Add grid for better readability
         ax1.grid(True, alpha=0.3)
         # Customize ticks and spines
         ax1.tick_params(axis='both', which='major', labelsize=10, direction='in', length=6)
         ax1.tick_params(axis='both', which='minor', direction='in', length=3)
         ax1.spines['top'].set_visible(False)
         ax1.spines['right'].set_visible(False)
         # Add data point labels for AA mutations
         for i, (q, aa_mut) in enumerate(zip(quality_thresholds, aa_mutations)):
-            ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
+            ax1.annotate(f'Q{q}', (q, aa_mut), xytext=(5, 5),
                         textcoords='offset points', fontsize=9, alpha=0.8, color=color1)
         # Add data point labels for mappable bases
         for i, (q, bases) in enumerate(zip(quality_thresholds, mappable_bases)):
-            ax2.annotate(f'{reads}', (q, reads), xytext=(5, -15),
+            ax2.annotate(f'{bases}', (q, bases), xytext=(5, -15),
                         textcoords='offset points', fontsize=8, alpha=0.8, color=color2)
         # Add legend
         lines1, labels1 = ax1.get_legend_handles_labels()
         lines2, labels2 = ax2.get_legend_handles_labels()
         ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper right', frameon=False, fontsize=10)
-        # Save the plot
-        qc_plot_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.png")
+        # Save the plot to detailed/qc_plots/
+        qc_plot_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.png")
         fig.savefig(qc_plot_path, dpi=300, bbox_inches='tight')
         plt.close(fig)
         logging.info(f"QC plot saved to: {qc_plot_path}")
-        # Also save data as CSV for reference
-        qc_data_path = os.path.join(results_dir, "qc_mutation_rate_vs_quality.csv")
+        # Also save data as CSV to detailed/qc_plots/
+        qc_data_path = os.path.join(qc_plots_dir, "qc_mutation_rate_vs_quality.csv")
         with open(qc_data_path, 'w') as f:
             f.write("quality_threshold,aa_mutations_per_gene,mappable_bases\n")
             for q, aa_mut, bases in zip(quality_thresholds, aa_mutations, mappable_bases):
                 f.write(f"{q},{aa_mut:.6f},{bases}\n")
         logging.info(f"QC data saved to: {qc_data_path}")
     except Exception as e:
         logging.error(f"Error creating QC plot: {e}")
@@ -1330,74 +1356,74 @@ def run_segmented_analysis(segment_files, quality_threshold, work_dir, ref_hit_f
     except Exception as e:
         logging.error(f"Error in segmented analysis: {e}")
         return None
+def calculate_qscore_weighted_mismatches(sam_file, ref_seq, qscore_stats):
     """
     Calculate mismatches weighted by Q-score uncertainty with proper sampling error.
     Args:
         sam_file: Path to SAM file
         ref_seq: Reference sequence
         qscore_stats: Q-score statistics from extract_qscores_from_sam
     Returns:
         tuple: (weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes)
     """
     try:
-        import pysam
         weighted_mismatches = 0.0
         total_weighted_coverage = 0.0
         raw_mismatches = 0
         raw_coverage = 0
         # Store position-level data for proper sampling error calculation
         position_weights = []
         position_outcomes = []
         position_qscores = qscore_stats['position_avg_qscores']
         with pysam.AlignmentFile(sam_file, "r") as samfile:
             for read in samfile:
                 if read.is_unmapped:
                     continue
                 # Get aligned pairs (read_pos, ref_pos)
                 for read_pos, ref_pos in read.get_aligned_pairs(matches_only=False):
                     if ref_pos is None or read_pos is None:
                         continue
                     if ref_pos >= len(ref_seq):
                         continue
                     # Get base calls
                     read_base = read.query_sequence[read_pos].upper()
                     ref_base = ref_seq[ref_pos].upper()
                     # Skip if either base is N
                     if read_base == 'N' or ref_base == 'N':
                         continue
                     # Get Q-score for this position
                     qscore = position_qscores.get(ref_pos, qscore_stats['mean_qscore'])
                     uncertainty_factor = qscore_uncertainty_factor(qscore)
                     # Weight by uncertainty (lower Q-score = higher uncertainty = lower weight)
                     weight = 1.0 - uncertainty_factor
                     # Store position-level data
                     position_weights.append(weight)
                     position_outcomes.append(1 if read_base != ref_base else 0)
                     # Count coverage
                     total_weighted_coverage += weight
                     raw_coverage += 1
                     # Count mismatches
                     if read_base != ref_base:
                         weighted_mismatches += weight
                         raw_mismatches += 1
         return weighted_mismatches, total_weighted_coverage, raw_mismatches, raw_coverage, position_weights, position_outcomes
     except Exception as e:
         logging.error(f"Error calculating Q-score weighted mismatches: {e}")
         return 0.0, 0.0, 0, 0, [], []
@@ -1827,6 +1853,289 @@ def simulate_aa_distribution(lambda_bp, cds_seq, n_trials=1000):
     return aa_diffs
+def create_output_directories(results_dir):
+    """
+    Ensure the output layout exists: <results_dir>/detailed and <results_dir>/detailed/qc_plots.
+    Directories that already exist are left as they are.
+    Args:
+        results_dir: Base results directory path
+    Returns:
+        dict: Path objects under the keys 'results_dir', 'detailed_dir' and 'qc_plots_dir'
+    """
+    base = Path(results_dir)
+    layout = {
+        'results_dir': base,
+        'detailed_dir': base / "detailed",
+        'qc_plots_dir': base / "detailed" / "qc_plots",
+    }
+    # Parent before child; parents=True also creates a missing base directory
+    for key in ('detailed_dir', 'qc_plots_dir'):
+        layout[key].mkdir(parents=True, exist_ok=True)
+    logging.info(f"Created output directories: {layout['detailed_dir']}, {layout['qc_plots_dir']}")
+    return layout
+def write_key_findings(results_dir, consensus_info, simple_lambda, simple_aa_mean, is_protein, hit_seq):
+    """
+    Write KEY_FINDINGS.txt, a plain-language executive summary, into results_dir.
+    The report is a banner followed by three sections: the headline AA mutation
+    estimate (with a Poisson breakdown when it is positive), a quality
+    assessment, and pointers to the files in the detailed/ folder.
+    Args:
+        results_dir: Results directory path
+        consensus_info: Consensus AA mutation estimate from QC analysis (mapping, or None)
+        simple_lambda: Simple lambda (bp mutations per copy) from main analysis; not read here
+        simple_aa_mean: Simple AA mutation mean from Monte Carlo simulation (or None)
+        is_protein: Whether the region is protein-coding
+        hit_seq: The hit sequence; not read here
+    """
+    report_path = Path(results_dir) / "KEY_FINDINGS.txt"
+    banner = "=" * 60
+    rule = "-" * 45 + "\n"
+    fallback_flag = "FELL_BACK_TO_MAX_COVERAGE"
+    with open(report_path, "w") as out:
+        emit = out.write
+        emit(banner + "\n")
+        emit("EP LIBRARY PROFILER - KEY FINDINGS\n")
+        emit(banner + "\n\n")
+        # Headline number: prefer the consensus estimate, then the Monte Carlo mean
+        if consensus_info and consensus_info.get("consensus_mean") is not None:
+            aa_value = consensus_info["consensus_mean"]
+            aa_spread = consensus_info.get("consensus_std", 0.0)
+            method = "consensus (precision-weighted average across Q-score thresholds)"
+        elif simple_aa_mean is not None:
+            # The simple method carries no error estimate
+            aa_value, aa_spread = simple_aa_mean, 0.0
+            method = "Monte Carlo simulation (single Q-score)"
+        else:
+            aa_value, aa_spread, method = None, 0.0, "N/A"
+        emit("EXPECTED AMINO ACID MUTATIONS PER GENE COPY\n")
+        emit(rule)
+        if not is_protein:
+            emit("  Region is not protein-coding; AA mutation estimate not applicable.\n\n")
+        elif aa_value is None:
+            emit("  AA mutation estimate could not be calculated.\n\n")
+        else:
+            emit(f"  {aa_value:.2f} +/- {aa_spread:.2f} AA mutations per gene copy\n")
+            emit(f"  (Method: {method})\n\n")
+            # Plain-language interpretation using the Poisson distribution
+            emit("WHAT THIS MEANS (Poisson distribution):\n")
+            emit(rule)
+            if aa_value > 0:
+                # P(0) = e^-lambda, P(1) = lambda * e^-lambda, P(>=2) = remainder
+                zero_term = np.exp(-aa_value)
+                pct_none = zero_term * 100
+                pct_single = aa_value * zero_term * 100
+                pct_multi = 100 - pct_none - pct_single
+                emit(f"  ~{pct_none:.1f}% of gene copies are wild-type (0 AA mutations)\n")
+                emit(f"  ~{pct_single:.1f}% have exactly 1 AA mutation\n")
+                emit(f"  ~{pct_multi:.1f}% have 2 or more AA mutations\n\n")
+            else:
+                emit("  Nearly all gene copies are expected to be wild-type.\n\n")
+        # Quality assessment
+        emit("QUALITY ASSESSMENT\n")
+        emit(rule)
+        if consensus_info:
+            used = len(consensus_info.get("thresholds_used", []))
+            fell_back = consensus_info.get("note", "") == fallback_flag
+            if used >= 3 and not fell_back:
+                verdict = "  GOOD - Multiple Q-score thresholds contributed to consensus\n"
+            elif used >= 1:
+                verdict = "  ACCEPTABLE - Limited Q-score thresholds available\n"
+            else:
+                verdict = "  LOW COVERAGE - Results may be unreliable\n"
+            emit(verdict)
+            if fell_back:
+                emit("  WARNING: Fell back to max-coverage threshold due to low coverage\n")
+        else:
+            emit("  UNKNOWN - Consensus analysis not available\n")
+        emit("\n")
+        # Pointers to the technical outputs
+        for line in (
+            "FOR DETAILED TECHNICAL INFORMATION\n",
+            rule,
+            "  See the detailed/ folder for:\n",
+            "    - methodology_notes.txt: Full explanation of calculations\n",
+            "    - lambda_comparison.csv: Side-by-side lambda estimates\n",
+            "    - comprehensive_qc_data.csv: All Q-score threshold results\n",
+            "\n",
+        ):
+            emit(line)
+    logging.info(f"Wrote KEY_FINDINGS.txt to: {report_path}")
+def write_lambda_comparison(detailed_dir, simple_lambda, simple_aa_mean, consensus_info, hit_seq_length):
+    """
+    Write CSV comparing all lambda estimates side-by-side.
+    Every row has exactly the six fields named in the header. Rows are built
+    first and written with csv.writer, so a notes field that contains commas
+    (such as the Q-score threshold list) is quoted instead of spilling into
+    extra columns.
+    Args:
+        detailed_dir: Path to detailed/ directory
+        simple_lambda: Simple lambda (bp mutations per copy); None is written as N/A
+        simple_aa_mean: Simple AA mutation mean from Monte Carlo; None is written as N/A
+        consensus_info: Consensus info from QC analysis (mapping read for
+            "consensus_mean", "consensus_std" and "thresholds_used"), or None
+        hit_seq_length: Length of the hit sequence (not used by this writer;
+            kept so existing callers keep working)
+    """
+    import csv  # local import: only this writer needs the csv module
+    lambda_csv_path = Path(detailed_dir) / "lambda_comparison.csv"
+    not_available = "N/A"
+    header = ["method", "lambda_bp", "lambda_error", "aa_estimate", "aa_error", "notes"]
+    # Simple method (from main analysis): no error estimates are computed for it.
+    # The formula lives in the notes field; as its own field it shifted every
+    # value one column to the right of the header.
+    simple_row = [
+        "simple",
+        f"{simple_lambda:.6f}" if simple_lambda is not None else not_available,
+        not_available,
+        f"{simple_aa_mean:.4f}" if simple_aa_mean is not None else not_available,
+        not_available,
+        "Used for KDE plot and Monte Carlo simulation; lambda_bp = (hit_rate - bg_rate) * seq_len",
+    ]
+    # Consensus method (from QC analysis)
+    if consensus_info and consensus_info.get("consensus_mean") is not None:
+        consensus_mean = consensus_info["consensus_mean"]
+        consensus_std = consensus_info.get("consensus_std", 0.0)
+        thresholds = consensus_info.get("thresholds_used", [])
+        # Consensus is in AA mutations; back-calculate an approximate lambda
+        # with the rough approximation lambda_bp ~ 3 * aa_mutations
+        consensus_row = [
+            "consensus_weighted",
+            f"{consensus_mean * 3.0:.6f}",
+            f"{consensus_std * 3.0:.6f}",
+            f"{consensus_mean:.4f}",
+            f"{consensus_std:.4f}",
+            f"Precision-weighted across Q-scores: {thresholds}",
+        ]
+    else:
+        consensus_row = ["consensus_weighted"] + [not_available] * 4 + ["Not computed or insufficient data"]
+    # newline="" plus an explicit "\n" terminator keeps the line endings of the
+    # previous hand-written output
+    with open(lambda_csv_path, "w", newline="") as f:
+        writer = csv.writer(f, lineterminator="\n")
+        writer.writerows([header, simple_row, consensus_row])
+    logging.info(f"Wrote lambda_comparison.csv to: {lambda_csv_path}")
+def write_methodology_notes(detailed_dir):
+    """
+    Write methodology_notes.txt, the static reference text describing each lambda
+    calculation method and the layout of the output files.
+    Args:
+        detailed_dir: Path to detailed/ directory
+    """
+    notes_text = """EP LIBRARY PROFILER - METHODOLOGY NOTES
+=======================================
+This document explains the different mutation rate estimates produced by the
+EP library profiler and which outputs use which estimates.
+LAMBDA CALCULATION METHODS
+--------------------------
+1. SIMPLE LAMBDA (used for KDE plot and Monte Carlo simulation)
+   Formula: lambda_bp = (hit_rate - bg_rate) * sequence_length
+   Where:
+   - hit_rate = total_mismatches / total_covered_bases (in target region)
+   - bg_rate = total_mismatches / total_covered_bases (in plasmid excluding target)
+   - sequence_length = length of target CDS in base pairs
+   This method:
+   - Does NOT include error propagation
+   - Does NOT weight by Q-score
+   - Is fast and provides a point estimate
+   Used in:
+   - summary_panels.png/pdf (Panel 4: KDE of AA mutations)
+   - summary.txt
+   - aa_mutation_distribution.csv
+2. Q-SCORE WEIGHTED LAMBDA (used in comprehensive QC analysis)
+   Formula: lambda_bp_weighted = net_weighted_rate * sequence_length
+   Where:
+   - net_weighted_rate = hit_weighted_rate - bg_weighted_rate
+   - Weighted rates account for per-base Q-score uncertainty
+   - Weights = 1 - sqrt(10^(-Q/10)) for each position
+   This method:
+   - DOES include error propagation
+   - DOES weight by Q-score (higher Q-score = higher weight)
+   - Provides confidence intervals
+   Used in:
+   - comprehensive_qc_data.csv
+   - error_analysis.png
+3. CONSENSUS LAMBDA (recommended for reporting)
+   Formula: Precision-weighted average across Q-score thresholds
+   weights[i] = 1 / std_aa_mutations[i]
+   consensus_mean = sum(weights * means) / sum(weights)
+   This method:
+   - Aggregates estimates from multiple Q-score filtering thresholds
+   - Weights by precision (lower uncertainty = higher weight)
+   - Requires minimum coverage threshold (default 1000 mappable bases)
+   - Provides the most robust estimate when multiple thresholds pass QC
+   Used in:
+   - aa_mutation_consensus.txt
+   - KEY_FINDINGS.txt
+   - QC plots (red dashed line)
+WHICH VALUE SHOULD I USE?
+-------------------------
+For publication/reporting:
+  Use the CONSENSUS value from aa_mutation_consensus.txt or KEY_FINDINGS.txt
+  This is the most statistically robust estimate.
+For understanding the distribution shape:
+  Use the KDE plot in summary_panels.png
+  Note: This uses the SIMPLE lambda, not the consensus.
+For detailed error analysis:
+  Use comprehensive_qc_data.csv in the detailed/ folder
+  This contains per-Q-score estimates with full error propagation.
+OUTPUT FILE REFERENCE
+---------------------
+Root folder:
+  - KEY_FINDINGS.txt: Executive summary with consensus AA mutations
+  - summary_panels.png/pdf: Main visualization (uses simple lambda for KDE)
+  - aa_mutation_consensus.txt: Consensus estimate details
+detailed/ folder:
+  - methodology_notes.txt: This file
+  - lambda_comparison.csv: Side-by-side comparison of all methods
+  - comprehensive_qc_data.csv: Full QC data with error estimates
+  - simple_qc_data.csv: Simplified QC data
+  - gene_mismatch_rates.csv: Per-position mismatch rates
+  - base_distribution.csv: Base counts at each position
+  - aa_substitutions.csv: Amino acid substitution data
+  - plasmid_coverage.csv: Coverage across plasmid
+  - aa_mutation_distribution.csv: Monte Carlo AA mutation trials
+detailed/qc_plots/ folder:
+  - qc_plot_*.png: Q-score threshold analysis plot
+  - comprehensive_qc_analysis.png: Detailed QC visualization
+  - error_analysis.png: Error component breakdown
+"""
+    notes_file = Path(detailed_dir) / "methodology_notes.txt"
+    # Text-mode write with the default encoding, replacing any existing file
+    notes_file.write_text(notes_text)
+    logging.info(f"Wrote methodology_notes.txt to: {notes_file}")
 def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, work_dir, results_dir,
                                  chunks, ref_hit_fasta, plasmid_fasta, hit_seq, hit_id, plasmid_seq, idx):
     """
@@ -1854,13 +2163,18 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     # Ensure work directory exists
     os.makedirs(work_dir, exist_ok=True)
     # Create subdirectory for this Q-score analysis
     qscore_results_dir = results_dir
     if qscore is not None:
         qscore_results_dir = os.path.join(results_dir, f"q{qscore}_analysis")
         os.makedirs(qscore_results_dir, exist_ok=True)
+    # Create output directory structure (detailed/ and detailed/qc_plots/)
+    output_dirs = create_output_directories(qscore_results_dir)
+    detailed_dir = output_dirs['detailed_dir']
+    qc_plots_dir = output_dirs['qc_plots_dir']
     # Write chunks FASTA & align to background‐chunks
     chunks_fasta = create_multi_fasta(chunks, work_dir)
     sam_chunks   = run_minimap2(fastq_path, chunks_fasta, "plasmid_chunks_alignment", work_dir)
@@ -1976,9 +2290,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     qscore_info = f" ({qscore_desc})" if qscore_desc != "unfiltered" else ""
     # ----------------------------
-    # SAVE CSV FOR MUTATION RATES (PANEL 1)
+    # SAVE CSV FOR MUTATION RATES (PANEL 1) - to detailed/
     # ----------------------------
-    gene_mismatch_csv = os.path.join(qscore_results_dir, "gene_mismatch_rates.csv")
+    gene_mismatch_csv = os.path.join(detailed_dir, "gene_mismatch_rates.csv")
     with open(gene_mismatch_csv, "w", newline="") as csvfile:
         csvfile.write(f"# gene_id: {hit_id}\n")
         csvfile.write(f"# background_rate_per_kb: {bg_rate_per_kb:.6f}\n")
@@ -1988,9 +2302,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     logging.info(f"Saved CSV for gene mismatch rates: {gene_mismatch_csv}")
     # ----------------------------
-    # SAVE CSV FOR BASE DISTRIBUTION (PANEL 2)
+    # SAVE CSV FOR BASE DISTRIBUTION (PANEL 2) - to detailed/
     # ----------------------------
-    base_dist_csv = os.path.join(qscore_results_dir, "base_distribution.csv")
+    base_dist_csv = os.path.join(detailed_dir, "base_distribution.csv")
     with open(base_dist_csv, "w", newline="") as csvfile:
         csvfile.write(f"# gene_id: {hit_id}\n")
         csvfile.write("position_1based,ref_base,A_count,C_count,G_count,T_count,N_count\n")
@@ -2000,10 +2314,10 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     logging.info(f"Saved CSV for base distribution: {base_dist_csv}")
     # ----------------------------
-    # SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - only if protein
+    # SAVE CSV FOR AA SUBSTITUTIONS (PANEL 3) - to detailed/ - only if protein
     # ----------------------------
     if is_protein:
-        aa_subst_csv = os.path.join(qscore_results_dir, "aa_substitutions.csv")
+        aa_subst_csv = os.path.join(detailed_dir, "aa_substitutions.csv")
         with open(aa_subst_csv, "w", newline="") as csvfile:
             csvfile.write(f"# gene_id: {hit_id}\n")
             csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
@@ -2013,9 +2327,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
         logging.info(f"Saved CSV for AA substitutions: {aa_subst_csv}")
     # ----------------------------
-    # SAVE CSV FOR PLASMID COVERAGE (PANEL 4)
+    # SAVE CSV FOR PLASMID COVERAGE (PANEL 4) - to detailed/
     # ----------------------------
-    plasmid_cov_csv = os.path.join(qscore_results_dir, "plasmid_coverage.csv")
+    plasmid_cov_csv = os.path.join(detailed_dir, "plasmid_coverage.csv")
     with open(plasmid_cov_csv, "w", newline="") as csvfile:
         csvfile.write("position_1based,coverage\n")
         for pos0, cov in enumerate(plasmid_cov):
@@ -2023,9 +2337,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     logging.info(f"Saved CSV for plasmid coverage: {plasmid_cov_csv}")
     # ----------------------------
-    # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3)
+    # SAVE CSV FOR AA MUTATION DISTRIBUTION (PANEL 3) - to detailed/
     # ----------------------------
-    aa_dist_csv = os.path.join(qscore_results_dir, "aa_mutation_distribution.csv")
+    aa_dist_csv = os.path.join(detailed_dir, "aa_mutation_distribution.csv")
     with open(aa_dist_csv, "w", newline="") as csvfile:
         csvfile.write(f"# gene_id: {hit_id}\n")
         csvfile.write(f"# lambda_bp_mut: {est_mut_per_copy:.6f}\n")
@@ -2135,7 +2449,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     if is_protein and aa_diffs and len(aa_diffs) > 0:
         x_vals = np.array(aa_diffs)
         unique_vals = np.unique(x_vals)
         if len(unique_vals) > 1:
             # Multiple unique values - use KDE or histogram
             if HAVE_SCIPY:
@@ -2149,15 +2463,23 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
                     ax3.set_ylim(bottom=0)
                 except Exception as e:
                     logging.warning(f"KDE failed: {e}, falling back to histogram")
-                    ax3.hist(x_vals, bins=min(20, len(unique_vals)),
+                    ax3.hist(x_vals, bins=min(20, len(unique_vals)),
                             color="#C44E52", alpha=0.7, density=True, edgecolor='black')
             else:
-                ax3.hist(x_vals, bins=min(20, len(unique_vals)),
+                ax3.hist(x_vals, bins=min(20, len(unique_vals)),
                         color="#C44E52", alpha=0.7, density=True, edgecolor='black')
         else:
             # Single unique value - just show a bar
             ax3.bar(unique_vals, [1.0], color="#C44E52", alpha=0.7, width=0.1)
             ax3.set_xlim(unique_vals[0] - 0.5, unique_vals[0] + 0.5)
+        # Set title with lambda value for protein-coding sequences
+        ax3.set_title(f"AA Mutation Distribution (Monte Carlo, \u03bb={est_mut_per_copy:.2f}){qscore_info}",
+                     fontsize=14, fontweight='bold')
+        ax3.set_xlabel("Number of AA Mutations", fontsize=12)
+        ax3.set_ylabel("Density", fontsize=12)
+        ax3.spines['top'].set_visible(False)
+        ax3.spines['right'].set_visible(False)
     else:
         # Not protein or no AA differences — display an informative message
         ax3.text(
@@ -2170,7 +2492,7 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
             color="gray",
             transform=ax3.transAxes,
         )
         ax3.set_title("AA Mutation Distribution", fontsize=14, fontweight='bold')
         ax3.set_xlabel("Number of AA Mutations", fontsize=12)
         ax3.set_ylabel("Density", fontsize=12)
@@ -2231,9 +2553,9 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
             sample_percent[cat] = 0.0
     # ----------------------------
-    # GENERATE PDF TABLE (MUTATION SPECTRUM)
+    # GENERATE PDF TABLE (MUTATION SPECTRUM) - to detailed/
     # ----------------------------
-    pdf_path = os.path.join(qscore_results_dir, f"{sample_name}_mutation_spectrum.pdf")
+    pdf_path = os.path.join(detailed_dir, f"{sample_name}_mutation_spectrum.pdf")
     # Prepare table data
     table_rows = []
     for cat in categories:
@@ -2341,9 +2663,6 @@ def run_main_analysis_for_qscore(fastq_path, qscore, qscore_desc, sample_name, w
     }
-    main()
 def expand_fastq_inputs(inputs: Iterable[str]) -> List[Path]:
     paths: List[Path] = []
     for item in inputs:
@@ -2396,7 +2715,7 @@ def run_ep_library_profile(
     master_summary_path.write_text(header + "\n", encoding="utf-8")
     sample_results: List[Dict[str, object]] = []
-    for fastq in fastq_paths:
+    for fastq in tqdm(fastq_paths, desc="Processing FASTQ files", unit="file"):
         result = process_single_fastq(
             fastq,
             region_fasta,
@@ -2503,6 +2822,7 @@ def process_single_fastq(
     logging.info("Running QC analysis to get Q-score results...")
     qc_results = None
+    consensus_info = None
     try:
         qc_results, consensus_info = run_qc_analysis(
             str(fastq_path),
@@ -2563,6 +2883,45 @@ def process_single_fastq(
         )
         analysis_results.append(result)
+    # Generate unified summary files in the sample's root results directory
+    # Get simple lambda from the unfiltered analysis (first result)
+    simple_lambda = 0.0
+    simple_aa_mean = None
+    is_protein = False
+    unfiltered_result = analysis_results[0] if analysis_results else None
+    if unfiltered_result:
+        simple_lambda = unfiltered_result.get('est_mut_per_copy', 0.0)
+        simple_aa_mean = unfiltered_result.get('avg_aa_mutations')
+        is_protein = unfiltered_result.get('is_protein', False)
+    # Create output directories and generate summary files
+    output_dirs = create_output_directories(results_dir)
+    detailed_dir = output_dirs['detailed_dir']
+    # Write KEY_FINDINGS.txt (lay-user summary)
+    write_key_findings(
+        results_dir,
+        consensus_info,
+        simple_lambda,
+        simple_aa_mean,
+        is_protein,
+        hit_seq,
+    )
+    # Write lambda_comparison.csv
+    write_lambda_comparison(
+        detailed_dir,
+        simple_lambda,
+        simple_aa_mean,
+        consensus_info,
+        len(hit_seq),
+    )
+    # Write methodology_notes.txt
+    write_methodology_notes(detailed_dir)
+    logging.info("Generated unified summary files: KEY_FINDINGS.txt, lambda_comparison.csv, methodology_notes.txt")
     if work_dir.exists():
         shutil.rmtree(work_dir)
         logging.info("Removed temporary work directory: %s", work_dir)
@@ -2573,5 +2932,6 @@ def process_single_fastq(
         "sample": sample_name,
         "results_dir": results_dir,
         "analysis_results": analysis_results,
+        "consensus_info": consensus_info,
     }

uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl

uht-tooling 0.1.9__py3-none-any.whl → 0.3.0__py3-none-any.whl