smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. smftools/__init__.py +9 -4
  2. smftools/_version.py +1 -1
  3. smftools/cli.py +184 -0
  4. smftools/config/__init__.py +1 -0
  5. smftools/config/conversion.yaml +33 -0
  6. smftools/config/deaminase.yaml +56 -0
  7. smftools/config/default.yaml +253 -0
  8. smftools/config/direct.yaml +17 -0
  9. smftools/config/experiment_config.py +1191 -0
  10. smftools/hmm/HMM.py +1576 -0
  11. smftools/hmm/__init__.py +20 -0
  12. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  13. smftools/hmm/call_hmm_peaks.py +106 -0
  14. smftools/{tools → hmm}/display_hmm.py +3 -3
  15. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  16. smftools/{tools → hmm}/train_hmm.py +1 -1
  17. smftools/informatics/__init__.py +0 -2
  18. smftools/informatics/archived/deaminase_smf.py +132 -0
  19. smftools/informatics/fast5_to_pod5.py +4 -1
  20. smftools/informatics/helpers/__init__.py +3 -4
  21. smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
  22. smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
  23. smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
  24. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
  25. smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
  26. smftools/informatics/helpers/discover_input_files.py +100 -0
  27. smftools/informatics/helpers/extract_base_identities.py +29 -3
  28. smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
  29. smftools/informatics/helpers/find_conversion_sites.py +5 -4
  30. smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
  31. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  32. smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
  33. smftools/informatics/helpers/split_and_index_BAM.py +1 -5
  34. smftools/load_adata.py +1346 -0
  35. smftools/machine_learning/__init__.py +12 -0
  36. smftools/machine_learning/data/__init__.py +2 -0
  37. smftools/machine_learning/data/anndata_data_module.py +234 -0
  38. smftools/machine_learning/evaluation/__init__.py +2 -0
  39. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  40. smftools/machine_learning/evaluation/evaluators.py +223 -0
  41. smftools/machine_learning/inference/__init__.py +3 -0
  42. smftools/machine_learning/inference/inference_utils.py +27 -0
  43. smftools/machine_learning/inference/lightning_inference.py +68 -0
  44. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  45. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  46. smftools/machine_learning/models/base.py +295 -0
  47. smftools/machine_learning/models/cnn.py +138 -0
  48. smftools/machine_learning/models/lightning_base.py +345 -0
  49. smftools/machine_learning/models/mlp.py +26 -0
  50. smftools/{tools → machine_learning}/models/positional.py +3 -2
  51. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  52. smftools/machine_learning/models/sklearn_models.py +273 -0
  53. smftools/machine_learning/models/transformer.py +303 -0
  54. smftools/machine_learning/training/__init__.py +2 -0
  55. smftools/machine_learning/training/train_lightning_model.py +135 -0
  56. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  57. smftools/plotting/__init__.py +4 -1
  58. smftools/plotting/autocorrelation_plotting.py +611 -0
  59. smftools/plotting/general_plotting.py +566 -89
  60. smftools/plotting/hmm_plotting.py +260 -0
  61. smftools/plotting/qc_plotting.py +270 -0
  62. smftools/preprocessing/__init__.py +13 -8
  63. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  64. smftools/preprocessing/append_base_context.py +122 -0
  65. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  66. smftools/preprocessing/calculate_complexity_II.py +248 -0
  67. smftools/preprocessing/calculate_coverage.py +10 -1
  68. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  69. smftools/preprocessing/clean_NaN.py +17 -1
  70. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  71. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  72. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  73. smftools/preprocessing/invert_adata.py +12 -5
  74. smftools/preprocessing/load_sample_sheet.py +19 -4
  75. smftools/readwrite.py +849 -43
  76. smftools/tools/__init__.py +3 -32
  77. smftools/tools/calculate_umap.py +5 -5
  78. smftools/tools/general_tools.py +3 -3
  79. smftools/tools/position_stats.py +468 -106
  80. smftools/tools/read_stats.py +115 -1
  81. smftools/tools/spatial_autocorrelation.py +562 -0
  82. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
  83. smftools-0.2.1.dist-info/RECORD +161 -0
  84. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  85. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  86. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  87. smftools/informatics/load_adata.py +0 -182
  88. smftools/preprocessing/append_C_context.py +0 -82
  89. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  90. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  91. smftools/preprocessing/filter_reads_on_length.py +0 -51
  92. smftools/tools/call_hmm_peaks.py +0 -105
  93. smftools/tools/data/__init__.py +0 -2
  94. smftools/tools/data/anndata_data_module.py +0 -90
  95. smftools/tools/evaluation/__init__.py +0 -0
  96. smftools/tools/inference/__init__.py +0 -1
  97. smftools/tools/inference/lightning_inference.py +0 -41
  98. smftools/tools/models/base.py +0 -14
  99. smftools/tools/models/cnn.py +0 -34
  100. smftools/tools/models/lightning_base.py +0 -41
  101. smftools/tools/models/mlp.py +0 -17
  102. smftools/tools/models/sklearn_models.py +0 -40
  103. smftools/tools/models/transformer.py +0 -133
  104. smftools/tools/training/__init__.py +0 -1
  105. smftools/tools/training/train_lightning_model.py +0 -47
  106. smftools-0.1.7.dist-info/RECORD +0 -136
  107. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  108. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  109. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  110. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  111. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  112. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  113. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  114. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  115. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  116. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  117. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  118. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  119. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  120. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/__init__.py
@@ -0,0 +1,20 @@
+ from .apply_hmm_batched import apply_hmm_batched
+ from .calculate_distances import calculate_distances
+ from .call_hmm_peaks import call_hmm_peaks
+ from .display_hmm import display_hmm
+ from .hmm_readwrite import load_hmm, save_hmm
+ from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
+ from .train_hmm import train_hmm
+
+
+ __all__ = [
+     "apply_hmm_batched",
+     "calculate_distances",
+     "call_hmm_peaks",
+     "display_hmm",
+     "load_hmm",
+     "refine_nucleosome_calls",
+     "infer_nucleosomes_in_large_bound",
+     "save_hmm",
+     "train_hmm"
+ ]
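
The new smftools.hmm subpackage flattens the HMM utilities that previously lived under smftools.tools. A minimal import sketch, using only the names listed in the __all__ above:

    # Everything re-exported by the new hmm subpackage is importable flat:
    from smftools.hmm import (
        apply_hmm_batched, calculate_distances, call_hmm_peaks, display_hmm,
        load_hmm, save_hmm, refine_nucleosome_calls,
        infer_nucleosomes_in_large_bound, train_hmm,
    )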
smftools/hmm/apply_hmm_batched.py
@@ -3,14 +3,11 @@ import pandas as pd
  import torch
  from tqdm import tqdm

- def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
+ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A", "C"], device="cpu", threshold=0.7, deaminase_footprinting=False):
      """
      Applies an HMM model to an AnnData object using tensor-based sequence inputs.
      If multiple methbases are passed, generates a combined feature set.
      """
-     import numpy as np
-     import torch
-     from tqdm import tqdm

      model.to(device)

@@ -74,6 +71,7 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
          for methbase in methbases:
              mask = {
                  "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
+                 "c": ref_subset.var[f"{ref}_any_C_site"] == True,
                  "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
                  "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
              }[methbase.lower()]
@@ -150,6 +148,8 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
                  adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])

      # --- Binarization + Distance ---
+     coordinates = adata.var_names.astype(int).values
+
      for feature in tqdm(all_features, desc="Finalizing Layers"):
          bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
          counts = np.zeros(adata.shape[0], dtype=int)
@@ -158,9 +158,11 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
              intervals = []
          for start, length, prob in intervals:
              if prob > threshold:
-                 bin_matrix[row_idx, start:start+length] = 1
+                 start_idx = np.searchsorted(coordinates, start, side="left")
+                 end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
+                 bin_matrix[row_idx, start_idx:end_idx] = 1
                  counts[row_idx] += 1
-         adata.layers[f"{feature}"] = bin_matrix
+         adata.layers[feature] = bin_matrix
          adata.obs[f"n_{feature}"] = counts
          adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)

@@ -202,7 +204,6 @@ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, cla
      Returns:
          List of classifications for each sequence.
      """
-     import numpy as np

      state_labels = ["Non-Methylated", "Methylated"]
      target_idx = state_labels.index(target_state)
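
The reworked apply_hmm_batched above gains a "C" methbase option (any-C sites, used for deaminase footprinting), a deaminase_footprinting flag, and interval binarization that maps genomic positions onto var_names indices via searchsorted instead of raw slicing. A hedged usage sketch; adata and model are assumed to be a site-annotated AnnData and a trained HMM, and the obs_column value is illustrative:

    # Hypothetical call; "Reference_strand" is an assumed obs column name.
    apply_hmm_batched(
        adata,
        model,
        obs_column="Reference_strand",
        methbases=["C"],               # new in 0.2.1: any-C sites
        deaminase_footprinting=True,   # new flag in 0.2.1
        device="cpu",
        threshold=0.7,
    )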
smftools/hmm/call_hmm_peaks.py
@@ -0,0 +1,106 @@
+ def call_hmm_peaks(
+     adata,
+     feature_configs,
+     obs_column='Reference_strand',
+     site_types=['GpC_site', 'CpG_site'],
+     save_plot=False,
+     output_dir=None,
+     date_tag=None,
+     inplace=False
+ ):
+     import numpy as np
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     from scipy.signal import find_peaks
+
+     if not inplace:
+         adata = adata.copy()
+
+     # Ensure obs_column is categorical
+     if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
+         adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])
+
+     coordinates = adata.var_names.astype(int).values
+     peak_columns = []
+
+     obs_updates = {}
+
+     for feature_layer, config in feature_configs.items():
+         min_distance = config.get('min_distance', 200)
+         peak_width = config.get('peak_width', 200)
+         peak_prominence = config.get('peak_prominence', 0.2)
+         peak_threshold = config.get('peak_threshold', 0.8)
+
+         matrix = adata.layers[feature_layer]
+         means = np.mean(matrix, axis=0)
+         peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
+         peak_centers = coordinates[peak_indices]
+         adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()
+
+         # Plot
+         plt.figure(figsize=(6, 3))
+         plt.plot(coordinates, means)
+         plt.title(f"{feature_layer} with peak calls")
+         plt.xlabel("Genomic position")
+         plt.ylabel("Mean intensity")
+         for i, center in enumerate(peak_centers):
+             start, end = center - peak_width // 2, center + peak_width // 2
+             plt.axvspan(start, end, color='purple', alpha=0.2)
+             plt.axvline(center, color='red', linestyle='--')
+             aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
+             plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
+         if save_plot and output_dir:
+             filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
+             plt.savefig(filename, bbox_inches='tight')
+             print(f"Saved plot to {filename}")
+         else:
+             plt.show()
+
+         feature_peak_columns = []
+         for center in peak_centers:
+             start, end = center - peak_width // 2, center + peak_width // 2
+             colname = f'{feature_layer}_peak_{center}'
+             peak_columns.append(colname)
+             feature_peak_columns.append(colname)
+
+             peak_mask = (coordinates >= start) & (coordinates <= end)
+             adata.var[colname] = peak_mask
+
+             region = matrix[:, peak_mask]
+             obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
+             obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
+             obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold
+
+             for site_type in site_types:
+                 adata.obs[f'{site_type}_sum_around_{center}'] = 0
+                 adata.obs[f'{site_type}_mean_around_{center}'] = np.nan
+
+             for ref in adata.obs[obs_column].cat.categories:
+                 ref_idx = adata.obs[obs_column] == ref
+                 for site_type in site_types:
+                     mask_key = f"{ref}_{site_type}"
+                     if mask_key not in adata.var:
+                         continue
+                     site_mask = adata.var[mask_key].values
+                     site_coords = coordinates[site_mask]
+                     region_mask = (site_coords >= start) & (site_coords <= end)
+                     if not region_mask.any():
+                         continue
+                     full_mask = site_mask.copy()
+                     full_mask[site_mask] = region_mask
+                     site_region = adata[ref_idx, full_mask].X
+                     if hasattr(site_region, "A"):
+                         site_region = site_region.A
+                     if site_region.shape[1] > 0:
+                         adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
+                         adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)
+                     else:
+                         pass
+
+         adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
+         print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")
+
+     adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
+     adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)
+
+     return adata if not inplace else None
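
call_hmm_peaks is driven by a per-layer feature_configs dict; the recognized keys and their defaults are exactly the config.get(...) calls above. A hedged sketch, where the layer name is hypothetical:

    # "GpC_footprints" is a made-up HMM feature layer name.
    feature_configs = {
        "GpC_footprints": {
            "min_distance": 200,     # minimum spacing between called peaks
            "peak_width": 200,       # window annotated around each peak center
            "peak_prominence": 0.2,  # passed to scipy.signal.find_peaks
            "peak_threshold": 0.8,   # mean signal needed for "present_at" calls
        }
    }
    annotated = call_hmm_peaks(adata, feature_configs, obs_column="Reference_strand")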
smftools/hmm/display_hmm.py
@@ -1,16 +1,16 @@
  def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
      import torch
-     print("\n🔹 **HMM Model Overview**")
+     print("\n**HMM Model Overview**")
      print(hmm)

-     print("\n🔹 **Transition Matrix**")
+     print("\n**Transition Matrix**")
      transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
      for i, row in enumerate(transition_matrix):
          label = state_labels[i] if state_labels else f"State {i}"
          formatted_row = ", ".join(f"{p:.6f}" for p in row)
          print(f"{label}: [{formatted_row}]")

-     print("\n🔹 **Emission Probabilities**")
+     print("\n**Emission Probabilities**")
      for i, dist in enumerate(hmm.distributions):
          label = state_labels[i] if state_labels else f"State {i}"
          probs = dist.probs.detach().cpu().numpy()
smftools/hmm/nucleosome_hmm_refinement.py
@@ -56,7 +56,7 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
      adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
      adata.layers[f"{layer_name}_octamers"] = octamer_layer

-     print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
+     print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
      return adata

  def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
@@ -100,5 +100,5 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
              pos_cursor += 1

      adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
-     print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
+     print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
      return adata
smftools/hmm/train_hmm.py
@@ -11,7 +11,7 @@ def train_hmm(
      pad_value=0,
  ):
      """
-     Trains a 2-state DenseHMM model on binary methylation data.
+     Trains a 2-state DenseHMM model on binary methylation/deamination data.

      Parameters:
          data (list or np.ndarray): List of sequences (lists) with 0, 1, or NaN.
smftools/informatics/__init__.py
@@ -1,6 +1,5 @@
  from . import helpers
  from .basecall_pod5s import basecall_pod5s
- from .load_adata import load_adata
  from .subsample_fasta_from_bed import subsample_fasta_from_bed
  from .subsample_pod5 import subsample_pod5
  from .fast5_to_pod5 import fast5_to_pod5
@@ -8,7 +7,6 @@ from .fast5_to_pod5 import fast5_to_pod5

  __all__ = [
      "basecall_pod5s",
-     "load_adata",
      "subsample_fasta_from_bed",
      "subsample_pod5",
      "fast5_to_pod5",
smftools/informatics/archived/deaminase_smf.py
@@ -0,0 +1,132 @@
+
+ def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
+     """
+     Processes sequencing data from a conversion SMF experiment into an AnnData object.
+
+     Parameters:
+         fasta (str): File path to the reference genome to align to.
+         output_directory (str): A file path to the directory to output all the analyses.
+         conversion_types (list): A list of strings of the conversion types to use in the analysis.
+         strands (list): A list of conversion strands to use in the experiment.
+         model_dir (str): A file path to the dorado basecalling model directory.
+         model (str): The dorado basecalling model.
+         input_data_path (str): A file path to the experiment directory/file containing sequencing data.
+         split_dir (str): A file path to the directory to split the BAMs into.
+         barcode_kit (str): The barcoding kit used in the experiment.
+         mapping_threshold (float): A value between 0 and 1 giving the minimal fraction of aligned reads that must map to a reference region. References above the threshold are included in the output adata.
+         experiment_name (str): An experiment name to attach to the output adata file.
+         bam_suffix (str): A suffix to add to the BAM file.
+         basecall (bool): Whether to go through basecalling or not.
+         barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+         device (str): Device to use for basecalling: auto, metal, cpu, or cuda.
+         make_bigwigs (bool): Whether to make bigwigs.
+         threads (int): CPU threads available for processing.
+         input_already_demuxed (bool): Whether the input files were already demultiplexed.
+
+     Returns:
+         final_adata (AnnData): The final adata object.
+         final_adata_path (str): Path to the final adata object.
+         sorted_output (str): Path to the aligned, sorted BAM.
+         bam_files (list): Paths to the demultiplexed BAM files.
+     """
+     from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
+     import os
+     import shutil
+     import glob
+
+     if basecall:
+         model_basename = os.path.basename(model)
+         model_basename = model_basename.replace('.', '_')
+         bam = f"{output_directory}/{model_basename}_canonical_basecalls"
+     else:
+         bam_base = os.path.basename(input_data_path).split('.bam')[0]
+         bam = os.path.join(output_directory, bam_base)
+     aligned_BAM = f"{bam}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+
+     os.chdir(output_directory)
+
+     # 1) Convert FASTA file
+     fasta_basename = os.path.basename(fasta)
+     converted_FASTA_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
+     converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
+     if 'converted.fa' in fasta:
+         print(fasta + ' is already converted. Using existing converted FASTA.')
+         converted_FASTA = fasta
+     elif os.path.exists(converted_FASTA):
+         print(converted_FASTA + ' already exists. Using existing converted FASTA.')
+     else:
+         generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
+
+     # Make a FAI and .chrom.names file for the converted FASTA
+     get_chromosome_lengths(converted_FASTA)
+
+     # 2) Basecall from the input POD5 to generate a singular output BAM
+     if basecall:
+         canoncall_output = bam + bam_suffix
+         if os.path.exists(canoncall_output):
+             print(canoncall_output + ' already exists. Using existing basecalled BAM.')
+         else:
+             canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
+     else:
+         canoncall_output = input_data_path
+
+     # 3) Align the BAM to the reference FASTA and sort on positional coordinates. Also make an index and a BED file of mapped reads.
+     aligned_output = aligned_BAM + bam_suffix
+     sorted_output = aligned_sorted_BAM + bam_suffix
+     if os.path.exists(aligned_output) and os.path.exists(sorted_output):
+         print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
+     else:
+         align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads, deaminase_alignment=True)
+
+     # Make BEDs and provide basic histograms
+     bed_dir = os.path.join(output_directory, 'beds')
+     if os.path.isdir(bed_dir):
+         print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
+     else:
+         aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
+
+     # 4) Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory
+     if barcode_both_ends:
+         split_dir = split_dir + '_both_ends_barcoded'
+     else:
+         split_dir = split_dir + '_at_least_one_end_barcoded'
+
+     if os.path.isdir(split_dir):
+         print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
+         bam_pattern = '*' + bam_suffix
+         bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+         bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
+         bam_files.sort()
+     else:
+         make_dirs([split_dir])
+         if input_already_demuxed:
+             bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory)  # custom for non-nanopore
+         else:
+             bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
+
+     # Make BEDs and provide basic histograms
+     bed_dir = os.path.join(split_dir, 'beds')
+     if os.path.isdir(bed_dir):
+         print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed BAMs')
+     else:
+         for bam in bam_files:
+             aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
+
+     # 5) Samtools QC metrics on split BAM files
+     bam_qc_dir = f"{split_dir}/bam_qc"
+     if os.path.isdir(bam_qc_dir):
+         print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
+     else:
+         make_dirs([bam_qc_dir])
+         bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
+
+     # multiqc
+     if os.path.isdir(f"{split_dir}/multiqc"):
+         print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
+     else:
+         run_multiqc(split_dir, f"{split_dir}/multiqc")
+
+     # 6) Take the converted BAM and load it into an adata object.
+     final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device, deaminase_footprinting=True)
+
+     return final_adata, final_adata_path, sorted_output, bam_files
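
For orientation, a hedged invocation sketch of this archived pipeline; every value below is a placeholder, and valid conversion_types, strands, model, and barcode_kit values come from the experiment config rather than from here:

    final_adata, final_adata_path, sorted_bam, bam_files = deaminase_smf(
        fasta="reference.fasta",                # placeholder path
        output_directory="analysis_out",
        conversion_types=["unconverted"],       # placeholder
        strands=["top"],                        # placeholder
        model_dir="models/",
        model="hac",                            # placeholder dorado model name
        input_data_path="reads.pod5",
        split_dir="analysis_out/split_BAMs",
        barcode_kit="SQK-NBD114-24",            # placeholder kit name
        mapping_threshold=0.01,
        experiment_name="deaminase_test",
        bam_suffix=".bam",
        basecall=True,
        barcode_both_ends=False,
        trim=True,
        device="auto",
        make_bigwigs=False,
        threads=8,
        input_already_demuxed=False,
    )

Note that each step checks for existing outputs and skips recomputation, so the pipeline can be re-run after a partial failure without redoing finished stages.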
smftools/informatics/fast5_to_pod5.py
@@ -15,7 +15,10 @@ def fast5_to_pod5(fast5_dir, output_pod5='FAST5s_to_POD5.pod5'):
      import subprocess
      from pathlib import Path

-     if Path(fast5_dir).is_file():
+     if isinstance(fast5_dir, (list, tuple)):
+         cmd = ["pod5", "convert", "fast5"] + list(fast5_dir) + ["--output", output_pod5]
+         subprocess.run(cmd)
+     elif Path(fast5_dir).is_file():
          subprocess.run(["pod5", "convert", "fast5", fast5_dir, "--output", output_pod5])
      elif Path(fast5_dir).is_dir():
          subprocess.run(["pod5", "convert", "fast5", f".{fast5_dir}*.fast5", "--output", output_pod5])
smftools/informatics/helpers/__init__.py
@@ -9,6 +9,7 @@ from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
  from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
  from .count_aligned_reads import count_aligned_reads
  from .demux_and_index_BAM import demux_and_index_BAM
+ from .discover_input_files import *
  from .extract_base_identities import extract_base_identities
  from .extract_mods import extract_mods
  from .extract_read_features_from_bam import extract_read_features_from_bam
@@ -19,7 +20,6 @@ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_F
  from .get_chromosome_lengths import get_chromosome_lengths
  from .get_native_references import get_native_references
  from .index_fasta import index_fasta
- from .LoadExperimentConfig import LoadExperimentConfig
  from .make_dirs import make_dirs
  from .make_modbed import make_modbed
  from .modcall import modcall
@@ -29,7 +29,7 @@ from .one_hot_encode import one_hot_encode
  from .ohe_batching import ohe_batching
  from .one_hot_decode import one_hot_decode
  from .ohe_layers_decode import ohe_layers_decode
- from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
+ from .plot_bed_histograms import plot_bed_histograms
  from .run_multiqc import run_multiqc
  from .separate_bam_by_bc import separate_bam_by_bc
  from .split_and_index_BAM import split_and_index_BAM
@@ -57,7 +57,6 @@ __all__ = [
      "get_chromosome_lengths",
      "get_native_references",
      "index_fasta",
-     "LoadExperimentConfig",
      "make_dirs",
      "make_modbed",
      "modcall",
@@ -67,7 +66,7 @@ __all__ = [
      "ohe_batching",
      "one_hot_decode",
      "ohe_layers_decode",
-     "plot_read_length_and_coverage_histograms",
+     "plot_bed_histograms",
      "run_multiqc",
      "separate_bam_by_bc",
      "split_and_index_BAM"
smftools/informatics/helpers/align_and_sort_BAM.py
@@ -1,6 +1,13 @@
  ## align_and_sort_BAM

- def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligned_outputs', make_bigwigs=False, threads=None):
+ def align_and_sort_BAM(fasta,
+                        input,
+                        bam_suffix='.bam',
+                        output_directory='aligned_outputs',
+                        make_bigwigs=False,
+                        threads=None,
+                        aligner='minimap2',
+                        aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
      """
      A wrapper for running dorado aligner and samtools functions

@@ -11,6 +18,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
          output_directory (str): A file path to the directory to output all the analyses.
          make_bigwigs (bool): Whether to make bigwigs
          threads (int): Number of additional threads to use
+         aligner (str): Aligner to use: 'minimap2' or 'dorado'.
+         aligner_args (list): Optional parameters to pass to the chosen aligner.

      Returns:
          None
@@ -21,6 +30,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
      input_basename = os.path.basename(input)
      input_suffix = '.' + input_basename.split('.')[1]
+     input_as_fastq = input_basename.split('.')[0] + '.fastq'

      output_path_minus_suffix = os.path.join(output_directory, input_basename.split(input_suffix)[0])
@@ -34,13 +44,30 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
      else:
          pass

-     # Run dorado aligner
-     print(f"Aligning BAM to Reference: {input}")
-     if threads:
-         alignment_command = ["dorado", "aligner", "-t", threads, '--mm2-opts', "-N 1", fasta, input]
+     if aligner == 'minimap2':
+         print(f"Converting BAM to FASTQ: {input}")
+         bam_to_fastq_command = ['samtools', 'fastq', input]
+         subprocess.run(bam_to_fastq_command, stdout=open(input_as_fastq, "w"))
+         print(f"Aligning FASTQ to Reference: {input_as_fastq}")
+         if threads:
+             minimap_command = ['minimap2'] + aligner_args + ['-t', str(threads), fasta, input_as_fastq]
+         else:
+             minimap_command = ['minimap2'] + aligner_args + [fasta, input_as_fastq]
+         subprocess.run(minimap_command, stdout=open(aligned_output, "w"))
+         os.remove(input_as_fastq)
+
+     elif aligner == 'dorado':
+         # Run dorado aligner
+         print(f"Aligning BAM to Reference: {input}")
+         if threads:
+             alignment_command = ["dorado", "aligner", "-t", str(threads)] + aligner_args + [fasta, input]
+         else:
+             alignment_command = ["dorado", "aligner"] + aligner_args + [fasta, input]
+         subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
+
      else:
-         alignment_command = ["dorado", "aligner", '--mm2-opts', "-N 1", fasta, input]
-         subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
+         print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
+         return

      # Sort the BAM on positional coordinates
      print(f"Sorting BAM: {aligned_output}")
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -1,7 +1,7 @@
  def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
      """
      Takes an aligned BAM as input and writes a BED file of reads as output.
-     Bed columns are: Record name, start position, end position, read length, read name.
+     Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.

      Parameters:
          aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
@@ -15,11 +15,13 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
      """
      import subprocess
      import os
+     import pysam
+     import numpy as np
      import concurrent.futures
      from concurrent.futures import ProcessPoolExecutor
      from .bed_to_bigwig import bed_to_bigwig
      from . import make_dirs
-     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
+     from .plot_bed_histograms import plot_bed_histograms

      threads = threads or os.cpu_count()  # Use max available cores if not specified

@@ -30,45 +32,54 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):

      bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))

-     print(f"Creating BED from BAM: {aligned_BAM} using {threads} threads...")
+     print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")

-     # Convert BAM to BED format
-     with open(bed_output, "w") as output_file:
-         samtools_view = subprocess.Popen(["samtools", "view", "-@", str(threads), aligned_BAM], stdout=subprocess.PIPE)
-         awk_process = subprocess.Popen(
-             ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'],
-             stdin=samtools_view.stdout,
-             stdout=output_file
-         )
+     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+         for read in bam.fetch(until_eof=True):
+             if read.is_unmapped:
+                 chrom = "*"
+                 start1 = 1
+                 rl = read.query_length or 0
+                 mapq = 0
+             else:
+                 chrom = bam.get_reference_name(read.reference_id)
+                 # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                 start1 = int(read.reference_start) + 1
+                 rl = read.query_length or 0
+                 mapq = int(read.mapping_quality)

-         samtools_view.stdout.close()
-         awk_process.wait()
-         samtools_view.wait()
+             # End position in 1-based inclusive coords
+             end1 = start1 + (rl or 0) - 1

-     print(f"BED file created: {bed_output}")
+             qname = read.query_name
+             quals = read.query_qualities
+             if quals is None or rl == 0:
+                 avg_q = float("nan")
+             else:
+                 avg_q = float(np.mean(quals))
+
+             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+     print(f"BED-like file created: {bed_output}")

      def split_bed(bed):
-         """Splits BED into aligned and unaligned reads."""
+         """Splits into aligned and unaligned reads (chrom == '*')."""
          aligned = bed.replace(".bed", "_aligned.bed")
          unaligned = bed.replace(".bed", "_unaligned.bed")
-
          with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
              for line in infile:
-                 (unaligned_out if line.startswith("*") else aligned_out).write(line)
-
+                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
          os.remove(bed)
          return aligned

-     print(f"Splitting BED: {bed_output}")
+     print(f"Splitting: {bed_output}")
      aligned_bed = split_bed(bed_output)

-     with ProcessPoolExecutor() as executor:  # Use processes instead of threads
+     with ProcessPoolExecutor() as executor:
          futures = []
-         futures.append(executor.submit(plot_read_length_and_coverage_histograms, aligned_bed, plotting_dir))
+         futures.append(executor.submit(plot_bed_histograms, aligned_bed, plotting_dir, fasta))
          if make_bigwigs:
              futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
-
-         # Wait for all tasks to complete
          concurrent.futures.wait(futures)

      print("Processing completed successfully.")