smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,20 @@
1
+ from .apply_hmm_batched import apply_hmm_batched
2
+ from .calculate_distances import calculate_distances
3
+ from .call_hmm_peaks import call_hmm_peaks
4
+ from .display_hmm import display_hmm
5
+ from .hmm_readwrite import load_hmm, save_hmm
6
+ from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
7
+ from .train_hmm import train_hmm
8
+
9
+
10
+ __all__ = [
11
+ "apply_hmm_batched",
12
+ "calculate_distances",
13
+ "call_hmm_peaks",
14
+ "display_hmm",
15
+ "load_hmm",
16
+ "refine_nucleosome_calls",
17
+ "infer_nucleosomes_in_large_bound",
18
+ "save_hmm",
19
+ "train_hmm"
20
+ ]
@@ -3,14 +3,11 @@ import pandas as pd
3
3
  import torch
4
4
  from tqdm import tqdm
5
5
 
6
- def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
6
+ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A", "C"], device="cpu", threshold=0.7, deaminase_footprinting=False):
7
7
  """
8
8
  Applies an HMM model to an AnnData object using tensor-based sequence inputs.
9
9
  If multiple methbases are passed, generates a combined feature set.
10
10
  """
11
- import numpy as np
12
- import torch
13
- from tqdm import tqdm
14
11
 
15
12
  model.to(device)
16
13
 
@@ -74,6 +71,7 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
74
71
  for methbase in methbases:
75
72
  mask = {
76
73
  "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
74
+ "c": ref_subset.var[f"{ref}_any_C_site"] == True,
77
75
  "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
78
76
  "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
79
77
  }[methbase.lower()]
@@ -150,6 +148,8 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
150
148
  adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
151
149
 
152
150
  # --- Binarization + Distance ---
151
+ coordinates = adata.var_names.astype(int).values
152
+
153
153
  for feature in tqdm(all_features, desc="Finalizing Layers"):
154
154
  bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
155
155
  counts = np.zeros(adata.shape[0], dtype=int)
@@ -158,9 +158,11 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
158
158
  intervals = []
159
159
  for start, length, prob in intervals:
160
160
  if prob > threshold:
161
- bin_matrix[row_idx, start:start+length] = 1
161
+ start_idx = np.searchsorted(coordinates, start, side="left")
162
+ end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
163
+ bin_matrix[row_idx, start_idx:end_idx] = 1
162
164
  counts[row_idx] += 1
163
- adata.layers[f"{feature}"] = bin_matrix
165
+ adata.layers[feature] = bin_matrix
164
166
  adata.obs[f"n_{feature}"] = counts
165
167
  adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)
166
168
 
@@ -202,7 +204,6 @@ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, cla
202
204
  Returns:
203
205
  List of classifications for each sequence.
204
206
  """
205
- import numpy as np
206
207
 
207
208
  state_labels = ["Non-Methylated", "Methylated"]
208
209
  target_idx = state_labels.index(target_state)
@@ -0,0 +1,106 @@
1
+ def call_hmm_peaks(
2
+ adata,
3
+ feature_configs,
4
+ obs_column='Reference_strand',
5
+ site_types=['GpC_site', 'CpG_site'],
6
+ save_plot=False,
7
+ output_dir=None,
8
+ date_tag=None,
9
+ inplace=False
10
+ ):
11
+ import numpy as np
12
+ import pandas as pd
13
+ import matplotlib.pyplot as plt
14
+ from scipy.signal import find_peaks
15
+
16
+ if not inplace:
17
+ adata = adata.copy()
18
+
19
+ # Ensure obs_column is categorical
20
+ if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
21
+ adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])
22
+
23
+ coordinates = adata.var_names.astype(int).values
24
+ peak_columns = []
25
+
26
+ obs_updates = {}
27
+
28
+ for feature_layer, config in feature_configs.items():
29
+ min_distance = config.get('min_distance', 200)
30
+ peak_width = config.get('peak_width', 200)
31
+ peak_prominence = config.get('peak_prominence', 0.2)
32
+ peak_threshold = config.get('peak_threshold', 0.8)
33
+
34
+ matrix = adata.layers[feature_layer]
35
+ means = np.mean(matrix, axis=0)
36
+ peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
37
+ peak_centers = coordinates[peak_indices]
38
+ adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()
39
+
40
+ # Plot
41
+ plt.figure(figsize=(6, 3))
42
+ plt.plot(coordinates, means)
43
+ plt.title(f"{feature_layer} with peak calls")
44
+ plt.xlabel("Genomic position")
45
+ plt.ylabel("Mean intensity")
46
+ for i, center in enumerate(peak_centers):
47
+ start, end = center - peak_width // 2, center + peak_width // 2
48
+ plt.axvspan(start, end, color='purple', alpha=0.2)
49
+ plt.axvline(center, color='red', linestyle='--')
50
+ aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
51
+ plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
52
+ if save_plot and output_dir:
53
+ filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
54
+ plt.savefig(filename, bbox_inches='tight')
55
+ print(f"Saved plot to {filename}")
56
+ else:
57
+ plt.show()
58
+
59
+ feature_peak_columns = []
60
+ for center in peak_centers:
61
+ start, end = center - peak_width // 2, center + peak_width // 2
62
+ colname = f'{feature_layer}_peak_{center}'
63
+ peak_columns.append(colname)
64
+ feature_peak_columns.append(colname)
65
+
66
+ peak_mask = (coordinates >= start) & (coordinates <= end)
67
+ adata.var[colname] = peak_mask
68
+
69
+ region = matrix[:, peak_mask]
70
+ obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
71
+ obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
72
+ obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold
73
+
74
+ for site_type in site_types:
75
+ adata.obs[f'{site_type}_sum_around_{center}'] = 0
76
+ adata.obs[f'{site_type}_mean_around_{center}'] = np.nan
77
+
78
+ for ref in adata.obs[obs_column].cat.categories:
79
+ ref_idx = adata.obs[obs_column] == ref
80
+ mask_key = f"{ref}_{site_type}"
81
+ for site_type in site_types:
82
+ if mask_key not in adata.var:
83
+ continue
84
+ site_mask = adata.var[mask_key].values
85
+ site_coords = coordinates[site_mask]
86
+ region_mask = (site_coords >= start) & (site_coords <= end)
87
+ if not region_mask.any():
88
+ continue
89
+ full_mask = site_mask.copy()
90
+ full_mask[site_mask] = region_mask
91
+ site_region = adata[ref_idx, full_mask].X
92
+ if hasattr(site_region, "A"):
93
+ site_region = site_region.A
94
+ if site_region.shape[1] > 0:
95
+ adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
96
+ adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)
97
+ else:
98
+ pass
99
+
100
+ adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
101
+ print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")
102
+
103
+ adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
104
+ adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)
105
+
106
+ return adata if not inplace else None
@@ -1,16 +1,16 @@
1
1
  def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
2
2
  import torch
3
- print("\n🔹 **HMM Model Overview**")
3
+ print("\n**HMM Model Overview**")
4
4
  print(hmm)
5
5
 
6
- print("\n🔹 **Transition Matrix**")
6
+ print("\n**Transition Matrix**")
7
7
  transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
8
8
  for i, row in enumerate(transition_matrix):
9
9
  label = state_labels[i] if state_labels else f"State {i}"
10
10
  formatted_row = ", ".join(f"{p:.6f}" for p in row)
11
11
  print(f"{label}: [{formatted_row}]")
12
12
 
13
- print("\n🔹 **Emission Probabilities**")
13
+ print("\n**Emission Probabilities**")
14
14
  for i, dist in enumerate(hmm.distributions):
15
15
  label = state_labels[i] if state_labels else f"State {i}"
16
16
  probs = dist.probs.detach().cpu().numpy()
@@ -56,7 +56,7 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
56
56
  adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
57
57
  adata.layers[f"{layer_name}_octamers"] = octamer_layer
58
58
 
59
- print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
59
+ print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
60
60
  return adata
61
61
 
62
62
  def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
@@ -100,5 +100,5 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
100
100
  pos_cursor += 1
101
101
 
102
102
  adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
103
- print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
103
+ print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
104
104
  return adata
@@ -11,7 +11,7 @@ def train_hmm(
11
11
  pad_value=0,
12
12
  ):
13
13
  """
14
- Trains a 2-state DenseHMM model on binary methylation data.
14
+ Trains a 2-state DenseHMM model on binary methylation/deamination data.
15
15
 
16
16
  Parameters:
17
17
  data (list or np.ndarray): List of sequences (lists) with 0, 1, or NaN.
@@ -1,16 +1,20 @@
1
- from . import helpers
2
- from .basecall_pod5s import basecall_pod5s
3
- from .load_adata import load_adata
4
- from .subsample_fasta_from_bed import subsample_fasta_from_bed
5
- from .subsample_pod5 import subsample_pod5
6
- from .fast5_to_pod5 import fast5_to_pod5
7
-
1
+ from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
2
+ from .basecalling import canoncall, modcall
3
+ from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
4
+ from .converted_BAM_to_adata import converted_BAM_to_adata
5
+ from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
6
+ from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
7
+ from .modkit_functions import extract_mods, make_modbed, modQC
8
+ from .modkit_extract_to_adata import modkit_extract_to_adata
9
+ from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
10
+ from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
11
+ from .run_multiqc import run_multiqc
8
12
 
9
13
  __all__ = [
10
14
  "basecall_pod5s",
11
- "load_adata",
15
+ "converted_BAM_to_adata",
12
16
  "subsample_fasta_from_bed",
13
17
  "subsample_pod5",
14
18
  "fast5_to_pod5",
15
- "helpers"
19
+ "run_multiqc"
16
20
  ]
@@ -0,0 +1,132 @@
1
+
2
+ def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
3
+ """
4
+ Processes sequencing data from a conversion SMF experiment to an adata object.
5
+
6
+ Parameters:
7
+ fasta (str): File path to the reference genome to align to.
8
+ output_directory (str): A file path to the directory to output all the analyses.
9
+ conversion_types (list): A list of strings of the conversion types to use in the analysis.
10
+ strands (list): A list of conversion strands to use in the experiment.
11
+ model_dir (str): a string representing the file path to the dorado basecalling model directory.
12
+ model (str): a string representing the dorado basecalling model.
13
+ input_data_path (str): a string representing the file path to the experiment directory/file containing sequencing data
14
+ split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
+ barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
+ mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
17
+ experiment_name (str): A string to provide an experiment name to the output adata file.
18
+ bam_suffix (str): A suffix to add to the bam file.
19
+ basecall (bool): Whether to go through basecalling or not.
20
+ barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
21
+ trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
22
+ device (str): Device to use for basecalling. auto, metal, cpu, cuda
23
+ make_bigwigs (bool): Whether to make bigwigs
24
+ threads (int): cpu threads available for processing.
25
+ input_already_demuxed (bool): Whether the input files were already demultiplexed
26
+
27
+ Returns:
28
+ final_adata_path (str): Path to the final adata object
29
+ sorted_output (str): Path to the aligned, sorted BAM
30
+ """
31
+ from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
32
+ import os
33
+ import shutil
34
+ import glob
35
+
36
+ if basecall:
37
+ model_basename = os.path.basename(model)
38
+ model_basename = model_basename.replace('.', '_')
39
+ bam=f"{output_directory}/{model_basename}_canonical_basecalls"
40
+ else:
41
+ bam_base=os.path.basename(input_data_path).split('.bam')[0]
42
+ bam=os.path.join(output_directory, bam_base)
43
+ aligned_BAM=f"{bam}_aligned"
44
+ aligned_sorted_BAM=f"{aligned_BAM}_sorted"
45
+
46
+ os.chdir(output_directory)
47
+
48
+ # 1) Convert FASTA file
49
+ fasta_basename = os.path.basename(fasta)
50
+ converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
51
+ converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
52
+ if 'converted.fa' in fasta:
53
+ print(fasta + ' is already converted. Using existing converted FASTA.')
54
+ converted_FASTA = fasta
55
+ elif os.path.exists(converted_FASTA):
56
+ print(converted_FASTA + ' already exists. Using existing converted FASTA.')
57
+ else:
58
+ generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
59
+
60
+ # Make a FAI and .chrom.names file for the converted fasta
61
+ get_chromosome_lengths(converted_FASTA)
62
+
63
+ # 2) Basecall from the input POD5 to generate a singular output BAM
64
+ if basecall:
65
+ canoncall_output = bam + bam_suffix
66
+ if os.path.exists(canoncall_output):
67
+ print(canoncall_output + ' already exists. Using existing basecalled BAM.')
68
+ else:
69
+ canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
70
+ else:
71
+ canoncall_output = input_data_path
72
+
73
+ # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
74
+ aligned_output = aligned_BAM + bam_suffix
75
+ sorted_output = aligned_sorted_BAM + bam_suffix
76
+ if os.path.exists(aligned_output) and os.path.exists(sorted_output):
77
+ print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
78
+ else:
79
+ align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads, deaminase_alignment=True)
80
+
81
+ # Make beds and provide basic histograms
82
+ bed_dir = os.path.join(output_directory, 'beds')
83
+ if os.path.isdir(bed_dir):
84
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
85
+ else:
86
+ aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
87
+
88
+ ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
89
+ if barcode_both_ends:
90
+ split_dir = split_dir + '_both_ends_barcoded'
91
+ else:
92
+ split_dir = split_dir + '_at_least_one_end_barcoded'
93
+
94
+ if os.path.isdir(split_dir):
95
+ print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
96
+ bam_pattern = '*' + bam_suffix
97
+ bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
98
+ bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
99
+ bam_files.sort()
100
+ else:
101
+ make_dirs([split_dir])
102
+ if input_already_demuxed:
103
+ bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory) # custom for non-nanopore
104
+ else:
105
+ bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
106
+
107
+ # Make beds and provide basic histograms
108
+ bed_dir = os.path.join(split_dir, 'beds')
109
+ if os.path.isdir(bed_dir):
110
+ print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
111
+ else:
112
+ for bam in bam_files:
113
+ aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
114
+
115
+ # 5) Samtools QC metrics on split BAM files
116
+ bam_qc_dir = f"{split_dir}/bam_qc"
117
+ if os.path.isdir(bam_qc_dir):
118
+ print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
119
+ else:
120
+ make_dirs([bam_qc_dir])
121
+ bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
122
+
123
+ # multiqc ###
124
+ if os.path.isdir(f"{split_dir}/multiqc"):
125
+ print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
126
+ else:
127
+ run_multiqc(split_dir, f"{split_dir}/multiqc")
128
+
129
+ # 6) Take the converted BAM and load it into an adata object.
130
+ final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device, deaminase_footprinting=True)
131
+
132
+ return final_adata, final_adata_path, sorted_output, bam_files
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+ import subprocess
3
+ from typing import Union, List
4
+
5
+ def fast5_to_pod5(
6
+ fast5_dir: Union[str, Path, List[Union[str, Path]]],
7
+ output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
8
+ ) -> None:
9
+ """
10
+ Convert Nanopore FAST5 files (single file, list of files, or directory)
11
+ into a single .pod5 output using the 'pod5 convert fast5' CLI tool.
12
+ """
13
+
14
+ output_pod5 = str(output_pod5) # ensure string
15
+
16
+ # 1) If user gives a list of FAST5 files
17
+ if isinstance(fast5_dir, (list, tuple)):
18
+ fast5_paths = [str(Path(f)) for f in fast5_dir]
19
+ cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
20
+ subprocess.run(cmd, check=True)
21
+ return
22
+
23
+ # Ensure Path object
24
+ p = Path(fast5_dir)
25
+
26
+ # 2) If user gives a single file
27
+ if p.is_file():
28
+ cmd = ["pod5", "convert", "fast5", str(p), "--output", output_pod5]
29
+ subprocess.run(cmd, check=True)
30
+ return
31
+
32
+ # 3) If user gives a directory → collect FAST5s
33
+ if p.is_dir():
34
+ fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
35
+ if not fast5_paths:
36
+ raise FileNotFoundError(f"No FAST5 files found in {p}")
37
+
38
+ cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
39
+ subprocess.run(cmd, check=True)
40
+ return
41
+
42
+ raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
43
+
@@ -0,0 +1,71 @@
1
+ # from .align_and_sort_BAM import align_and_sort_BAM
2
+ # from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ # from .bam_qc import bam_qc
4
+ # from .bed_to_bigwig import bed_to_bigwig
5
+ # from .binarize_converted_base_identities import binarize_converted_base_identities
6
+ # from .canoncall import canoncall
7
+ # from .complement_base_list import complement_base_list
8
+ # from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
9
+ # from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
10
+ # from .count_aligned_reads import count_aligned_reads
11
+ # from .demux_and_index_BAM import demux_and_index_BAM
12
+ # from .discover_input_files import *
13
+ # from .extract_base_identities import extract_base_identities
14
+ # from .extract_mods import extract_mods
15
+ # from .extract_read_features_from_bam import extract_read_features_from_bam
16
+ # from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
17
+ # from .extract_readnames_from_BAM import extract_readnames_from_BAM
18
+ # from .find_conversion_sites import find_conversion_sites
19
+ # from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
20
+ # from .get_chromosome_lengths import get_chromosome_lengths
21
+ # from .get_native_references import get_native_references
22
+ # from .index_fasta import index_fasta
23
+ # from .make_modbed import make_modbed
24
+ # from .modcall import modcall
25
+ # from .modkit_extract_to_adata import modkit_extract_to_adata
26
+ # from .modQC import modQC
27
+ # from .one_hot_encode import one_hot_encode
28
+ # from .ohe_batching import ohe_batching
29
+ # from .one_hot_decode import one_hot_decode
30
+ # from .ohe_layers_decode import ohe_layers_decode
31
+ # from .plot_bed_histograms import plot_bed_histograms
32
+ # from .run_multiqc import run_multiqc
33
+ # from .separate_bam_by_bc import separate_bam_by_bc
34
+ # from .split_and_index_BAM import split_and_index_BAM
35
+
36
+ # __all__ = [
37
+ # "align_and_sort_BAM",
38
+ # "aligned_BAM_to_bed",
39
+ # "bam_qc",
40
+ # "bed_to_bigwig",
41
+ # "binarize_converted_base_identities",
42
+ # "canoncall",
43
+ # "complement_base_list",
44
+ # "converted_BAM_to_adata_II",
45
+ # "concatenate_fastqs_to_bam",
46
+ # "count_aligned_reads",
47
+ # "demux_and_index_BAM",
48
+ # "extract_base_identities",
49
+ # "extract_mods",
50
+ # "extract_read_features_from_bam",
51
+ # "extract_read_lengths_from_bed",
52
+ # "extract_readnames_from_BAM",
53
+ # "find_conversion_sites",
54
+ # "convert_FASTA_record",
55
+ # "generate_converted_FASTA",
56
+ # "get_chromosome_lengths",
57
+ # "get_native_references",
58
+ # "index_fasta",
59
+ # "make_modbed",
60
+ # "modcall",
61
+ # "modkit_extract_to_adata",
62
+ # "modQC",
63
+ # "one_hot_encode",
64
+ # "ohe_batching",
65
+ # "one_hot_decode",
66
+ # "ohe_layers_decode",
67
+ # "plot_bed_histograms",
68
+ # "run_multiqc",
69
+ # "separate_bam_by_bc",
70
+ # "split_and_index_BAM"
71
+ # ]
@@ -0,0 +1,126 @@
1
+ from pathlib import Path
2
+ import os
3
+ import subprocess
4
+ from typing import List, Optional, Union
5
+ import pysam
6
+
7
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ using pysam. Writes unmapped or unaligned reads as-is.

    Parameters:
        bam_path: Input BAM (need not be indexed; streamed with until_eof).
        fastq_path: Output FASTQ path (overwritten if it exists).

    Note:
        Reads that carry no base qualities get a placeholder quality string
        of '!' (Phred 0) per base. The previous behavior wrote an empty
        quality line, which is malformed FASTQ (len(qual) must equal len(seq)).
    """
    bam_path = str(bam_path)
    fastq_path = str(fastq_path)
    # check_sq=False tolerates unaligned/header-less BAMs; until_eof streams
    # every record without requiring an index file.
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w") as fq:
        for r in bam.fetch(until_eof=True):
            # Skip secondary/supplementary if you want (optional):
            # if r.is_secondary or r.is_supplementary: continue
            name = r.query_name
            seq = r.query_sequence or ""
            # r.qual is None when the BAM stores no qualities ('*' field);
            # substitute Phred-0 so the FASTQ record stays well-formed.
            qual = r.qual or ("!" * len(seq))
            fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
22
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """Coordinate-sort *in_bam* into *out_bam* via pysam's samtools-sort wrapper."""
    # Assemble samtools-style argv: optional thread flag, then output and input.
    argv: List[str] = ["-@", str(threads)] if threads else []
    argv.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*argv)
30
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Index *bam_path* with pysam (accepts samtools-style arguments)."""
    # Prepend the thread flag only when requested, then hand off to samtools index.
    argv: List[str] = ["-@", str(threads)] if threads else []
    pysam.index(*argv, str(bam_path))
38
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=('-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no')):
    """
    A wrapper for running an aligner (minimap2 or dorado) and pysam sort/index.

    Parameters:
        fasta (str | Path): File path to the reference genome to align to.
        input (str | Path): File path to the basecalled file to align. Works for .bam and .fastq files.
        bam_suffix (str): The suffix to use for the BAM file.
        output_directory (str | Path): Directory to write the aligned outputs into.
        make_bigwigs (bool): Whether to make bigwigs (currently unused here; kept for interface compatibility).
        threads (int): Number of additional threads to use.
        aligner (str): Aligner to use. minimap2 and dorado options.
        aligner_args (sequence of str): Optional parameters to pass to the aligner.

    Returns:
        None
        Writes: 1) an aligned BAM, 2) an aligned_sorted BAM, 3) an index file for the aligned_sorted BAM.

    Raises:
        subprocess.CalledProcessError: if the aligner exits non-zero.
    """
    # Coerce to Path so plain-string arguments (including the default
    # output_directory) work; the original crashed on str inputs.
    input = Path(input)
    output_directory = Path(output_directory)

    input_as_fastq = input.with_name(input.stem + '.fastq')
    output_path_minus_suffix = output_directory / input.stem

    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    if threads:
        threads = str(threads)  # subprocess argv must be strings

    if aligner == 'minimap2':
        # minimap2 takes FASTQ input, so convert the BAM first.
        print(f"Converting BAM to FASTQ: {input}")
        _bam_to_fastq_with_pysam(input, input_as_fastq)
        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
        minimap_command = ['minimap2', *aligner_args]
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(input_as_fastq)]
        try:
            # Context manager closes the output handle (the original leaked it);
            # check=True surfaces aligner failures instead of silently continuing.
            with open(aligned_output, "w") as out:
                subprocess.run(minimap_command, stdout=out, check=True)
        finally:
            # Remove the temporary FASTQ even if alignment fails.
            if input_as_fastq.exists():
                os.remove(input_as_fastq)

    elif aligner == 'dorado':
        # Run dorado aligner directly on the BAM.
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += [*aligner_args, str(fasta), str(input)]
        with open(aligned_output, "wb") as out:
            subprocess.run(alignment_command, stdout=out, check=True)

    else:
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam (samtools sort/index equivalents) ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)