smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +9 -4
- smftools/_version.py +1 -1
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +0 -2
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/fast5_to_pod5.py +4 -1
- smftools/informatics/helpers/__init__.py +3 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
- smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
- smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +29 -3
- smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
- smftools/informatics/helpers/find_conversion_sites.py +5 -4
- smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
- smftools/informatics/helpers/split_and_index_BAM.py +1 -5
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/general_plotting.py +566 -89
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +13 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +849 -43
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/__init__.py
ADDED
@@ -0,0 +1,20 @@
+from .apply_hmm_batched import apply_hmm_batched
+from .calculate_distances import calculate_distances
+from .call_hmm_peaks import call_hmm_peaks
+from .display_hmm import display_hmm
+from .hmm_readwrite import load_hmm, save_hmm
+from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
+from .train_hmm import train_hmm
+
+
+__all__ = [
+    "apply_hmm_batched",
+    "calculate_distances",
+    "call_hmm_peaks",
+    "display_hmm",
+    "load_hmm",
+    "refine_nucleosome_calls",
+    "infer_nucleosomes_in_large_bound",
+    "save_hmm",
+    "train_hmm"
+]
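The HMM utilities that lived under smftools.tools in 0.1.7 are now grouped in a dedicated smftools.hmm subpackage. A minimal import sketch, assuming smftools 0.2.1 is installed; the names come from the __all__ above, the comments are editorial:

# Hypothetical usage sketch; only the import paths come from this diff.
from smftools.hmm import (
    apply_hmm_batched,   # batched HMM annotation of an AnnData object
    call_hmm_peaks,      # peak calling on HMM-derived feature layers
    load_hmm,            # deserialize a saved model
    save_hmm,            # serialize a trained model
    train_hmm,           # fit a 2-state DenseHMM on binary data
)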
smftools/{tools → hmm}/apply_hmm_batched.py
RENAMED
@@ -3,14 +3,11 @@ import pandas as pd
 import torch
 from tqdm import tqdm
 
-def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
+def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A", "C"], device="cpu", threshold=0.7, deaminase_footprinting=False):
     """
     Applies an HMM model to an AnnData object using tensor-based sequence inputs.
     If multiple methbases are passed, generates a combined feature set.
     """
-    import numpy as np
-    import torch
-    from tqdm import tqdm
 
     model.to(device)
 
@@ -74,6 +71,7 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
     for methbase in methbases:
         mask = {
             "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
+            "c": ref_subset.var[f"{ref}_any_C_site"] == True,
             "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
             "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
         }[methbase.lower()]
@@ -150,6 +148,8 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
             adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
 
     # --- Binarization + Distance ---
+    coordinates = adata.var_names.astype(int).values
+
     for feature in tqdm(all_features, desc="Finalizing Layers"):
         bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
         counts = np.zeros(adata.shape[0], dtype=int)
@@ -158,9 +158,11 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
             intervals = []
         for start, length, prob in intervals:
             if prob > threshold:
-
+                start_idx = np.searchsorted(coordinates, start, side="left")
+                end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
+                bin_matrix[row_idx, start_idx:end_idx] = 1
                 counts[row_idx] += 1
-        adata.layers[
+        adata.layers[feature] = bin_matrix
         adata.obs[f"n_{feature}"] = counts
         adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)
 
@@ -202,7 +204,6 @@ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, cla
     Returns:
         List of classifications for each sequence.
     """
-    import numpy as np
 
     state_labels = ["Non-Methylated", "Methylated"]
     target_idx = state_labels.index(target_state)
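The added searchsorted lines are the substantive part of this hunk: each (start, length, probability) interval is mapped onto the integer var_names grid and painted into the binary layer. A self-contained sketch of that mapping, with toy coordinates rather than package code:

import numpy as np

coordinates = np.array([100, 105, 110, 115, 120, 125])  # adata.var_names as ints
bin_row = np.zeros(coordinates.size, dtype=int)

start, length = 104, 12  # a feature spanning genomic positions 104..115
start_idx = np.searchsorted(coordinates, start, side="left")
end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
bin_row[start_idx:end_idx] = 1

print(bin_row)  # [0 1 1 1 0 0]: columns 105, 110, 115 fall inside the interval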
smftools/hmm/call_hmm_peaks.py
ADDED
@@ -0,0 +1,106 @@
+def call_hmm_peaks(
+    adata,
+    feature_configs,
+    obs_column='Reference_strand',
+    site_types=['GpC_site', 'CpG_site'],
+    save_plot=False,
+    output_dir=None,
+    date_tag=None,
+    inplace=False
+):
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from scipy.signal import find_peaks
+
+    if not inplace:
+        adata = adata.copy()
+
+    # Ensure obs_column is categorical
+    if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
+        adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])
+
+    coordinates = adata.var_names.astype(int).values
+    peak_columns = []
+
+    obs_updates = {}
+
+    for feature_layer, config in feature_configs.items():
+        min_distance = config.get('min_distance', 200)
+        peak_width = config.get('peak_width', 200)
+        peak_prominence = config.get('peak_prominence', 0.2)
+        peak_threshold = config.get('peak_threshold', 0.8)
+
+        matrix = adata.layers[feature_layer]
+        means = np.mean(matrix, axis=0)
+        peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
+        peak_centers = coordinates[peak_indices]
+        adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()
+
+        # Plot
+        plt.figure(figsize=(6, 3))
+        plt.plot(coordinates, means)
+        plt.title(f"{feature_layer} with peak calls")
+        plt.xlabel("Genomic position")
+        plt.ylabel("Mean intensity")
+        for i, center in enumerate(peak_centers):
+            start, end = center - peak_width // 2, center + peak_width // 2
+            plt.axvspan(start, end, color='purple', alpha=0.2)
+            plt.axvline(center, color='red', linestyle='--')
+            aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
+            plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
+        if save_plot and output_dir:
+            filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
+            plt.savefig(filename, bbox_inches='tight')
+            print(f"Saved plot to {filename}")
+        else:
+            plt.show()
+
+        feature_peak_columns = []
+        for center in peak_centers:
+            start, end = center - peak_width // 2, center + peak_width // 2
+            colname = f'{feature_layer}_peak_{center}'
+            peak_columns.append(colname)
+            feature_peak_columns.append(colname)
+
+            peak_mask = (coordinates >= start) & (coordinates <= end)
+            adata.var[colname] = peak_mask
+
+            region = matrix[:, peak_mask]
+            obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
+            obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
+            obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold
+
+            for site_type in site_types:
+                adata.obs[f'{site_type}_sum_around_{center}'] = 0
+                adata.obs[f'{site_type}_mean_around_{center}'] = np.nan
+
+            for ref in adata.obs[obs_column].cat.categories:
+                ref_idx = adata.obs[obs_column] == ref
+                for site_type in site_types:
+                    mask_key = f"{ref}_{site_type}"
+                    if mask_key not in adata.var:
+                        continue
+                    site_mask = adata.var[mask_key].values
+                    site_coords = coordinates[site_mask]
+                    region_mask = (site_coords >= start) & (site_coords <= end)
+                    if not region_mask.any():
+                        continue
+                    full_mask = site_mask.copy()
+                    full_mask[site_mask] = region_mask
+                    site_region = adata[ref_idx, full_mask].X
+                    if hasattr(site_region, "A"):
+                        site_region = site_region.A
+                    if site_region.shape[1] > 0:
+                        adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
+                        adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)
+                    else:
+                        pass
+
+        adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
+        print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")
+
+    adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
+    adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)
+
+    return adata if not inplace else None
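call_hmm_peaks delegates the detection itself to scipy.signal.find_peaks on the column-wise mean of a feature layer, then maps peak indices back to genomic coordinates. A runnable toy version of that core step, using the default prominence (0.2) from the diff; note that find_peaks' distance argument counts var columns, not base pairs, so min_distance=200 assumes densely spaced columns:

import numpy as np
from scipy.signal import find_peaks

coordinates = np.arange(1000, 3000, 10)  # one var column per 10 bp
means = (np.exp(-0.5 * ((coordinates - 1500) / 60.0) ** 2)
         + 0.8 * np.exp(-0.5 * ((coordinates - 2400) / 60.0) ** 2))

# distance is in samples; 20 columns at 10 bp spacing ≈ 200 bp
peak_indices, _ = find_peaks(means, prominence=0.2, distance=20)
peak_centers = coordinates[peak_indices]
print(peak_centers)  # [1500 2400]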
smftools/{tools → hmm}/display_hmm.py
RENAMED
@@ -1,16 +1,16 @@
 def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
     import torch
-    print("\n
+    print("\n**HMM Model Overview**")
     print(hmm)
 
-    print("\n
+    print("\n**Transition Matrix**")
     transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
     for i, row in enumerate(transition_matrix):
         label = state_labels[i] if state_labels else f"State {i}"
         formatted_row = ", ".join(f"{p:.6f}" for p in row)
         print(f"{label}: [{formatted_row}]")
 
-    print("\n
+    print("\n**Emission Probabilities**")
     for i, dist in enumerate(hmm.distributions):
         label = state_labels[i] if state_labels else f"State {i}"
         probs = dist.probs.detach().cpu().numpy()
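The torch.exp(hmm.edges) call works because pomegranate-style HMMs store transition weights as log-probabilities; exponentiating recovers a row-stochastic matrix. A toy check, independent of any trained model:

import torch

log_edges = torch.log(torch.tensor([[0.9, 0.1],
                                    [0.2, 0.8]]))
transition_matrix = torch.exp(log_edges).detach().cpu().numpy()
print(transition_matrix)              # recovers the original probabilities
print(transition_matrix.sum(axis=1))  # each row sums to 1.0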
smftools/{tools → hmm}/nucleosome_hmm_refinement.py
RENAMED
@@ -56,7 +56,7 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
     adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
     adata.layers[f"{layer_name}_octamers"] = octamer_layer
 
-    print(f"
+    print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
     return adata
 
 def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
@@ -100,5 +100,5 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
            pos_cursor += 1
 
     adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
-    print(f"
+    print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
     return adata
smftools/{tools → hmm}/train_hmm.py
RENAMED
@@ -11,7 +11,7 @@ def train_hmm(
     pad_value=0,
 ):
     """
-    Trains a 2-state DenseHMM model on binary methylation data.
+    Trains a 2-state DenseHMM model on binary methylation/deamination data.
 
     Parameters:
         data (list or np.ndarray): List of sequences (lists) with 0, 1, or NaN.
smftools/informatics/__init__.py
CHANGED
@@ -1,6 +1,5 @@
 from . import helpers
 from .basecall_pod5s import basecall_pod5s
-from .load_adata import load_adata
 from .subsample_fasta_from_bed import subsample_fasta_from_bed
 from .subsample_pod5 import subsample_pod5
 from .fast5_to_pod5 import fast5_to_pod5
@@ -8,7 +7,6 @@ from .fast5_to_pod5 import fast5_to_pod5
 
 __all__ = [
     "basecall_pod5s",
-    "load_adata",
     "subsample_fasta_from_bed",
     "subsample_pod5",
     "fast5_to_pod5",
smftools/informatics/archived/deaminase_smf.py
ADDED
@@ -0,0 +1,132 @@
+
+def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
+    """
+    Processes sequencing data from a conversion SMF experiment to an adata object.
+
+    Parameters:
+        fasta (str): File path to the reference genome to align to.
+        output_directory (str): A file path to the directory to output all the analyses.
+        conversion_types (list): A list of strings of the conversion types to use in the analysis.
+        strands (list): A list of conversion strands to use in the experiment.
+        model_dir (str): A string representing the file path to the dorado basecalling model directory.
+        model (str): A string representing the dorado basecalling model.
+        input_data_path (str): A string representing the file path to the experiment directory/file containing sequencing data.
+        split_dir (str): A string representing the file path to the directory to split the BAMs into.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        mapping_threshold (float): A value between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        bam_suffix (str): A suffix to add to the bam file.
+        basecall (bool): Whether to run basecalling.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): Device to use for basecalling: auto, metal, cpu, or cuda.
+        make_bigwigs (bool): Whether to make bigwigs.
+        threads (int): CPU threads available for processing.
+        input_already_demuxed (bool): Whether the input files were already demultiplexed.
+
+    Returns:
+        final_adata (AnnData) and final_adata_path (str): The final adata object and its path.
+        sorted_output (str) and bam_files (list): The aligned, sorted BAM path and the demultiplexed BAM paths.
+    """
+    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
+    import os
+    import shutil
+    import glob
+
+    if basecall:
+        model_basename = os.path.basename(model)
+        model_basename = model_basename.replace('.', '_')
+        bam = f"{output_directory}/{model_basename}_canonical_basecalls"
+    else:
+        bam_base = os.path.basename(input_data_path).split('.bam')[0]
+        bam = os.path.join(output_directory, bam_base)
+    aligned_BAM = f"{bam}_aligned"
+    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+
+    os.chdir(output_directory)
+
+    # 1) Convert FASTA file
+    fasta_basename = os.path.basename(fasta)
+    converted_FASTA_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
+    converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
+    if 'converted.fa' in fasta:
+        print(fasta + ' is already converted. Using existing converted FASTA.')
+        converted_FASTA = fasta
+    elif os.path.exists(converted_FASTA):
+        print(converted_FASTA + ' already exists. Using existing converted FASTA.')
+    else:
+        generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
+
+    # Make a FAI and .chrom.names file for the converted fasta
+    get_chromosome_lengths(converted_FASTA)
+
+    # 2) Basecall from the input POD5 to generate a singular output BAM
+    if basecall:
+        canoncall_output = bam + bam_suffix
+        if os.path.exists(canoncall_output):
+            print(canoncall_output + ' already exists. Using existing basecalled BAM.')
+        else:
+            canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
+    else:
+        canoncall_output = input_data_path
+
+    # 3) Align the BAM to the reference FASTA and sort the BAM on positional coordinates. Also make an index and a bed file of mapped reads
+    aligned_output = aligned_BAM + bam_suffix
+    sorted_output = aligned_sorted_BAM + bam_suffix
+    if os.path.exists(aligned_output) and os.path.exists(sorted_output):
+        print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
+    else:
+        align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads, deaminase_alignment=True)
+
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(output_directory, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
+    else:
+        aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
+
+    ### 4) Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory ###
+    if barcode_both_ends:
+        split_dir = split_dir + '_both_ends_barcoded'
+    else:
+        split_dir = split_dir + '_at_least_one_end_barcoded'
+
+    if os.path.isdir(split_dir):
+        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
+        bam_pattern = '*' + bam_suffix
+        bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+        bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
+        bam_files.sort()
+    else:
+        make_dirs([split_dir])
+        if input_already_demuxed:
+            bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory)  # custom for non-nanopore
+        else:
+            bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
+
+    # Make beds and provide basic histograms
+    bed_dir = os.path.join(split_dir, 'beds')
+    if os.path.isdir(bed_dir):
+        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
+    else:
+        for bam in bam_files:
+            aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
+
+    # 5) Samtools QC metrics on split BAM files
+    bam_qc_dir = f"{split_dir}/bam_qc"
+    if os.path.isdir(bam_qc_dir):
+        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
+    else:
+        make_dirs([bam_qc_dir])
+        bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
+
+    # multiqc ###
+    if os.path.isdir(f"{split_dir}/multiqc"):
+        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
+    else:
+        run_multiqc(split_dir, f"{split_dir}/multiqc")
+
+    # 6) Take the converted BAM and load it into an adata object.
+    final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device, deaminase_footprinting=True)
+
+    return final_adata, final_adata_path, sorted_output, bam_files
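Step 1's reuse logic keys entirely off the file name: a reference whose path contains 'converted.fa' is treated as already converted, otherwise a sibling *_converted.fasta path is derived in the output directory. A runnable sketch of that naming rule (the helper function is hypothetical, not package code):

import os

def converted_fasta_path(fasta, output_directory):
    # Mirrors the naming logic in deaminase_smf step 1
    if 'converted.fa' in fasta:
        return fasta  # input is already a converted reference
    fasta_basename = os.path.basename(fasta)
    converted_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
    return os.path.join(output_directory, converted_basename)

print(converted_fasta_path('/refs/genome.fasta', '/out'))            # /out/genome_converted.fasta
print(converted_fasta_path('/refs/genome_converted.fasta', '/out'))  # /refs/genome_converted.fasta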
smftools/informatics/fast5_to_pod5.py
CHANGED
@@ -15,7 +15,10 @@ def fast5_to_pod5(fast5_dir, output_pod5='FAST5s_to_POD5.pod5'):
     import subprocess
     from pathlib import Path
 
-    if
+    if isinstance(fast5_dir, (list, tuple)):
+        cmd = ["pod5", "convert", "fast5"] + fast5_dir + ["--output", output_pod5]
+        subprocess.run(cmd)
+    elif Path(fast5_dir).is_file():
         subprocess.run(["pod5", "convert", "fast5", fast5_dir, "--output", output_pod5])
     elif Path(fast5_dir).is_dir():
         subprocess.run(["pod5", "convert", "fast5", f".{fast5_dir}*.fast5", "--output", output_pod5])
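The new first branch splices a list of FAST5 paths directly into the pod5 argument vector, since the CLI accepts multiple inputs. A sketch that only assembles and prints the command; the paths are hypothetical, and actually running it requires the pod5 tool:

fast5_inputs = ["run1/reads_0.fast5", "run1/reads_1.fast5"]  # hypothetical paths
output_pod5 = "FAST5s_to_POD5.pod5"

cmd = ["pod5", "convert", "fast5"] + fast5_inputs + ["--output", output_pod5]
print(cmd)  # pass to subprocess.run(cmd) to execute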
smftools/informatics/helpers/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
 from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
 from .count_aligned_reads import count_aligned_reads
 from .demux_and_index_BAM import demux_and_index_BAM
+from .discover_input_files import *
 from .extract_base_identities import extract_base_identities
 from .extract_mods import extract_mods
 from .extract_read_features_from_bam import extract_read_features_from_bam
@@ -19,7 +20,6 @@ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_F
 from .get_chromosome_lengths import get_chromosome_lengths
 from .get_native_references import get_native_references
 from .index_fasta import index_fasta
-from .LoadExperimentConfig import LoadExperimentConfig
 from .make_dirs import make_dirs
 from .make_modbed import make_modbed
 from .modcall import modcall
@@ -29,7 +29,7 @@ from .one_hot_encode import one_hot_encode
 from .ohe_batching import ohe_batching
 from .one_hot_decode import one_hot_decode
 from .ohe_layers_decode import ohe_layers_decode
-from .
+from .plot_bed_histograms import plot_bed_histograms
 from .run_multiqc import run_multiqc
 from .separate_bam_by_bc import separate_bam_by_bc
 from .split_and_index_BAM import split_and_index_BAM
@@ -57,7 +57,6 @@ __all__ = [
     "get_chromosome_lengths",
     "get_native_references",
     "index_fasta",
-    "LoadExperimentConfig",
     "make_dirs",
     "make_modbed",
     "modcall",
@@ -67,7 +66,7 @@ __all__ = [
     "ohe_batching",
     "one_hot_decode",
     "ohe_layers_decode",
-    "
+    "plot_bed_histograms",
     "run_multiqc",
     "separate_bam_by_bc",
     "split_and_index_BAM"
smftools/informatics/helpers/align_and_sort_BAM.py
CHANGED
@@ -1,6 +1,13 @@
 ## align_and_sort_BAM
 
-def align_and_sort_BAM(fasta,
+def align_and_sort_BAM(fasta,
+                       input,
+                       bam_suffix='.bam',
+                       output_directory='aligned_outputs',
+                       make_bigwigs=False,
+                       threads=None,
+                       aligner='minimap2',
+                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
     """
     A wrapper for running dorado aligner and samtools functions
 
@@ -11,6 +18,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
         output_directory (str): A file path to the directory to output all the analyses.
         make_bigwigs (bool): Whether to make bigwigs
         threads (int): Number of additional threads to use
+        aligner (str): Aligner to use. minimap2 and dorado options
+        aligner_args (list): list of optional parameters to use for the alignment
 
     Returns:
         None
@@ -21,6 +30,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
 
     input_basename = os.path.basename(input)
     input_suffix = '.' + input_basename.split('.')[1]
+    input_as_fastq = input_basename.split('.')[0] + '.fastq'
 
     output_path_minus_suffix = os.path.join(output_directory, input_basename.split(input_suffix)[0])
 
@@ -34,13 +44,30 @@ def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligne
     else:
         pass
 
-
-
-
-
+    if aligner == 'minimap2':
+        print(f"Converting BAM to FASTQ: {input}")
+        bam_to_fastq_command = ['samtools', 'fastq', input]
+        subprocess.run(bam_to_fastq_command, stdout=open(input_as_fastq, "w"))
+        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
+        if threads:
+            minimap_command = ['minimap2'] + aligner_args + ['-t', threads, fasta, input_as_fastq]
+        else:
+            minimap_command = ['minimap2'] + aligner_args + [fasta, input_as_fastq]
+        subprocess.run(minimap_command, stdout=open(aligned_output, "w"))
+        os.remove(input_as_fastq)
+
+    elif aligner == 'dorado':
+        # Run dorado aligner
+        print(f"Aligning BAM to Reference: {input}")
+        if threads:
+            alignment_command = ["dorado", "aligner", "-t", threads] + aligner_args + [fasta, input]
+        else:
+            alignment_command = ["dorado", "aligner"] + aligner_args + [fasta, input]
+        subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
+
     else:
-
-
+        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
+        return
 
     # Sort the BAM on positional coordinates
     print(f"Sorting BAM: {aligned_output}")
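A sketch of how the minimap2 invocation above is assembled (the helper is hypothetical, not package code). One caveat worth flagging: subprocess.run requires string arguments, and the diff passes the integer threads value through unchanged, so a str() cast is safer:

DEFAULT_ARGS = ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']

def build_minimap2_command(fasta, fastq, threads=None, aligner_args=DEFAULT_ARGS):
    # Assemble the argument vector the same way the diff does
    cmd = ['minimap2'] + list(aligner_args)
    if threads:
        cmd += ['-t', str(threads)]  # str() avoids a TypeError in subprocess
    return cmd + [fasta, fastq]

print(build_minimap2_command('ref_converted.fasta', 'reads.fastq', threads=8))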
smftools/informatics/helpers/aligned_BAM_to_bed.py
CHANGED
@@ -1,7 +1,7 @@
 def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
     Takes an aligned BAM as input and writes a BED file of reads as output.
-    Bed columns are: Record name, start position, end position, read length, read name.
+    Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
 
     Parameters:
         aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
@@ -15,11 +15,13 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
     import subprocess
     import os
+    import pysam
+    import numpy as np
     import concurrent.futures
     from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
     from . import make_dirs
-    from .
+    from .plot_bed_histograms import plot_bed_histograms
 
     threads = threads or os.cpu_count()  # Use max available cores if not specified
 
@@ -30,45 +32,54 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
 
     bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
 
-    print(f"Creating BED from BAM: {aligned_BAM}
+    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
 
-
-
-
-
-
-
-
+    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
+        for read in bam.fetch(until_eof=True):
+            if read.is_unmapped:
+                chrom = "*"
+                start1 = 1
+                rl = read.query_length or 0
+                mapq = 0
+            else:
+                chrom = bam.get_reference_name(read.reference_id)
+                # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                start1 = int(read.reference_start) + 1
+                rl = read.query_length or 0
+                mapq = int(read.mapping_quality)
 
-
-
-            samtools_view.wait()
+            # End position in 1-based inclusive coords
+            end1 = start1 + (rl or 0) - 1
 
-
+            qname = read.query_name
+            quals = read.query_qualities
+            if quals is None or rl == 0:
+                avg_q = float("nan")
+            else:
+                avg_q = float(np.mean(quals))
+
+            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+
+    print(f"BED-like file created: {bed_output}")
 
     def split_bed(bed):
-        """Splits
+        """Splits into aligned and unaligned reads (chrom == '*')."""
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
-
         with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
             for line in infile:
-                (unaligned_out if line.startswith("
-
+                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
         os.remove(bed)
         return aligned
 
-    print(f"Splitting
+    print(f"Splitting: {bed_output}")
    aligned_bed = split_bed(bed_output)
 
-    with ProcessPoolExecutor() as executor:
+    with ProcessPoolExecutor() as executor:
        futures = []
-        futures.append(executor.submit(
+        futures.append(executor.submit(plot_bed_histograms, aligned_bed, plotting_dir, fasta))
        if make_bigwigs:
            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
-
-        # Wait for all tasks to complete
        concurrent.futures.wait(futures)
 
    print("Processing completed successfully.")
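The per-read arithmetic in the new pysam loop, condensed into a self-contained example: pysam's 0-based reference_start becomes a 1-based start, the inclusive end follows from the read length, and the mean base quality is reported to three decimals:

import numpy as np

reference_start = 1041  # as pysam reports it (0-based)
query_length = 250
query_qualities = np.random.default_rng(0).integers(10, 40, size=query_length)

start1 = reference_start + 1      # 1-based SAM-like start
end1 = start1 + query_length - 1  # 1-based inclusive end
avg_q = float(np.mean(query_qualities))

print(start1, end1, f"{avg_q:.3f}")  # 1042 1291 and the mean Phred score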