smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/hmm/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from .apply_hmm_batched import apply_hmm_batched
|
|
2
|
+
from .calculate_distances import calculate_distances
|
|
3
|
+
from .call_hmm_peaks import call_hmm_peaks
|
|
4
|
+
from .display_hmm import display_hmm
|
|
5
|
+
from .hmm_readwrite import load_hmm, save_hmm
|
|
6
|
+
from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
|
|
7
|
+
from .train_hmm import train_hmm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"apply_hmm_batched",
|
|
12
|
+
"calculate_distances",
|
|
13
|
+
"call_hmm_peaks",
|
|
14
|
+
"display_hmm",
|
|
15
|
+
"load_hmm",
|
|
16
|
+
"refine_nucleosome_calls",
|
|
17
|
+
"infer_nucleosomes_in_large_bound",
|
|
18
|
+
"save_hmm",
|
|
19
|
+
"train_hmm"
|
|
20
|
+
]
|
|
@@ -3,14 +3,11 @@ import pandas as pd
|
|
|
3
3
|
import torch
|
|
4
4
|
from tqdm import tqdm
|
|
5
5
|
|
|
6
|
-
def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
|
|
6
|
+
def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A", "C"], device="cpu", threshold=0.7, deaminase_footprinting=False):
|
|
7
7
|
"""
|
|
8
8
|
Applies an HMM model to an AnnData object using tensor-based sequence inputs.
|
|
9
9
|
If multiple methbases are passed, generates a combined feature set.
|
|
10
10
|
"""
|
|
11
|
-
import numpy as np
|
|
12
|
-
import torch
|
|
13
|
-
from tqdm import tqdm
|
|
14
11
|
|
|
15
12
|
model.to(device)
|
|
16
13
|
|
|
@@ -74,6 +71,7 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
|
|
|
74
71
|
for methbase in methbases:
|
|
75
72
|
mask = {
|
|
76
73
|
"a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
|
|
74
|
+
"c": ref_subset.var[f"{ref}_any_C_site"] == True,
|
|
77
75
|
"gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
|
|
78
76
|
"cpg": ref_subset.var[f"{ref}_CpG_site"] == True
|
|
79
77
|
}[methbase.lower()]
|
|
@@ -150,6 +148,8 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
|
|
|
150
148
|
adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
|
|
151
149
|
|
|
152
150
|
# --- Binarization + Distance ---
|
|
151
|
+
coordinates = adata.var_names.astype(int).values
|
|
152
|
+
|
|
153
153
|
for feature in tqdm(all_features, desc="Finalizing Layers"):
|
|
154
154
|
bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
|
|
155
155
|
counts = np.zeros(adata.shape[0], dtype=int)
|
|
@@ -158,9 +158,11 @@ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, acc
|
|
|
158
158
|
intervals = []
|
|
159
159
|
for start, length, prob in intervals:
|
|
160
160
|
if prob > threshold:
|
|
161
|
-
|
|
161
|
+
start_idx = np.searchsorted(coordinates, start, side="left")
|
|
162
|
+
end_idx = np.searchsorted(coordinates, start + length - 1, side="right")
|
|
163
|
+
bin_matrix[row_idx, start_idx:end_idx] = 1
|
|
162
164
|
counts[row_idx] += 1
|
|
163
|
-
adata.layers[
|
|
165
|
+
adata.layers[feature] = bin_matrix
|
|
164
166
|
adata.obs[f"n_{feature}"] = counts
|
|
165
167
|
adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)
|
|
166
168
|
|
|
@@ -202,7 +204,6 @@ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, cla
|
|
|
202
204
|
Returns:
|
|
203
205
|
List of classifications for each sequence.
|
|
204
206
|
"""
|
|
205
|
-
import numpy as np
|
|
206
207
|
|
|
207
208
|
state_labels = ["Non-Methylated", "Methylated"]
|
|
208
209
|
target_idx = state_labels.index(target_state)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
def call_hmm_peaks(
    adata,
    feature_configs,
    obs_column='Reference_strand',
    site_types=('GpC_site', 'CpG_site'),
    save_plot=False,
    output_dir=None,
    date_tag=None,
    inplace=False
):
    """
    Call peaks on the column-wise mean of feature layers and annotate the AnnData.

    For every layer named in ``feature_configs`` the per-position mean across
    reads is computed, peaks are located with ``scipy.signal.find_peaks``, a
    summary figure is drawn, and per-peak annotations are written to
    ``adata.var``, ``adata.obs`` and ``adata.uns``.

    Parameters:
        adata: AnnData-like object. ``var_names`` must be castable to int
            (they are used as coordinates) and every key of
            ``feature_configs`` must be present in ``adata.layers``.
        feature_configs (dict): Maps a layer name to a dict of optional
            settings: ``min_distance`` (default 200, passed to ``find_peaks``
            as ``distance``), ``peak_width`` (default 200, window drawn and
            summarised around each peak center), ``peak_prominence``
            (default 0.2) and ``peak_threshold`` (default 0.8, cutoff on the
            per-read window mean for the ``*_present_at_*`` flag).
        obs_column (str): Column of ``adata.obs`` used to group reads; it is
            converted to categorical if it is not already.
        site_types (sequence of str): Suffixes of boolean ``adata.var``
            columns named ``f"{category}_{site_type}"``. Missing columns are
            skipped.
        save_plot (bool): Save each figure instead of showing it. Only takes
            effect when ``output_dir`` is also given.
        output_dir (str | None): Directory for saved figures.
        date_tag (str | None): Filename prefix; ``'output'`` is used if falsy.
        inplace (bool): If False (default) work on a copy and return it.

    Returns:
        The annotated copy when ``inplace`` is False, otherwise None.

    Notes:
        The ``{site_type}_sum_around_{center}`` / ``_mean_around_`` columns are
        not keyed by layer name, so two layers with a peak at the same center
        write to the same columns.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.signal import find_peaks

    if not inplace:
        adata = adata.copy()

    # Ensure obs_column is categorical
    if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
        adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])

    coordinates = adata.var_names.astype(int).values
    peak_columns = []

    # Collected and concatenated once at the end instead of inserting
    # columns into adata.obs one at a time.
    obs_updates = {}

    for feature_layer, config in feature_configs.items():
        min_distance = config.get('min_distance', 200)
        peak_width = config.get('peak_width', 200)
        peak_prominence = config.get('peak_prominence', 0.2)
        peak_threshold = config.get('peak_threshold', 0.8)
        half_width = peak_width // 2

        matrix = adata.layers[feature_layer]
        means = np.mean(matrix, axis=0)
        peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
        peak_centers = coordinates[peak_indices]
        adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()

        # Plot
        fig = plt.figure(figsize=(6, 3))
        plt.plot(coordinates, means)
        plt.title(f"{feature_layer} with peak calls")
        plt.xlabel("Genomic position")
        plt.ylabel("Mean intensity")
        for i, center in enumerate(peak_centers):
            start, end = center - half_width, center + half_width
            plt.axvspan(start, end, color='purple', alpha=0.2)
            plt.axvline(center, color='red', linestyle='--')
            # Alternate label side so neighbouring peak labels do not overlap.
            aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
            plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
        if save_plot and output_dir:
            filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
            plt.savefig(filename, bbox_inches='tight')
            print(f"Saved plot to {filename}")
        else:
            plt.show()
        # Release the figure; otherwise one figure per layer stays open.
        plt.close(fig)

        feature_peak_columns = []
        for center in peak_centers:
            start, end = center - half_width, center + half_width
            colname = f'{feature_layer}_peak_{center}'
            peak_columns.append(colname)
            feature_peak_columns.append(colname)

            peak_mask = (coordinates >= start) & (coordinates <= end)
            adata.var[colname] = peak_mask

            region = matrix[:, peak_mask]
            region_mean = np.mean(region, axis=1)
            obs_updates[f'mean_{feature_layer}_around_{center}'] = region_mean
            obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
            obs_updates[f'{feature_layer}_present_at_{center}'] = region_mean > peak_threshold

            for site_type in site_types:
                adata.obs[f'{site_type}_sum_around_{center}'] = 0
                adata.obs[f'{site_type}_mean_around_{center}'] = np.nan

            for ref in adata.obs[obs_column].cat.categories:
                ref_idx = adata.obs[obs_column] == ref
                for site_type in site_types:
                    # Build the key inside the loop: it depends on site_type.
                    # Computing it once per ref reused the last site_type from
                    # the initialisation loop for every iteration.
                    mask_key = f"{ref}_{site_type}"
                    if mask_key not in adata.var:
                        continue
                    site_mask = adata.var[mask_key].values
                    site_coords = coordinates[site_mask]
                    region_mask = (site_coords >= start) & (site_coords <= end)
                    if not region_mask.any():
                        continue
                    # Narrow the site mask to sites inside the peak window.
                    full_mask = site_mask.copy()
                    full_mask[site_mask] = region_mask
                    site_region = adata[ref_idx, full_mask].X
                    # Densify sparse X; `.toarray()` is the supported method,
                    # `.A` is a deprecated alias.
                    if hasattr(site_region, "toarray"):
                        site_region = site_region.toarray()
                    if site_region.shape[1] > 0:
                        adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
                        adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)

        adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
        print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")

    adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
    adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)

    return adata if not inplace else None
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
def display_hmm(hmm, state_labels=["Non-Methylated", "Methylated"], obs_labels=["0", "1"]):
|
|
2
2
|
import torch
|
|
3
|
-
print("\n
|
|
3
|
+
print("\n**HMM Model Overview**")
|
|
4
4
|
print(hmm)
|
|
5
5
|
|
|
6
|
-
print("\n
|
|
6
|
+
print("\n**Transition Matrix**")
|
|
7
7
|
transition_matrix = torch.exp(hmm.edges).detach().cpu().numpy()
|
|
8
8
|
for i, row in enumerate(transition_matrix):
|
|
9
9
|
label = state_labels[i] if state_labels else f"State {i}"
|
|
10
10
|
formatted_row = ", ".join(f"{p:.6f}" for p in row)
|
|
11
11
|
print(f"{label}: [{formatted_row}]")
|
|
12
12
|
|
|
13
|
-
print("\n
|
|
13
|
+
print("\n**Emission Probabilities**")
|
|
14
14
|
for i, dist in enumerate(hmm.distributions):
|
|
15
15
|
label = state_labels[i] if state_labels else f"State {i}"
|
|
16
16
|
probs = dist.probs.detach().cpu().numpy()
|
|
@@ -56,7 +56,7 @@ def refine_nucleosome_calls(adata, layer_name, nan_mask_layer, hexamer_size=120,
|
|
|
56
56
|
adata.layers[f"{layer_name}_hexamers"] = hexamer_layer
|
|
57
57
|
adata.layers[f"{layer_name}_octamers"] = octamer_layer
|
|
58
58
|
|
|
59
|
-
print(f"
|
|
59
|
+
print(f"Added layers: {layer_name}_hexamers and {layer_name}_octamers")
|
|
60
60
|
return adata
|
|
61
61
|
|
|
62
62
|
def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_layer, nan_mask_layer, nuc_size=147, linker_size=50, exclusion_buffer=30, device="cpu"):
|
|
@@ -100,5 +100,5 @@ def infer_nucleosomes_in_large_bound(adata, large_bound_layer, combined_nuc_laye
|
|
|
100
100
|
pos_cursor += 1
|
|
101
101
|
|
|
102
102
|
adata.layers[f"{large_bound_layer}_phased_nucleosomes"] = inferred_layer
|
|
103
|
-
print(f"
|
|
103
|
+
print(f"Added layer: {large_bound_layer}_phased_nucleosomes")
|
|
104
104
|
return adata
|
|
@@ -11,7 +11,7 @@ def train_hmm(
|
|
|
11
11
|
pad_value=0,
|
|
12
12
|
):
|
|
13
13
|
"""
|
|
14
|
-
Trains a 2-state DenseHMM model on binary methylation data.
|
|
14
|
+
Trains a 2-state DenseHMM model on binary methylation/deamination data.
|
|
15
15
|
|
|
16
16
|
Parameters:
|
|
17
17
|
data (list or np.ndarray): List of sequences (lists) with 0, 1, or NaN.
|
smftools/informatics/__init__.py
CHANGED
|
@@ -1,16 +1,20 @@
|
|
|
1
|
-
from . import
|
|
2
|
-
from .
|
|
3
|
-
from .
|
|
4
|
-
from .
|
|
5
|
-
from .
|
|
6
|
-
from .
|
|
7
|
-
|
|
1
|
+
from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
|
|
2
|
+
from .basecalling import canoncall, modcall
|
|
3
|
+
from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
|
|
4
|
+
from .converted_BAM_to_adata import converted_BAM_to_adata
|
|
5
|
+
from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
|
|
6
|
+
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
|
|
7
|
+
from .modkit_functions import extract_mods, make_modbed, modQC
|
|
8
|
+
from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
9
|
+
from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
|
|
10
|
+
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
|
|
11
|
+
from .run_multiqc import run_multiqc
|
|
8
12
|
|
|
9
13
|
__all__ = [
|
|
10
14
|
"basecall_pod5s",
|
|
11
|
-
"
|
|
15
|
+
"converted_BAM_to_adata",
|
|
12
16
|
"subsample_fasta_from_bed",
|
|
13
17
|
"subsample_pod5",
|
|
14
18
|
"fast5_to_pod5",
|
|
15
|
-
"
|
|
19
|
+
"run_multiqc"
|
|
16
20
|
]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
|
|
2
|
+
def deaminase_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
    """
    Processes sequencing data from a deaminase SMF experiment to an adata object.

    Each stage is skipped when its output (file or directory) already exists,
    so the function can be re-run on a partially processed output directory.
    Note that the process working directory is changed to ``output_directory``.

    Parameters:
        fasta (str): File path to the reference genome to align to.
        output_directory (str): A file path to the directory to output all the analyses.
        conversion_types (list): A list of strings of the conversion types to use in the analysis.
        strands (list): A list of conversion strands to use in the experiment.
        model_dir (str): a string representing the file path to the dorado basecalling model directory.
        model (str): a string representing the dorado basecalling model.
        input_data_path (str): a string representing the file path to the experiment directory/file containing sequencing data
        split_dir (str): A string representing the file path to the directory to split the BAMs into. A suffix describing the barcoding mode is appended.
        barcode_kit (str): A string representing the barcoding kit used in the experiment.
        mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
        experiment_name (str): A string to provide an experiment name to the output adata file.
        bam_suffix (str): A suffix to add to the bam file.
        basecall (bool): Whether to go through basecalling or not. If False, input_data_path is used as the BAM to align.
        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
        device (str): Device to use for basecalling. auto, metal, cpu, cuda
        make_bigwigs (bool): Whether to make bigwigs
        threads (int): cpu threads available for processing.
        input_already_demuxed (bool): Whether the input files were already demultiplexed

    Returns:
        final_adata: The adata object returned by converted_BAM_to_adata_II
        final_adata_path (str): Path to the final adata object
        sorted_output (str): Path to the aligned, sorted BAM
        bam_files (list): Paths to the demultiplexed BAM files
    """
    from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
    import os
    import glob

    # Derive the BAM path stem: from the model name when basecalling,
    # otherwise from the name of the BAM that was passed in.
    if basecall:
        model_basename = os.path.basename(model)
        model_basename = model_basename.replace('.', '_')
        bam = f"{output_directory}/{model_basename}_canonical_basecalls"
    else:
        bam_base = os.path.basename(input_data_path).split('.bam')[0]
        bam = os.path.join(output_directory, bam_base)
    aligned_BAM = f"{bam}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"

    os.chdir(output_directory)

    # 1) Convert FASTA file
    fasta_basename = os.path.basename(fasta)
    converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
    converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
    if 'converted.fa' in fasta:
        print(fasta + ' is already converted. Using existing converted FASTA.')
        converted_FASTA = fasta
    elif os.path.exists(converted_FASTA):
        print(converted_FASTA + ' already exists. Using existing converted FASTA.')
    else:
        generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)

    # Make a FAI and .chrom.names file for the converted fasta
    get_chromosome_lengths(converted_FASTA)

    # 2) Basecall from the input POD5 to generate a singular output BAM
    if basecall:
        canoncall_output = bam + bam_suffix
        if os.path.exists(canoncall_output):
            print(canoncall_output + ' already exists. Using existing basecalled BAM.')
        else:
            canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        canoncall_output = input_data_path

    # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
    aligned_output = aligned_BAM + bam_suffix
    sorted_output = aligned_sorted_BAM + bam_suffix
    if os.path.exists(aligned_output) and os.path.exists(sorted_output):
        print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
    else:
        align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads, deaminase_alignment=True)

    # Make beds and provide basic histograms
    bed_dir = os.path.join(output_directory, 'beds')
    if os.path.isdir(bed_dir):
        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
    else:
        aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)

    ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
    if barcode_both_ends:
        split_dir = split_dir + '_both_ends_barcoded'
    else:
        split_dir = split_dir + '_at_least_one_end_barcoded'

    if os.path.isdir(split_dir):
        print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
        bam_pattern = '*' + bam_suffix
        bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
        # Drop index files and the unclassified bin.
        bam_files = [path for path in bam_files if '.bai' not in path and 'unclassified' not in path]
        bam_files.sort()
    else:
        make_dirs([split_dir])
        if input_already_demuxed:
            bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory) # custom for non-nanopore
        else:
            bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)

    # Make beds and provide basic histograms
    bed_dir = os.path.join(split_dir, 'beds')
    if os.path.isdir(bed_dir):
        print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
    else:
        for split_bam in bam_files:
            aligned_BAM_to_bed(split_bam, split_dir, converted_FASTA, make_bigwigs, threads)

    # 5) Samtools QC metrics on split BAM files
    bam_qc_dir = f"{split_dir}/bam_qc"
    if os.path.isdir(bam_qc_dir):
        print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
    else:
        make_dirs([bam_qc_dir])
        bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')

    # multiqc ###
    if os.path.isdir(f"{split_dir}/multiqc"):
        print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
    else:
        run_multiqc(split_dir, f"{split_dir}/multiqc")

    # 6) Take the converted BAM and load it into an adata object.
    final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device, deaminase_footprinting=True)

    return final_adata, final_adata_path, sorted_output, bam_files
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import subprocess
|
|
3
|
+
from typing import Union, List
|
|
4
|
+
|
|
5
|
+
def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 input into a single .pod5 output by invoking the
    'pod5 convert fast5' CLI tool.

    Parameters:
        fast5_dir: One of
            - a list/tuple of FAST5 file paths,
            - a path to a single FAST5 file,
            - a path to a directory; its top-level ``*.fast5`` files are used
              (sub-directories are not searched).
        output_pod5: Path of the .pod5 file passed to ``--output``.

    Returns:
        None

    Raises:
        FileNotFoundError: If the list/tuple is empty, the directory holds no
            ``*.fast5`` files, or the path is neither a file nor a directory.
        subprocess.CalledProcessError: If the pod5 command exits non-zero.
    """
    output_pod5 = str(output_pod5)  # subprocess arguments must be strings

    if isinstance(fast5_dir, (list, tuple)):
        # 1) Explicit collection of FAST5 files.
        fast5_paths = [str(Path(f)) for f in fast5_dir]
        if not fast5_paths:
            # Reject an empty collection up front, mirroring the empty
            # directory case, instead of launching pod5 with no inputs.
            raise FileNotFoundError("No FAST5 files provided")
    else:
        p = Path(fast5_dir)
        if p.is_file():
            # 2) A single FAST5 file.
            fast5_paths = [str(p)]
        elif p.is_dir():
            # 3) A directory: collect its FAST5 files in a stable order.
            fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
            if not fast5_paths:
                raise FileNotFoundError(f"No FAST5 files found in {p}")
        else:
            raise FileNotFoundError(f"Input path invalid: {fast5_dir}")

    cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
    subprocess.run(cmd, check=True)
|
|
43
|
+
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# from .align_and_sort_BAM import align_and_sort_BAM
|
|
2
|
+
# from .aligned_BAM_to_bed import aligned_BAM_to_bed
|
|
3
|
+
# from .bam_qc import bam_qc
|
|
4
|
+
# from .bed_to_bigwig import bed_to_bigwig
|
|
5
|
+
# from .binarize_converted_base_identities import binarize_converted_base_identities
|
|
6
|
+
# from .canoncall import canoncall
|
|
7
|
+
# from .complement_base_list import complement_base_list
|
|
8
|
+
# from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
|
|
9
|
+
# from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
|
|
10
|
+
# from .count_aligned_reads import count_aligned_reads
|
|
11
|
+
# from .demux_and_index_BAM import demux_and_index_BAM
|
|
12
|
+
# from .discover_input_files import *
|
|
13
|
+
# from .extract_base_identities import extract_base_identities
|
|
14
|
+
# from .extract_mods import extract_mods
|
|
15
|
+
# from .extract_read_features_from_bam import extract_read_features_from_bam
|
|
16
|
+
# from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
|
|
17
|
+
# from .extract_readnames_from_BAM import extract_readnames_from_BAM
|
|
18
|
+
# from .find_conversion_sites import find_conversion_sites
|
|
19
|
+
# from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
|
|
20
|
+
# from .get_chromosome_lengths import get_chromosome_lengths
|
|
21
|
+
# from .get_native_references import get_native_references
|
|
22
|
+
# from .index_fasta import index_fasta
|
|
23
|
+
# from .make_modbed import make_modbed
|
|
24
|
+
# from .modcall import modcall
|
|
25
|
+
# from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
26
|
+
# from .modQC import modQC
|
|
27
|
+
# from .one_hot_encode import one_hot_encode
|
|
28
|
+
# from .ohe_batching import ohe_batching
|
|
29
|
+
# from .one_hot_decode import one_hot_decode
|
|
30
|
+
# from .ohe_layers_decode import ohe_layers_decode
|
|
31
|
+
# from .plot_bed_histograms import plot_bed_histograms
|
|
32
|
+
# from .run_multiqc import run_multiqc
|
|
33
|
+
# from .separate_bam_by_bc import separate_bam_by_bc
|
|
34
|
+
# from .split_and_index_BAM import split_and_index_BAM
|
|
35
|
+
|
|
36
|
+
# __all__ = [
|
|
37
|
+
# "align_and_sort_BAM",
|
|
38
|
+
# "aligned_BAM_to_bed",
|
|
39
|
+
# "bam_qc",
|
|
40
|
+
# "bed_to_bigwig",
|
|
41
|
+
# "binarize_converted_base_identities",
|
|
42
|
+
# "canoncall",
|
|
43
|
+
# "complement_base_list",
|
|
44
|
+
# "converted_BAM_to_adata_II",
|
|
45
|
+
# "concatenate_fastqs_to_bam",
|
|
46
|
+
# "count_aligned_reads",
|
|
47
|
+
# "demux_and_index_BAM",
|
|
48
|
+
# "extract_base_identities",
|
|
49
|
+
# "extract_mods",
|
|
50
|
+
# "extract_read_features_from_bam",
|
|
51
|
+
# "extract_read_lengths_from_bed",
|
|
52
|
+
# "extract_readnames_from_BAM",
|
|
53
|
+
# "find_conversion_sites",
|
|
54
|
+
# "convert_FASTA_record",
|
|
55
|
+
# "generate_converted_FASTA",
|
|
56
|
+
# "get_chromosome_lengths",
|
|
57
|
+
# "get_native_references",
|
|
58
|
+
# "index_fasta",
|
|
59
|
+
# "make_modbed",
|
|
60
|
+
# "modcall",
|
|
61
|
+
# "modkit_extract_to_adata",
|
|
62
|
+
# "modQC",
|
|
63
|
+
# "one_hot_encode",
|
|
64
|
+
# "ohe_batching",
|
|
65
|
+
# "one_hot_decode",
|
|
66
|
+
# "ohe_layers_decode",
|
|
67
|
+
# "plot_bed_histograms",
|
|
68
|
+
# "run_multiqc",
|
|
69
|
+
# "separate_bam_by_bc",
|
|
70
|
+
# "split_and_index_BAM"
|
|
71
|
+
# ]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
from typing import List, Optional, Union
|
|
5
|
+
import pysam
|
|
6
|
+
|
|
7
|
+
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ conversion using pysam.

    Every record returned by ``fetch(until_eof=True)`` is written as one
    four-line FASTQ record (name, sequence, '+', quality). No records are
    filtered out, and BAM tags are not carried over.

    Parameters:
        bam_path: Path to the input BAM (opened with ``check_sq=False`` so a
            header without reference sequences is accepted).
        fastq_path: Path of the FASTQ file to write (overwritten if present).

    Returns:
        None
    """
    bam_path = str(bam_path)
    fastq_path = str(fastq_path)
    # Placeholder used when a record stores no base qualities ('!' is the
    # lowest Phred+33 character).
    missing_quality_char = "!"
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w") as fq:
        for r in bam.fetch(until_eof=True):
            # Skip secondary/supplementary if you want (optional):
            # if r.is_secondary or r.is_supplementary: continue
            name = r.query_name
            seq = r.query_sequence or ""
            qual = r.qual or ""
            if seq and not qual:
                # A FASTQ quality line must be as long as the sequence line;
                # an empty one makes the record malformed for downstream parsers.
                qual = missing_quality_char * len(seq)
            fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
|
|
21
|
+
|
|
22
|
+
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """
    Sort a BAM through ``pysam.sort`` using samtools-style arguments.

    Parameters:
        in_bam: Path to the BAM to sort.
        out_bam: Path of the sorted BAM to write (passed via ``-o``).
        threads: Optional thread count forwarded as ``-@``; omitted when falsy.

    Returns:
        None
    """
    # Only add the thread flag when a thread count was actually supplied.
    thread_flags = ["-@", str(threads)] if threads else []
    pysam.sort(*thread_flags, "-o", str(out_bam), str(in_bam))
|
|
29
|
+
|
|
30
|
+
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """
    Index a BAM through ``pysam.index`` using samtools-style arguments.

    Parameters:
        bam_path: Path to the BAM to index.
        threads: Optional thread count forwarded as ``-@``; omitted when falsy.

    Returns:
        None
    """
    # pysam.index accepts the same positional arguments as `samtools index`.
    index_args = []
    if threads:
        index_args.extend(["-@", str(threads)])
    index_args.append(str(bam_path))
    pysam.index(*index_args)
|
|
37
|
+
|
|
38
|
+
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
    """
    Align basecalled reads to a reference with minimap2 or dorado, then sort
    and index the alignment with pysam.

    Parameters:
        fasta (str | Path): File path to the reference genome to align to.
        input (str | Path): File path to the basecalled BAM to align. With
            aligner='minimap2' the BAM is first converted to a temporary FASTQ
            written next to the input and removed after alignment.
        bam_suffix (str): The suffix to use for the output alignment files.
        output_directory (str | Path): Directory the alignment files are
            written into. It is not created by this function.
        make_bigwigs (bool): Accepted for interface compatibility; not used
            by this function.
        threads (int): Number of threads passed to the aligner ('-t') and to
            the pysam sort/index steps ('-@'). Omitted when falsy.
        aligner (str): Aligner to use: 'minimap2' or 'dorado'.
        aligner_args (list): Extra command-line arguments for the aligner.
            The default list is never mutated.

    Returns:
        None
        The function writes: 1) the aligner output as
        '<output_directory>/<name>_aligned<bam_suffix>', 2) the sorted
        alignment as '<name>_aligned_sorted<bam_suffix>', and 3) an index for
        the sorted alignment. If the aligner is not recognized a message is
        printed and nothing is written.

    Raises:
        subprocess.CalledProcessError: If the aligner exits with a non-zero
            status.
    """
    if aligner not in ('minimap2', 'dorado'):
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # Accept both str and Path inputs; the path arithmetic below needs Path.
    input_path = Path(input)
    output_directory = Path(output_directory)
    aligner_args = list(aligner_args)  # work on a copy of the (shared) default

    input_as_fastq = input_path.with_name(input_path.stem + '.fastq')

    # File naming is kept exactly as before so callers that rebuild these
    # paths keep finding the outputs.
    output_path_minus_suffix = output_directory / input_path.stem
    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    if threads:
        threads = str(threads)  # subprocess arguments must be strings

    if aligner == 'minimap2':
        print(f"Converting BAM to FASTQ: {input_path}")
        _bam_to_fastq_with_pysam(input_path, input_as_fastq)
        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
        minimap_command = ['minimap2'] + aligner_args
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(input_as_fastq)]
        try:
            # Close the output handle deterministically and fail loudly if the
            # aligner fails, rather than sorting a truncated file afterwards.
            with open(aligned_output, "w") as aligned_handle:
                subprocess.run(minimap_command, stdout=aligned_handle, check=True)
        finally:
            # The FASTQ is only an intermediate; remove it even on failure.
            os.remove(input_as_fastq)
    else:
        # aligner == 'dorado': dorado aligns the BAM directly.
        print(f"Aligning BAM to Reference: {input_path}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += aligner_args + [str(fasta), str(input_path)]
        with open(aligned_output, "wb") as aligned_handle:
            subprocess.run(alignment_command, stdout=aligned_handle, check=True)

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)
|