smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Public API of smftools.tools: re-export the user-facing helpers from their
# submodules so callers can write e.g. `from smftools.tools import calculate_umap`.
from .position_stats import calculate_relative_risk_on_activity, compute_positionwise_statistics
from .calculate_umap import calculate_umap
from .cluster_adata_on_methylation import cluster_adata_on_methylation
from .general_tools import create_nan_mask_from_X, combine_layers, create_nan_or_non_gpc_mask
from .read_stats import calculate_row_entropy
from .spatial_autocorrelation import *
from .subset_adata import subset_adata


# Names exported by `from smftools.tools import *`.
# NOTE(review): the wildcard import from .spatial_autocorrelation above binds
# names that are not listed here — confirm whether they should be in __all__.
__all__ = [
    "compute_positionwise_statistics",
    "calculate_row_entropy",
    "calculate_umap",
    "calculate_relative_risk_on_activity",
    "cluster_adata_on_methylation",
    "create_nan_mask_from_X",
    "create_nan_or_non_gpc_mask",
    "combine_layers",
    "subset_adata",
]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import torch
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
|
|
6
|
+
def apply_hmm(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
    """
    Applies an HMM model to an AnnData object using tensor-based sequence inputs.
    If multiple methbases are passed, generates a combined feature set.

    Modifies `adata` in place: for every feature label it adds an obs column of
    [start, length, prob] intervals, an `n_<feature>` count column, a
    `<feature>_distances` column, and a binary `adata.layers[<feature>]` matrix.

    Parameters:
        adata: AnnData object (mutated in place).
        model: trained HMM exposing .predict / .predict_proba on torch tensors
            (presumably a pomegranate-style model — TODO confirm).
        obs_column (str): categorical obs column giving each read's reference.
        layer (str | None): layer holding binarized calls; falls back to .X.
        footprints (bool): annotate non-methylated "footprint" features.
        accessible_patches (bool): annotate methylated "accessible" features.
        cpg (bool): annotate CpG features (handled in a separate pass).
        methbases (list[str]): which base contexts to process; each entry must
            lower-case to "a", "gpc", or "cpg" or a KeyError is raised.
            NOTE(review): mutable default argument — harmless here since it is
            never mutated, but worth replacing with None in a future change.
        device (str): torch device used for HMM inference.
        threshold (float): minimum mean state probability for an interval to be
            counted during binarization and distance calculation.
    """
    model.to(device)

    # --- Feature Definitions ---
    # Each feature set maps a label to a [min_bp, max_bp) genomic-span range and
    # names the HMM state whose runs should be classified.
    feature_sets = {}
    if footprints:
        feature_sets["footprint"] = {
            "features": {
                "small_bound_stretch": [0, 30],
                "medium_bound_stretch": [30, 80],
                "putative_nucleosome": [80, 200],
                "large_bound_stretch": [200, np.inf]
            },
            "state": "Non-Methylated"
        }
    if accessible_patches:
        feature_sets["accessible"] = {
            "features": {
                "small_accessible_patch": [0, 30],
                "mid_accessible_patch": [30, 80],
                "large_accessible_patch": [80, np.inf]
            },
            "state": "Methylated"
        }
    if cpg:
        feature_sets["cpg"] = {
            "features": {
                "cpg_patch": [0, np.inf]
            },
            "state": "Methylated"
        }

    # --- Init columns ---
    # Build the full list of per-methbase, per-feature column names, plus a
    # "Combined" prefix for the union of all methbase positions.
    all_features = []
    combined_prefix = "Combined"
    for key, fs in feature_sets.items():
        if key == 'cpg':
            all_features += [f"CpG_{f}" for f in fs["features"]]
            all_features.append(f"CpG_all_{key}_features")
        else:
            for methbase in methbases:
                all_features += [f"{methbase}_{f}" for f in fs["features"]]
                all_features.append(f"{methbase}_all_{key}_features")
            all_features += [f"{combined_prefix}_{f}" for f in fs["features"]]
            all_features.append(f"{combined_prefix}_all_{key}_features")

    for feature in all_features:
        # One independent empty list per row (a shared [] would alias rows).
        adata.obs[feature] = pd.Series([[] for _ in range(adata.shape[0])], dtype=object, index=adata.obs.index)
        # NOTE(review): this Series is built without an explicit index, so it
        # aligns positionally only if adata.obs has a default RangeIndex —
        # confirm; it is overwritten in the finalization pass below anyway.
        adata.obs[f"{feature}_distances"] = pd.Series([None] * adata.shape[0])
        adata.obs[f"n_{feature}"] = -1  # -1 marks "not yet computed"

    # --- Main loop ---
    references = adata.obs[obs_column].cat.categories

    for ref in tqdm(references, desc="Processing References"):
        ref_subset = adata[adata.obs[obs_column] == ref]

        # Create combined mask for methbases
        combined_mask = None
        for methbase in methbases:
            # Per-reference var columns select the positions for this context.
            mask = {
                "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
                "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
                "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
            }[methbase.lower()]
            combined_mask = mask if combined_mask is None else combined_mask | mask

            methbase_subset = ref_subset[:, mask]
            matrix = methbase_subset.layers[layer] if layer else methbase_subset.X

            for i, raw_read in enumerate(matrix):
                # NaN calls are imputed with a random 0/1 draw so the HMM sees
                # a complete binary sequence (non-deterministic by design).
                read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
                # Shape [1, seq_len, 1] as expected by the model.
                tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
                coords = methbase_subset.var_names

                for key, fs in feature_sets.items():
                    if key == 'cpg':
                        # CpG gets its own pass below on CpG-only positions.
                        continue
                    state_target = fs["state"]
                    feature_map = fs["features"]

                    classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                    idx = methbase_subset.obs.index[i]

                    for start, length, label, prob in classifications:
                        adata.obs.at[idx, f"{methbase}_{label}"].append([start, length, prob])
                        adata.obs.at[idx, f"{methbase}_all_{key}_features"].append([start, length, prob])

        # Combined methbase subset: rerun the HMM on the union of all
        # methbase positions for this reference.
        combined_subset = ref_subset[:, combined_mask]
        combined_matrix = combined_subset.layers[layer] if layer else combined_subset.X

        for i, raw_read in enumerate(combined_matrix):
            read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
            tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
            coords = combined_subset.var_names

            for key, fs in feature_sets.items():
                if key == 'cpg':
                    continue
                state_target = fs["state"]
                feature_map = fs["features"]

                classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                idx = combined_subset.obs.index[i]

                for start, length, label, prob in classifications:
                    adata.obs.at[idx, f"{combined_prefix}_{label}"].append([start, length, prob])
                    adata.obs.at[idx, f"{combined_prefix}_all_{key}_features"].append([start, length, prob])

    # --- Special handling for CpG ---
    # Separate pass restricted to CpG sites only.
    if cpg:
        for ref in tqdm(references, desc="Processing CpG"):
            ref_subset = adata[adata.obs[obs_column] == ref]
            mask = (ref_subset.var[f"{ref}_CpG_site"] == True)
            cpg_subset = ref_subset[:, mask]
            matrix = cpg_subset.layers[layer] if layer else cpg_subset.X

            for i, raw_read in enumerate(matrix):
                read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
                tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
                coords = cpg_subset.var_names
                fs = feature_sets['cpg']
                state_target = fs["state"]
                feature_map = fs["features"]
                classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                idx = cpg_subset.obs.index[i]
                for start, length, label, prob in classifications:
                    adata.obs.at[idx, f"CpG_{label}"].append([start, length, prob])
                    adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])

    # --- Binarization + Distance ---
    # Turn the interval lists into binary layers, per-read counts, and
    # inter-feature distance lists, keeping only intervals above threshold.
    for feature in tqdm(all_features, desc="Finalizing Layers"):
        bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
        counts = np.zeros(adata.shape[0], dtype=int)
        for row_idx, intervals in enumerate(adata.obs[feature]):
            if not isinstance(intervals, list):
                intervals = []
            for start, length, prob in intervals:
                if prob > threshold:
                    # NOTE(review): `start` here is a 1-based genomic
                    # coordinate from classify_features, used directly as a
                    # column index into the layer — confirm the intended
                    # coordinate convention.
                    bin_matrix[row_idx, start:start+length] = 1
                    counts[row_idx] += 1
        adata.layers[f"{feature}"] = bin_matrix
        adata.obs[f"n_{feature}"] = counts
        adata.obs[f"{feature}_distances"] = adata.obs[feature].apply(lambda x: calculate_distances(x, threshold))
|
156
|
+
def calculate_distances(intervals, threshold=0.9):
    """Calculates distances between consecutive features in a read.

    Parameters:
        intervals: list/tuple of (start, length, prob) triples, or None/NaN
            for rows that were never populated.
        threshold (float): only intervals with prob strictly greater than this
            value are kept.

    Returns:
        list[int]: gap between the end of each kept interval (start + length)
        and the start of the next, after sorting by start. Empty when fewer
        than two intervals survive the threshold, or when the input is not a
        list/tuple.
    """
    # Robustness: the *_distances columns are initialized to None elsewhere, so
    # treat non-list input as "no features" instead of raising.
    if not isinstance(intervals, (list, tuple)):
        return []
    kept = sorted((iv for iv in intervals if iv[2] > threshold), key=lambda iv: iv[0])
    return [kept[i + 1][0] - (kept[i][0] + kept[i][1]) for i in range(len(kept) - 1)]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def classify_features(sequence, model, coordinates, classification_mapping=None, target_state="Methylated"):
    """
    Classifies regions based on HMM state.

    Runs the HMM over one read, collapses consecutive positions predicted as
    `target_state` into runs, converts each run to a genomic span, and labels
    it by span length via `classification_mapping`.

    Parameters:
        sequence (torch.Tensor): Tensor of binarized data [batch_size, seq_len, 1]
        model: Trained pomegranate HMM (exposes .predict / .predict_proba)
        coordinates (list): Genomic coordinates (int-parseable) for sequence positions
        classification_mapping (dict | None): Maps label -> [min_bp, max_bp) span
            range; a run whose genomic length falls in the range gets that label,
            otherwise the label defaults to `target_state`. None means empty mapping.
        target_state (str): The state to classify ("Methylated" or "Non-Methylated")

    Returns:
        list of (start_coord, length_bp, label, mean_prob) tuples, where
        start_coord is the 1-based genomic start of the run.
    """
    # Avoid the shared mutable-default-argument pitfall (was `={}`); behavior
    # for callers that omit the argument is unchanged.
    if classification_mapping is None:
        classification_mapping = {}

    predicted_states = model.predict(sequence).squeeze(-1).squeeze(0).cpu().numpy()
    probabilities = model.predict_proba(sequence).squeeze(0).cpu().numpy()
    # State index 0 is assumed to be "Non-Methylated" — must match the order
    # the HMM states were defined in at training time.
    state_labels = ["Non-Methylated", "Methylated"]

    classifications, current_start, current_length, current_probs = [], None, 0, []

    # Collapse the per-position state path into (start_idx, run_len, mean_prob) runs.
    for i, state_index in enumerate(predicted_states):
        state_name = state_labels[state_index]
        state_prob = probabilities[i][state_index]

        if state_name == target_state:
            if current_start is None:
                current_start = i
            current_length += 1
            current_probs.append(state_prob)
        elif current_start is not None:
            classifications.append((current_start, current_length, np.mean(current_probs)))
            current_start, current_length, current_probs = None, 0, []

    # Flush a run that extends to the end of the read.
    if current_start is not None:
        classifications.append((current_start, current_length, np.mean(current_probs)))

    # Convert index runs to genomic spans and attach a size-based label.
    final = []
    for start, length, prob in classifications:
        feature_length = int(coordinates[start + length - 1]) - int(coordinates[start]) + 1
        label = next((ftype for ftype, rng in classification_mapping.items() if rng[0] <= feature_length < rng[1]), target_state)
        final.append((int(coordinates[start]) + 1, feature_length, label, prob))
    return final
|