smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,20 @@
1
# Public API of smftools.tools: re-export user-facing helpers from their
# implementation modules so callers can use `from smftools.tools import ...`.
from .position_stats import calculate_relative_risk_on_activity, compute_positionwise_statistics
from .calculate_umap import calculate_umap
from .cluster_adata_on_methylation import cluster_adata_on_methylation
from .general_tools import create_nan_mask_from_X, combine_layers, create_nan_or_non_gpc_mask
from .read_stats import calculate_row_entropy
from .spatial_autocorrelation import *  # NOTE(review): star import — exported names not visible here
from .subset_adata import subset_adata


# NOTE(review): __all__ lists only the explicit imports above; names pulled in
# by the star import are not re-declared here.
__all__ = [
    "compute_positionwise_statistics",
    "calculate_row_entropy",
    "calculate_umap",
    "calculate_relative_risk_on_activity",
    "cluster_adata_on_methylation",
    "create_nan_mask_from_X",
    "create_nan_or_non_gpc_mask",
    "combine_layers",
    "subset_adata",
]
@@ -0,0 +1,202 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ from tqdm import tqdm
5
+
6
def apply_hmm(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
    """
    Applies an HMM model to an AnnData object using tensor-based sequence inputs.
    If multiple methbases are passed, generates a combined feature set.

    Parameters:
        adata: AnnData with per-read binarized methylation calls in .X (or the
            named layer) and per-reference site annotations in .var
            (e.g. "<ref>_GpC_site", "<ref>_CpG_site", "<ref>_strand_FASTA_base").
        model: trained HMM exposing .to(device), .predict and .predict_proba
            on torch tensors (pomegranate-style).
        obs_column (str): categorical .obs column holding each read's reference.
        layer (str | None): layer to read calls from; falls back to .X when None.
        footprints (bool): classify non-methylated runs into footprint size bins.
        accessible_patches (bool): classify methylated runs into accessible-patch bins.
        cpg (bool): additionally classify CpG-site patches in a separate pass.
        methbases (list[str]): methylation contexts to analyze.
            NOTE(review): mutable default argument — only read here, never
            mutated, so it is shared-but-safe; confirm before changing.
        device (str): torch device used for HMM inference.
        threshold (float): minimum mean state probability for a feature to count
            toward the binarized layers and feature counts.

    Side effects:
        Mutates `adata` in place: per-feature .obs interval lists, counts and
        distance lists, plus one binary matrix per feature in .layers.
        Returns None.
    """
    model.to(device)

    # --- Feature Definitions ---
    # Each feature set maps size bins (genomic bp, [lo, hi)) to labels, and
    # names the HMM state whose runs are classified into those bins.
    feature_sets = {}
    if footprints:
        feature_sets["footprint"] = {
            "features": {
                "small_bound_stretch": [0, 30],
                "medium_bound_stretch": [30, 80],
                "putative_nucleosome": [80, 200],
                "large_bound_stretch": [200, np.inf]
            },
            "state": "Non-Methylated"
        }
    if accessible_patches:
        feature_sets["accessible"] = {
            "features": {
                "small_accessible_patch": [0, 30],
                "mid_accessible_patch": [30, 80],
                "large_accessible_patch": [80, np.inf]
            },
            "state": "Methylated"
        }
    if cpg:
        feature_sets["cpg"] = {
            "features": {
                "cpg_patch": [0, np.inf]
            },
            "state": "Methylated"
        }

    # --- Init columns ---
    # Build the full list of output column names: one per
    # (methbase | "Combined" | "CpG") x feature-label combination, plus an
    # "all_<set>_features" aggregate per prefix.
    all_features = []
    combined_prefix = "Combined"
    for key, fs in feature_sets.items():
        if key == 'cpg':
            all_features += [f"CpG_{f}" for f in fs["features"]]
            all_features.append(f"CpG_all_{key}_features")
        else:
            for methbase in methbases:
                all_features += [f"{methbase}_{f}" for f in fs["features"]]
                all_features.append(f"{methbase}_all_{key}_features")
            all_features += [f"{combined_prefix}_{f}" for f in fs["features"]]
            all_features.append(f"{combined_prefix}_all_{key}_features")

    for feature in all_features:
        # One empty interval list per read; appended to via .at during the main loop.
        adata.obs[feature] = pd.Series([[] for _ in range(adata.shape[0])], dtype=object, index=adata.obs.index)
        # NOTE(review): unlike the line above, no index= is passed here —
        # confirm this aligns with adata.obs.index rather than a RangeIndex.
        adata.obs[f"{feature}_distances"] = pd.Series([None] * adata.shape[0])
        adata.obs[f"n_{feature}"] = -1

    # --- Main loop ---
    references = adata.obs[obs_column].cat.categories

    for ref in tqdm(references, desc="Processing References"):
        ref_subset = adata[adata.obs[obs_column] == ref]

        # Create combined mask for methbases
        combined_mask = None
        for methbase in methbases:
            # Site mask for this methbase on this reference, chosen from .var.
            mask = {
                "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
                "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
                "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
            }[methbase.lower()]
            # Union of all methbase masks, consumed after this loop.
            combined_mask = mask if combined_mask is None else combined_mask | mask

            methbase_subset = ref_subset[:, mask]
            matrix = methbase_subset.layers[layer] if layer else methbase_subset.X

            for i, raw_read in enumerate(matrix):
                # NaN calls are imputed with a random 0/1 so the HMM sees a full
                # sequence (non-deterministic across runs).
                read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
                # Shape [1, seq_len, 1] as expected by the HMM.
                tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
                coords = methbase_subset.var_names

                for key, fs in feature_sets.items():
                    if key == 'cpg':
                        continue  # CpG gets its own pass below
                    state_target = fs["state"]
                    feature_map = fs["features"]

                    classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                    idx = methbase_subset.obs.index[i]

                    for start, length, label, prob in classifications:
                        adata.obs.at[idx, f"{methbase}_{label}"].append([start, length, prob])
                        adata.obs.at[idx, f"{methbase}_all_{key}_features"].append([start, length, prob])

        # Combined methbase subset: same classification over the union of all
        # methbase site masks, recorded under the "Combined" prefix.
        combined_subset = ref_subset[:, combined_mask]
        combined_matrix = combined_subset.layers[layer] if layer else combined_subset.X

        for i, raw_read in enumerate(combined_matrix):
            read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
            tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
            coords = combined_subset.var_names

            for key, fs in feature_sets.items():
                if key == 'cpg':
                    continue
                state_target = fs["state"]
                feature_map = fs["features"]

                classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                idx = combined_subset.obs.index[i]

                for start, length, label, prob in classifications:
                    adata.obs.at[idx, f"{combined_prefix}_{label}"].append([start, length, prob])
                    adata.obs.at[idx, f"{combined_prefix}_all_{key}_features"].append([start, length, prob])

    # --- Special handling for CpG ---
    # Separate pass restricted to CpG sites only.
    if cpg:
        for ref in tqdm(references, desc="Processing CpG"):
            ref_subset = adata[adata.obs[obs_column] == ref]
            mask = (ref_subset.var[f"{ref}_CpG_site"] == True)
            cpg_subset = ref_subset[:, mask]
            matrix = cpg_subset.layers[layer] if layer else cpg_subset.X

            for i, raw_read in enumerate(matrix):
                read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
                tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
                coords = cpg_subset.var_names
                fs = feature_sets['cpg']
                state_target = fs["state"]
                feature_map = fs["features"]
                classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
                idx = cpg_subset.obs.index[i]
                for start, length, label, prob in classifications:
                    adata.obs.at[idx, f"CpG_{label}"].append([start, length, prob])
                    adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])

    # --- Binarization + Distance ---
    # Turn each interval list into a per-read binary mask layer, a feature
    # count, and a list of gaps between consecutive confident features.
    for feature in tqdm(all_features, desc="Finalizing Layers"):
        bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
        counts = np.zeros(adata.shape[0], dtype=int)
        for row_idx, intervals in enumerate(adata.obs[feature]):
            if not isinstance(intervals, list):
                intervals = []
            for start, length, prob in intervals:
                if prob > threshold:
                    # NOTE(review): `start` is a genomic coordinate (+1) from
                    # classify_features, but here it indexes adata's column
                    # axis — confirm var positions are 0-based genomic coords.
                    bin_matrix[row_idx, start:start+length] = 1
                    counts[row_idx] += 1
        adata.layers[f"{feature}"] = bin_matrix
        adata.obs[f"n_{feature}"] = counts
        adata.obs[f"{feature}_distances"] = adata.obs[feature].apply(lambda x: calculate_distances(x, threshold))
+
156
def calculate_distances(intervals, threshold=0.9):
    """Return gaps between consecutive confident features in a read.

    Intervals are (start, length, prob) triples; only those with prob strictly
    above `threshold` are kept. After ordering by start, each gap is
    next_start - (start + length) for every adjacent pair.
    """
    confident = sorted((iv for iv in intervals if iv[2] > threshold), key=lambda iv: iv[0])
    return [nxt[0] - (cur[0] + cur[1]) for cur, nxt in zip(confident, confident[1:])]
+
163
+
164
def classify_features(sequence, model, coordinates, classification_mapping=None, target_state="Methylated"):
    """
    Classifies contiguous runs of an HMM state into labeled genomic features.

    Parameters:
        sequence (torch.Tensor): Tensor of binarized data [batch_size, seq_len, 1]
        model: Trained pomegranate HMM exposing .predict and .predict_proba
        coordinates (list): Genomic coordinates (int-parseable labels) for sequence
        classification_mapping (dict | None): Maps feature label -> [lo, hi) genomic
            length range. Runs whose length matches no range fall back to
            `target_state` as their label. None is treated as an empty mapping.
            (Was a mutable default `{}`; only read, but fixed to the None idiom.)
        target_state (str): The state to classify ("Methylated" or "Non-Methylated")

    Returns:
        list[tuple]: one (genomic_start + 1, genomic_length, label, mean_prob)
        entry per maximal run of `target_state`.
    """
    if classification_mapping is None:
        classification_mapping = {}

    predicted_states = model.predict(sequence).squeeze(-1).squeeze(0).cpu().numpy()
    probabilities = model.predict_proba(sequence).squeeze(0).cpu().numpy()
    # Index 0/1 of the HMM's states; assumes state 0 = non-methylated.
    state_labels = ["Non-Methylated", "Methylated"]

    classifications, current_start, current_length, current_probs = [], None, 0, []

    for i, state_index in enumerate(predicted_states):
        state_name = state_labels[state_index]
        state_prob = probabilities[i][state_index]

        if state_name == target_state:
            if current_start is None:
                current_start = i  # run begins at this index
            current_length += 1
            current_probs.append(state_prob)
        elif current_start is not None:
            # Run ended: record (start index, run length, mean state probability).
            classifications.append((current_start, current_length, np.mean(current_probs)))
            current_start, current_length, current_probs = None, 0, []

    # Flush a run that extends to the end of the sequence.
    if current_start is not None:
        classifications.append((current_start, current_length, np.mean(current_probs)))

    final = []
    for start, length, prob in classifications:
        # Convert the index run to a genomic span via the coordinate labels
        # (inclusive of both endpoints).
        feature_length = int(coordinates[start + length - 1]) - int(coordinates[start]) + 1
        label = next((ftype for ftype, rng in classification_mapping.items() if rng[0] <= feature_length < rng[1]), target_state)
        final.append((int(coordinates[start]) + 1, feature_length, label, prob))
    return final