smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,184 @@
1
+ # ------------------------- Utilities -------------------------
2
def random_fill_nans(X, rng=None):
    """
    Replace every NaN in `X` with a uniform random draw from [0, 1).

    The fill happens in place: `X` itself is modified and the same object is
    returned. Pass a copy if the original values must be preserved.

    Parameters:
        X (np.ndarray): Floating-point array that may contain NaN. Modified in place.
        rng (None | int | np.random.Generator): Source of randomness.
            None (default) draws from NumPy's global legacy state via
            `np.random.rand`, so `np.random.seed` controls the result.
            A seed or Generator gives a reproducible fill that leaves the
            global state untouched.

    Returns:
        np.ndarray: `X` itself, with every NaN replaced.
    """
    import numpy as np

    nan_mask = np.isnan(X)
    n_missing = int(np.count_nonzero(nan_mask))

    if rng is None:
        # Legacy path: identical draws to the historical implementation.
        fill = np.random.rand(n_missing)
    else:
        # default_rng passes an existing Generator through unchanged and
        # builds a new one from an integer seed.
        fill = np.random.default_rng(rng).random(n_missing)

    X[nan_mask] = fill
    return X
7
+
8
def calculate_row_entropy(
        adata,
        layer,
        output_key="entropy",
        site_config=None,
        ref_col="Reference_strand",
        encoding="signed",
        max_threads=None):
    """
    Compute a base-2 entropy value for every read (row) of a layer and store it in adata.obs.

    Reads are processed one reference at a time. For each read the values of
    `layer` are mapped to 1 / 0 / missing according to `encoding`, the relative
    frequencies of the non-missing states are computed, and their entropy is
    taken with `scipy.stats.entropy(..., base=2)`.

    Parameters:
        adata (AnnData): The annotated data matrix. Modified in place.
        layer (str): Name of the layer to use for entropy calculation.
        output_key (str): Prefix of the output column. Results are written to
            adata.obs[f"{output_key}_entropy"], so the default produces a
            column named "entropy_entropy".
        site_config (dict | None): {ref: [site_types]}. For a reference listed
            here, only columns flagged in adata.var[f"{ref}_{site}"] for at
            least one of its site types are used. References that are not
            listed (or a falsy site_config) use every column.
        ref_col (str): Column in adata.obs denoting reference strands. It must
            be categorical, because its `.cat.categories` are iterated.
        encoding (str): 'signed' maps 1 -> 1 and -1 -> 0; any other value is
            treated as 'binary' and maps 1 -> 1 and 0 -> 0. All remaining
            values are treated as missing.
        max_threads (int | None): Passed to joblib.Parallel as `n_jobs`.

    Returns:
        None. The entropy column is added to adata.obs.
    """
    import numpy as np
    import pandas as pd
    from scipy.stats import entropy
    from joblib import Parallel, delayed
    from tqdm import tqdm

    def compute_entropy(row):
        # Frequencies of the observed states; missing (NaN) entries are ignored.
        counts = pd.Series(row).value_counts(dropna=True).sort_index()
        probs = counts / counts.sum()
        return entropy(probs, base=2)

    # Layer value that is mapped to 0 under the chosen encoding.
    zero_value = -1 if encoding == "signed" else 0

    entropy_values = []
    row_indices = []

    for ref in adata.obs[ref_col].cat.categories:
        subset = adata[adata.obs[ref_col] == ref].copy()
        if subset.shape[0] == 0:
            continue

        if site_config and ref in site_config:
            # Union of the per-site boolean columns for this reference.
            site_mask = np.zeros(subset.shape[1], dtype=bool)
            for site in site_config[ref]:
                site_mask |= subset.var[f"{ref}_{site}"].to_numpy(dtype=bool)
            subset = subset[:, site_mask].copy()

        # Only read from the layer; np.where allocates its own output.
        X = subset.layers[layer]
        X_bin = np.where(X == 1, 1, np.where(X == zero_value, 0, np.nan))

        entropies = Parallel(n_jobs=max_threads)(
            delayed(compute_entropy)(X_bin[i, :])
            for i in tqdm(range(X_bin.shape[0]), desc=f"Entropy: {ref}")
        )

        entropy_values.extend(entropies)
        row_indices.extend(subset.obs_names.tolist())

    entropy_key = f"{output_key}_entropy"
    adata.obs.loc[row_indices, entropy_key] = entropy_values
71
+
72
def binary_autocorrelation_with_spacing(row, positions, max_lag=1000, assume_sorted=True):
    """
    Fast autocorrelation over real genomic spacing.
    Uses a sliding window + bincount to aggregate per-lag products.

    For each lag k the result is

        sum over pairs (i, j) with positions[j] - positions[i] == k
            of (x_i - mean) * (x_j - mean)
        divided by sum over all valid entries of (x - mean) ** 2

    The per-lag sum is normalised by the total sum of squares, not by the
    number of pairs at that lag.

    Parameters
    ----------
    row : 1D array-like (float)
        Values per position (NaN = missing). Works for binary or real-valued.
    positions : 1D array-like (int)
        Genomic coordinates for each column of `row`; same length as `row`.
    max_lag : int
        Max genomic lag (inclusive).
    assume_sorted : bool
        If True, `positions` must already be non-decreasing (sorted ascending,
        ties allowed). If False, the valid entries are sorted by position first.

    Returns
    -------
    autocorr : 1D float32 array, shape (max_lag+1,)
        Normalized autocorrelation. Lags with no valid pairs are NaN and
        autocorr[0] is set to 1.0. If fewer than two entries are valid, or the
        valid entries have zero variance, every element (including index 0)
        is NaN.
    """
    import numpy as np

    # Coerce once so lists and other array-likes are accepted, not just ndarrays.
    row = np.asarray(row, dtype=np.float64)
    positions = np.asarray(positions)

    # mask valid entries
    valid = ~np.isnan(row)
    if valid.sum() < 2:
        return np.full(max_lag + 1, np.nan, dtype=np.float32)

    x = row[valid]
    pos = positions[valid].astype(np.int64, copy=False)

    # sort by position if needed (stable, so ties keep their input order)
    if not assume_sorted:
        order = np.argsort(pos, kind="mergesort")
        pos = pos[order]
        x = x[order]

    n = x.size
    xc = x - x.mean()
    var = np.sum(xc * xc)
    if var == 0.0:
        # A constant read has no defined autocorrelation.
        return np.full(max_lag + 1, np.nan, dtype=np.float32)

    n_lags = max_lag + 1
    lag_sums = np.zeros(n_lags, dtype=np.float64)
    lag_counts = np.zeros(n_lags, dtype=np.int64)

    # `hi` is the exclusive upper end of the window; it only ever moves forward
    # because positions are sorted.
    hi = 1
    for lo in range(n - 1):
        # advance hi to include all positions within max_lag of pos[lo]
        while hi < n and pos[hi] - pos[lo] <= max_lag:
            hi += 1
        # consider pairs (lo, lo+1 ... hi-1)
        if hi - lo > 1:
            lags = pos[lo + 1:hi] - pos[lo]
            products = xc[lo] * xc[lo + 1:hi]
            # Truncate to n_lags so the shapes always match the accumulators.
            lag_sums += np.bincount(lags, weights=products, minlength=n_lags)[:n_lags]
            lag_counts += np.bincount(lags, minlength=n_lags)[:n_lags]

    autocorr = np.full(n_lags, np.nan, dtype=np.float64)
    observed = lag_counts > 0
    autocorr[observed] = lag_sums[observed] / var
    autocorr[0] = 1.0  # by definition

    return autocorr.astype(np.float32, copy=False)
142
+
143
+ # def binary_autocorrelation_with_spacing(row, positions, max_lag=1000):
144
+ # """
145
+ # Compute autocorrelation within a read using real genomic spacing from `positions`.
146
+ # Only valid (non-NaN) positions are considered.
147
+ # Output is indexed by genomic lag (up to max_lag).
148
+ # """
149
+ # from collections import defaultdict
150
+ # import numpy as np
151
+ # # Get valid positions and values
152
+ # valid_mask = ~np.isnan(row)
153
+ # x = row[valid_mask]
154
+ # pos = positions[valid_mask]
155
+ # n = len(x)
156
+
157
+ # if n < 2:
158
+ # return np.full(max_lag + 1, np.nan)
159
+
160
+ # x_mean = x.mean()
161
+ # var = np.sum((x - x_mean)**2)
162
+ # if var == 0:
163
+ # return np.full(max_lag + 1, np.nan)
164
+
165
+ # # Collect values by lag
166
+ # lag_sums = defaultdict(float)
167
+ # lag_counts = defaultdict(int)
168
+
169
+ # for i in range(n):
170
+ # for j in range(i + 1, n):
171
+ # lag = abs(pos[j] - pos[i])
172
+ # if lag > max_lag:
173
+ # continue
174
+ # product = (x[i] - x_mean) * (x[j] - x_mean)
175
+ # lag_sums[lag] += product
176
+ # lag_counts[lag] += 1
177
+
178
+ # # Normalize to get autocorrelation
179
+ # autocorr = np.full(max_lag + 1, np.nan)
180
+ # for lag in range(max_lag + 1):
181
+ # if lag_counts[lag] > 0:
182
+ # autocorr[lag] = lag_sums[lag] / var
183
+
184
+ # return autocorr