smftools 0.1.6-py3-none-any.whl → 0.2.1-py3-none-any.whl
This diff compares the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0

@@ -0,0 +1,184 @@
+# ------------------------- Utilities -------------------------
+def random_fill_nans(X):
+    import numpy as np
+    nan_mask = np.isnan(X)
+    X[nan_mask] = np.random.rand(*X[nan_mask].shape)
+    return X
+
+def calculate_row_entropy(
+    adata,
+    layer,
+    output_key="entropy",
+    site_config=None,
+    ref_col="Reference_strand",
+    encoding="signed",
+    max_threads=None):
+    """
+    Adds an obs column to the adata that calculates entropy within each read from a given layer
+    when looking at each site type passed in the site_config list.
+
+    Parameters:
+        adata (AnnData): The annotated data matrix.
+        layer (str): Name of the layer to use for entropy calculation.
+        method (str): Unused currently. Placeholder for potential future methods.
+        output_key (str): Base name for the entropy column in adata.obs.
+        site_config (dict): {ref: [site_types]} for masking relevant sites.
+        ref_col (str): Column in adata.obs denoting reference strands.
+        encoding (str): 'signed' (1/-1/0) or 'binary' (1/0/NaN).
+        max_threads (int): Number of threads for parallel processing.
+    """
+    import numpy as np
+    import pandas as pd
+    from scipy.stats import entropy
+    from joblib import Parallel, delayed
+    from tqdm import tqdm
+
+    entropy_values = []
+    row_indices = []
+
+    for ref in adata.obs[ref_col].cat.categories:
+        subset = adata[adata.obs[ref_col] == ref].copy()
+        if subset.shape[0] == 0:
+            continue
+
+        if site_config and ref in site_config:
+            site_mask = np.zeros(subset.shape[1], dtype=bool)
+            for site in site_config[ref]:
+                site_mask |= subset.var[f"{ref}_{site}"]
+            subset = subset[:, site_mask].copy()
+
+        X = subset.layers[layer].copy()
+
+        if encoding == "signed":
+            X_bin = np.where(X == 1, 1, np.where(X == -1, 0, np.nan))
+        else:
+            X_bin = np.where(X == 1, 1, np.where(X == 0, 0, np.nan))
+
+        def compute_entropy(row):
+            counts = pd.Series(row).value_counts(dropna=True).sort_index()
+            probs = counts / counts.sum()
+            return entropy(probs, base=2)
+
+        entropies = Parallel(n_jobs=max_threads)(
+            delayed(compute_entropy)(X_bin[i, :]) for i in tqdm(range(X_bin.shape[0]), desc=f"Entropy: {ref}")
+        )
+
+        entropy_values.extend(entropies)
+        row_indices.extend(subset.obs_names.tolist())
+
+    entropy_key = f"{output_key}_entropy"
+    adata.obs.loc[row_indices, entropy_key] = entropy_values
+
+def binary_autocorrelation_with_spacing(row, positions, max_lag=1000, assume_sorted=True):
+    """
+    Fast autocorrelation over real genomic spacing.
+    Uses a sliding window + bincount to aggregate per-lag products.
+
+    Parameters
+    ----------
+    row : 1D array (float)
+        Values per position (NaN = missing). Works for binary or real-valued.
+    positions : 1D array (int)
+        Genomic coordinates for each column of `row`.
+    max_lag : int
+        Max genomic lag (inclusive).
+    assume_sorted : bool
+        If True, assumes `positions` are strictly non-decreasing.
+
+    Returns
+    -------
+    autocorr : 1D array, shape (max_lag+1,)
+        Normalized autocorrelation; autocorr[0] = 1.0.
+        Lags with no valid pairs are NaN.
+    """
+    import numpy as np
+
+    # mask valid entries
+    valid = ~np.isnan(row)
+    if valid.sum() < 2:
+        return np.full(max_lag + 1, np.nan, dtype=np.float32)
+
+    x = row[valid].astype(np.float64, copy=False)
+    pos = positions[valid].astype(np.int64, copy=False)
+
+    # sort by position if needed
+    if not assume_sorted:
+        order = np.argsort(pos, kind="mergesort")
+        pos = pos[order]
+        x = x[order]
+
+    n = x.size
+    x_mean = x.mean()
+    xc = x - x_mean
+    var = np.sum(xc * xc)
+    if var == 0.0:
+        return np.full(max_lag + 1, np.nan, dtype=np.float32)
+
+    lag_sums = np.zeros(max_lag + 1, dtype=np.float64)
+    lag_counts = np.zeros(max_lag + 1, dtype=np.int64)
+
+    # sliding window upper pointer
+    j = 1
+    for i in range(n - 1):
+        # advance j to include all positions within max_lag
+        while j < n and pos[j] - pos[i] <= max_lag:
+            j += 1
+        # consider pairs (i, i+1...j-1)
+        if j - i > 1:
+            diffs = pos[i+1:j] - pos[i]  # 1..max_lag
+            contrib = xc[i] * xc[i+1:j]  # contributions for each pair
+            # accumulate weighted sums and counts per lag
+            lag_sums[:max_lag+1] += np.bincount(diffs, weights=contrib,
+                                                minlength=max_lag+1)[:max_lag+1]
+            lag_counts[:max_lag+1] += np.bincount(diffs,
+                                                  minlength=max_lag+1)[:max_lag+1]
+
+    autocorr = np.full(max_lag + 1, np.nan, dtype=np.float64)
+    nz = lag_counts > 0
+    autocorr[nz] = lag_sums[nz] / var
+    autocorr[0] = 1.0  # by definition
+
+    return autocorr.astype(np.float32, copy=False)
+
+# def binary_autocorrelation_with_spacing(row, positions, max_lag=1000):
+#     """
+#     Compute autocorrelation within a read using real genomic spacing from `positions`.
+#     Only valid (non-NaN) positions are considered.
+#     Output is indexed by genomic lag (up to max_lag).
+#     """
+#     from collections import defaultdict
+#     import numpy as np
+#     # Get valid positions and values
+#     valid_mask = ~np.isnan(row)
+#     x = row[valid_mask]
+#     pos = positions[valid_mask]
+#     n = len(x)
+
+#     if n < 2:
+#         return np.full(max_lag + 1, np.nan)
+
+#     x_mean = x.mean()
+#     var = np.sum((x - x_mean)**2)
+#     if var == 0:
+#         return np.full(max_lag + 1, np.nan)
+
+#     # Collect values by lag
+#     lag_sums = defaultdict(float)
+#     lag_counts = defaultdict(int)
+
+#     for i in range(n):
+#         for j in range(i + 1, n):
+#             lag = abs(pos[j] - pos[i])
+#             if lag > max_lag:
+#                 continue
+#             product = (x[i] - x_mean) * (x[j] - x_mean)
+#             lag_sums[lag] += product
+#             lag_counts[lag] += 1
+
+#     # Normalize to get autocorrelation
+#     autocorr = np.full(max_lag + 1, np.nan)
+#     for lag in range(max_lag + 1):
+#         if lag_counts[lag] > 0:
+#             autocorr[lag] = lag_sums[lag] / var
+
+#     return autocorr
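
Usage sketch (editor's illustration, not part of the wheel): the hunk above defines random_fill_nans, calculate_row_entropy, and binary_autocorrelation_with_spacing but does not show which module they live in, so the example below assumes the three functions are already in scope (e.g., pasted into a script). The layer name "binary_GpC", the reference label "chr1_top", and the var flag "chr1_top_GpC_site" are invented toy names; the flag only has to follow the f"{ref}_{site}" pattern that calculate_row_entropy looks up in adata.var.

import numpy as np
import pandas as pd
import anndata as ad

# Toy AnnData: 5 reads x 6 genomic positions, "signed" encoding (1 / -1, with 0 = missing).
rng = np.random.default_rng(0)
positions = np.array([10, 15, 22, 40, 41, 90])
X = rng.choice([1.0, -1.0, 0.0], size=(5, positions.size))
adata = ad.AnnData(X=X)
adata.layers["binary_GpC"] = X.copy()                              # hypothetical layer name
adata.obs["Reference_strand"] = pd.Categorical(["chr1_top"] * 5)   # hypothetical reference label
adata.var_names = positions.astype(str)
adata.var["chr1_top_GpC_site"] = [True, False, True, True, False, True]  # f"{ref}_{site}" flag

# Per-read entropy over the flagged sites; writes adata.obs["GpC_entropy"]
# (the output column is named f"{output_key}_entropy").
calculate_row_entropy(
    adata,
    layer="binary_GpC",
    output_key="GpC",
    site_config={"chr1_top": ["GpC_site"]},
    encoding="signed",
    max_threads=2,
)
print(adata.obs["GpC_entropy"])

# Per-read autocorrelation over real genomic spacing, for the first read.
row = adata.layers["binary_GpC"][0].astype(float)
row[row == 0] = np.nan            # in the signed encoding, 0 marks missing calls
acf = binary_autocorrelation_with_spacing(row, positions, max_lag=100)
print(acf[:10])                   # acf[0] == 1.0 unless the read has <2 valid sites or zero variance (then all NaN)

# random_fill_nans replaces NaNs with uniform random noise in place and returns the array.
filled = random_fill_nans(np.where(X == 0, np.nan, X))

Note that calculate_row_entropy mutates adata.obs in place and returns nothing, so it is called purely for its side effect; the autocorrelation helper, by contrast, is a pure function over a single read.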