smftools-0.1.6-py3-none-any.whl → smftools-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/append_binary_layer_by_base_context.py
@@ -0,0 +1,143 @@
+import numpy as np
+import scipy.sparse as sp
+
+def append_binary_layer_by_base_context(
+    adata,
+    reference_column: str,
+    smf_modality: str = "conversion",
+    verbose: bool = True,
+    uns_flag: str = "binary_layers_by_base_context_added",
+    bypass: bool = False,
+    force_redo: bool = False
+):
+    """
+    Build per-reference C/G-site masked layers:
+      - GpC_site_binary
+      - CpG_site_binary
+      - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
+      - any_C_site_binary
+      - other_C_site_binary
+
+    Behavior:
+      - If X is sparse, it is converted to dense for these layers (the original adata.X is untouched).
+      - Missing var columns are warned about but do not crash.
+      - Masked positions are filled with np.nan to make masked vs unmasked explicit.
+      - Requires append_base_context to be run first.
+    """
+
+    # Only run if not already performed
+    already = bool(adata.uns.get(uns_flag, False))
+    if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+        # Already done, bypassed, or base-context prerequisite missing; nothing to do
+        return adata
+
+    # Check inputs
+    if reference_column not in adata.obs.columns:
+        raise KeyError(f"reference_column '{reference_column}' not found in adata.obs")
+
+    # Modality flag (kept for potential downstream use)
+    if smf_modality != "direct":
+        if smf_modality == "conversion":
+            deaminase = False
+        else:
+            deaminase = True
+    else:
+        deaminase = None  # unused but preserved
+
+    # Expected per-reference var column names
+    references = adata.obs[reference_column].astype("category").cat.categories
+    reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
+    reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
+    reference_to_c_column = {ref: f"{ref}_any_C_site" for ref in references}
+    reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
+
+    # Verify var columns exist and build boolean masks per ref (len = n_vars)
+    n_obs, n_vars = adata.shape
+    def _col_mask_or_warn(colname):
+        if colname not in adata.var.columns:
+            if verbose:
+                print(f"Warning: var column '{colname}' not found; treating as all-False mask.")
+            return np.zeros(n_vars, dtype=bool)
+        vals = adata.var[colname].values
+        # Coerce truthiness
+        try:
+            return vals.astype(bool)
+        except Exception:
+            return np.array([bool(v) for v in vals], dtype=bool)
+
+    gpc_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_gpc_column.items()}
+    cpg_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_cpg_column.items()}
+    c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_c_column.items()}
+    other_c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_other_c_column.items()}
+
+    # Prepare X as dense float32 for layer filling (adata.X is left untouched)
+    X = adata.X
+    if sp.issparse(X):
+        if verbose:
+            print("Converting sparse X to dense array for layer construction (temporary).")
+        X = X.toarray()
+    X = np.asarray(X, dtype=np.float32)
+
+    # Initialize masked arrays filled with NaN
+    masked_gpc = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+    masked_cpg = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+    masked_any_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+    masked_other_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+
+    # Fill row-blocks per reference (this avoids creating a full row×var boolean mask)
+    obs_ref_series = adata.obs[reference_column]
+    for ref in references:
+        rows_mask = (obs_ref_series.values == ref)
+        if not rows_mask.any():
+            continue
+        row_idx = np.nonzero(rows_mask)[0]  # integer indices of rows for this ref
+
+        # Column masks for this ref
+        gpc_cols = gpc_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+        cpg_cols = cpg_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+        c_cols = c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+        other_c_cols = other_c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+
+        if gpc_cols.any():
+            # Assign only the submatrix (rows x selected cols)
+            masked_gpc[np.ix_(row_idx, gpc_cols)] = X[np.ix_(row_idx, gpc_cols)]
+        if cpg_cols.any():
+            masked_cpg[np.ix_(row_idx, cpg_cols)] = X[np.ix_(row_idx, cpg_cols)]
+        if c_cols.any():
+            masked_any_c[np.ix_(row_idx, c_cols)] = X[np.ix_(row_idx, c_cols)]
+        if other_c_cols.any():
+            masked_other_c[np.ix_(row_idx, other_c_cols)] = X[np.ix_(row_idx, other_c_cols)]
+
+    # Build combined layer:
+    # - numeric sum: sum where either exists, NaN where neither exists
+    #   (we compute the numeric sum but preserve NaN where both are NaN)
+    gpc_nan = np.isnan(masked_gpc)
+    cpg_nan = np.isnan(masked_cpg)
+    combined_sum = np.nan_to_num(masked_gpc, nan=0.0) + np.nan_to_num(masked_cpg, nan=0.0)
+    both_nan = gpc_nan & cpg_nan
+    combined_sum[both_nan] = np.nan
+
+    # Alternative: for a boolean OR combined layer, uncomment:
+    # combined_bool = (~gpc_nan & (masked_gpc != 0)) | (~cpg_nan & (masked_cpg != 0))
+    # combined_layer = combined_bool.astype(np.float32)
+
+    adata.layers['GpC_site_binary'] = masked_gpc
+    adata.layers['CpG_site_binary'] = masked_cpg
+    adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
+    adata.layers['any_C_site_binary'] = masked_any_c
+    adata.layers['other_C_site_binary'] = masked_other_c
+
+    if verbose:
+        def _filled_positions(arr):
+            return int(np.sum(~np.isnan(arr)))
+        print("Layer build summary (non-NaN cell counts):")
+        print(f"  GpC: {_filled_positions(masked_gpc)}")
+        print(f"  CpG: {_filled_positions(masked_cpg)}")
+        print(f"  GpC+CpG combined: {_filled_positions(combined_sum)}")
+        print(f"  any_C: {_filled_positions(masked_any_c)}")
+        print(f"  other_C: {_filled_positions(masked_other_c)}")
+
+    # Mark as done
+    adata.uns[uns_flag] = True
+
+    return adata
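Note: the layer builder above does real work only after append_base_context has annotated adata.var with per-reference *_GpC_site, *_CpG_site, *_any_C_site, and *_other_C_site columns and set adata.uns['base_context_added']. A minimal usage sketch follows; the toy AnnData and the refA_* columns are illustrative stand-ins rather than package data, and the import assumes the module path shown in the file list above:

import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.append_binary_layer_by_base_context import append_binary_layer_by_base_context

# Toy AnnData: 3 reads on one reference, 4 positions (hypothetical data).
X = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 1, 0, 0]], dtype=np.float32)
obs = pd.DataFrame({"Reference": pd.Categorical(["refA"] * 3)},
                   index=[f"read_{i}" for i in range(3)])
var = pd.DataFrame({
    "refA_GpC_site":     [True, False, True, False],   # stand-ins for columns append_base_context adds
    "refA_CpG_site":     [False, True, False, False],
    "refA_any_C_site":   [True, True, True, False],
    "refA_other_C_site": [False, False, False, False],
}, index=[f"pos_{i}" for i in range(4)])
adata = ad.AnnData(X=X, obs=obs, var=var)
adata.uns["base_context_added"] = True  # gate checked at the top of the function

adata = append_binary_layer_by_base_context(adata, reference_column="Reference")
# GpC layer keeps X only at refA_GpC_site columns; all other cells are NaN.
print(adata.layers["GpC_site_binary"])
# Combined layer sums the GpC and CpG layers and stays NaN only where neither site exists (pos_3).
print(adata.layers["GpC_CpG_combined_site_binary"])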
smftools/preprocessing/archives/mark_duplicates.py
@@ -0,0 +1,146 @@
+## mark_duplicates
+
+def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
+    """
+    Marks duplicates in the adata object.
+
+    Parameters:
+        adata (AnnData): An adata object.
+        layers (list): A list of strings representing the layers to use.
+        obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
+        sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
+        method (str): Method to use for calculating the distance metric.
+        distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
+
+    Returns:
+        None
+    """
+
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from scipy.signal import find_peaks
+    import networkx as nx
+    from .binary_layers_to_ohe import binary_layers_to_ohe
+    from .calculate_pairwise_differences import calculate_pairwise_differences
+    from .min_non_diagonal import min_non_diagonal
+
+    categories = adata.obs[obs_column].cat.categories
+    sample_names = adata.obs[sample_col].cat.categories
+
+    # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair.
+    adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+    cat_sample_dict = {}
+    for cat in categories:
+        cat_subset = adata[adata.obs[obs_column] == cat].copy()
+        for sample in sample_names:
+            sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+            sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site'] == True].copy()  # only uses C sites from the converted strand
+            # Encode sequencing reads as one-hot encodings
+            print(f'One-hot encoding reads from {sample} on {cat}')
+            cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+            # Unpack the read names and one-hot encodings into lists
+            read_names = []
+            ohe_list = []
+            for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
+                read_names.append(read_name)
+                ohe_list.append(ohe)
+            # Calculate the pairwise Hamming distances
+            if method == 'N_masked_distances':
+                print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+                distance_matrix = calculate_pairwise_differences(ohe_list)
+            else:
+                raise ValueError(f'{method} for calculating differences is not available')
+            n_reads = distance_matrix.shape[0]
+            # Load the Hamming matrix into a dataframe with index and column names as the read_names
+            distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
+            cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
+            if n_reads > 1:
+                # Calculate the minimum non-self distance for every read in the reference and sample
+                min_distance_values = min_non_diagonal(distance_matrix)
+                min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
+                adata.obs.update(min_distance_df)
+
+                if cat in distance_thresholds:
+                    adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
+                else:  # eventually this should be written to use known PCR duplicate controls for thresholding.
+                    # Generate a histogram of minimum non-self distances for each read
+                    if n_reads > 3:
+                        n_bins = n_reads // 4
+                    else:
+                        n_bins = 1
+                    min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
+                    # Normalize the max value in any histogram bin to 1
+                    normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
+                    # Extract the bin index of peak centers in the histogram
+                    try:
+                        peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+                        first_peak_index = peak_centers[0]
+                        offset_index = first_peak_index - 1
+                        # Use the distance corresponding to the first peak as the threshold distance in graph construction
+                        first_peak_distance = min_distance_bins[1][first_peak_index]
+                        offset_distance = min_distance_bins[1][offset_index]
+                    except (IndexError, ValueError):
+                        offset_distance = normalized_min_distance_counts[0]
+                    adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
+            else:
+                adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
+
+    ## Detect likely duplicate reads and mark them in the adata object.
+    adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+    adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+    adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
+
+    for cat in categories:
+        for sample in sample_names:
+            distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+            read_names = distance_df.index
+            distance_matrix = distance_df.values
+            n_reads = distance_matrix.shape[0]
+            distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
+            # Initialize the read distance graph
+            G = nx.Graph()
+            # Add each read as a node to the graph
+            G.add_nodes_from(range(n_reads))
+            # Add edges based on the threshold
+            for i in range(n_reads):
+                for j in range(i + 1, n_reads):
+                    if distance_matrix[i, j] <= distance_threshold:
+                        G.add_edge(i, j)
+            # Determine distinct clusters using connected components
+            clusters = list(nx.connected_components(G))
+            clusters = [list(cluster) for cluster in clusters]
+            # Get the number of clusters
+            cluster_count = len(clusters)
+            if n_reads > 0:
+                fraction_unique = cluster_count / n_reads
+            else:
+                fraction_unique = 0
+            adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+            adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
+            # Update the adata object
+            read_cluster_map = {}
+            read_duplicate_map = {}
+            read_keep_map = {}
+            for i, cluster in enumerate(clusters):
+                for j, read_index in enumerate(cluster):
+                    read_name = read_names[read_index]
+                    read_cluster_map[read_name] = i
+                    if len(cluster) > 1:
+                        read_duplicate_map[read_name] = True
+                        if j == 0:
+                            read_keep_map[read_name] = True
+                        else:
+                            read_keep_map[read_name] = False
+                    elif len(cluster) == 1:
+                        read_duplicate_map[read_name] = False
+                        read_keep_map[read_name] = True
+            cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
+            duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
+            keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
+            df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
+            adata.obs.update(df_combined)
+            adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
+            adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
+            print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {fraction_unique}')
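Note: the duplicate-detection core of mark_duplicates is independent of AnnData: reads become graph nodes, an edge joins any pair whose distance is at or below the per-reference/sample threshold, and each connected component is treated as one duplicate family with a single kept representative. Below is a self-contained sketch of just that step, using a toy distance matrix and a hypothetical threshold (neither comes from the package):

import networkx as nx
import numpy as np

# Toy pairwise distance matrix for 4 reads; reads 0 and 1 are near-identical (hypothetical values).
distance_matrix = np.array([
    [0.0, 0.5, 9.0, 8.0],
    [0.5, 0.0, 9.5, 8.5],
    [9.0, 9.5, 0.0, 7.0],
    [8.0, 8.5, 7.0, 0.0],
])
distance_threshold = 1.0  # hypothetical; mark_duplicates derives this per reference/sample

n_reads = distance_matrix.shape[0]
G = nx.Graph()
G.add_nodes_from(range(n_reads))                # every read is a node
for i in range(n_reads):
    for j in range(i + 1, n_reads):
        if distance_matrix[i, j] <= distance_threshold:
            G.add_edge(i, j)                    # near-duplicate reads are connected

clusters = [sorted(c) for c in nx.connected_components(G)]
kept = [cluster[0] for cluster in clusters]     # one representative per duplicate family
print(clusters)  # e.g. [[0, 1], [2], [3]]
print(kept)      # e.g. [0, 2, 3]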
|