smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0

smftools/preprocessing/binarize_on_Youden.py
@@ -0,0 +1,45 @@
+def binarize_on_Youden(adata, obs_column='Reference'):
+    """
+    Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+
+    Parameters:
+        adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+        obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+    Modifies:
+        Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+    """
+    import numpy as np
+    import anndata as ad
+
+    # Initialize an empty matrix to store the binarized methylation values
+    binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)  # Keeps same shape as adata.X
+
+    # Get unique categories
+    categories = adata.obs[obs_column].cat.categories
+
+    for cat in categories:
+        # Select subset for this category
+        cat_mask = adata.obs[obs_column] == cat
+        cat_subset = adata[cat_mask]
+
+        # Extract the probability matrix
+        original_matrix = cat_subset.X.copy()
+
+        # Extract the thresholds for each position efficiently
+        thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+
+        # Identify NaN values
+        nan_mask = np.isnan(original_matrix)
+
+        # Binarize based on threshold
+        binarized_matrix = (original_matrix > thresholds).astype(float)
+
+        # Restore NaN values
+        binarized_matrix[nan_mask] = np.nan
+
+        # Assign the binarized values back into the preallocated storage
+        binarized_methylation[cat_mask, :] = binarized_matrix
+
+    # Store the binarized matrix in a new layer
+    adata.layers['binarized_methylation'] = binarized_methylation
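A minimal usage sketch for the new `binarize_on_Youden` helper. The import path, input file name, and prerequisite state are assumptions: the function is assumed to be re-exported from `smftools.preprocessing`, and `calculate_position_Youden` is assumed to have already populated the per-category `*_position_methylation_thresholding_Youden_stats` columns in `adata.var`.

```python
# Hypothetical usage; import path and input file are assumptions, and
# calculate_position_Youden is assumed to have been run beforehand.
import anndata as ad
from smftools import preprocessing as pp

adata = ad.read_h5ad("smf_experiment.h5ad")            # placeholder path
pp.binarize_on_Youden(adata, obs_column="Reference")
binarized = adata.layers["binarized_methylation"]      # NaN where the input was NaN, else 0.0 / 1.0
print(binarized.shape)
```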

smftools/preprocessing/binary_layers_to_ohe.py
@@ -0,0 +1,40 @@
+## binary_layers_to_ohe
+
+## Conversion SMF Specific
+def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
+    """
+    Parameters:
+        adata (AnnData): Anndata object.
+        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
+        stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
+
+    Returns:
+        ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
+    Input: An adata object and a list of layers containing a binary encoding.
+    """
+    import numpy as np
+    import anndata as ad
+
+    # Ensure that the N layer is last!
+    # Grab all binary layers that are not encoding N
+    ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+    # If there is a binary layer encoding N, hold it in N_binary_layer
+    N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+    # Add the N_binary_encoding layer to the end of the list of binary layers
+    all_binary_layers = ACGT_binary_layers + N_binary_layer
+    print(f'Found {all_binary_layers} layers in adata')
+
+    # Extract the layers
+    layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
+    n_reads = layers[0].shape[0]
+    ohe_dict = {}
+    for i in range(n_reads):
+        read_ohe = []
+        for layer in layers:
+            read_ohe.append(layer[i])
+        read_name = adata.obs_names[i]
+        if stack == 'hstack':
+            ohe_dict[read_name] = np.hstack(read_ohe)
+        elif stack == 'vstack':
+            ohe_dict[read_name] = np.vstack(read_ohe)
+    return ohe_dict
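A small self-contained sketch of how `binary_layers_to_ohe` might be called, using a toy AnnData whose layers follow the `<base>_binary_encoding` naming convention that the function expects; the import path is an assumption.

```python
import numpy as np
import anndata as ad
from smftools.preprocessing import binary_layers_to_ohe  # assumed import path

# Toy object: 2 reads x 4 positions, one binary layer per base plus the N layer.
adata = ad.AnnData(np.zeros((2, 4)))
rng = np.random.default_rng(0)
for base in ["A", "C", "G", "T", "N"]:
    adata.layers[f"{base}_binary_encoding"] = rng.integers(0, 2, size=(2, 4)).astype(float)

layers = [f"{b}_binary_encoding" for b in ["A", "C", "G", "T", "N"]]
ohe = binary_layers_to_ohe(adata, layers, stack="hstack")
print(ohe[adata.obs_names[0]].shape)  # (20,): 5 layers x 4 positions, N layer last
```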

smftools/preprocessing/calculate_complexity.py
@@ -0,0 +1,72 @@
+## calculate_complexity
+
+def calculate_complexity(adata, output_directory='', obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
+    """
+    A complexity analysis of the library.
+
+    Parameters:
+        adata (AnnData): An adata object with mark_duplicates already run.
+        output_directory (str): String representing the path to the output directory.
+        obs_column (str): String of the obs column to iterate over.
+        sample_col (str): String of the sample column to iterate over.
+        plot (bool): Whether to plot the complexity model.
+        save_plot (bool): Whether to save the complexity model.
+
+    Returns:
+        None
+
+    """
+    import numpy as np
+    import pandas as pd
+    from scipy.optimize import curve_fit
+
+    def lander_waterman(x, C0):
+        return C0 * (1 - np.exp(-x / C0))
+
+    def count_unique_reads(reads, depth):
+        subsample = np.random.choice(reads, depth, replace=False)
+        return len(np.unique(subsample))
+
+    categories = adata.obs[obs_column].cat.categories
+    sample_names = adata.obs[sample_col].cat.categories
+
+    for cat in categories:
+        for sample in sample_names:
+            unique_reads = adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}']
+            total_reads = adata.uns[f'total_reads_within_{cat}_{sample}']
+            reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
+            # Subsampling depths
+            subsampling_depths = [total_reads // (i+1) for i in range(10)]
+            # Arrays to store results
+            subsampled_total_reads = []
+            subsampled_unique_reads = []
+            # Perform subsampling
+            for depth in subsampling_depths:
+                unique_count = count_unique_reads(reads, depth)
+                subsampled_total_reads.append(depth)
+                subsampled_unique_reads.append(unique_count)
+            # Fit the Lander-Waterman model to the data
+            popt, _ = curve_fit(lander_waterman, subsampled_total_reads, subsampled_unique_reads)
+            # Generate data for the complexity curve
+            x_data = np.linspace(0, 5000, 100)
+            y_data = lander_waterman(x_data, *popt)
+            adata.uns[f'Library_complexity_of_{sample}_on_{cat}'] = popt[0]
+            if plot:
+                import matplotlib.pyplot as plt
+                # Plot the complexity curve
+                plt.figure(figsize=(6, 4))
+                plt.plot(total_reads, unique_reads, 'o', label='Observed unique reads')
+                plt.plot(x_data, y_data, '-', label=f'Lander-Waterman fit\nEstimated C0 = {popt[0]:.2f}')
+                plt.xlabel('Total number of reads')
+                plt.ylabel('Number of unique reads')
+                title = f'Library Complexity Analysis for {sample} on {cat}'
+                plt.title(title)
+                plt.legend()
+                plt.grid(True)
+                if save_plot:
+                    date_str = pd.Timestamp.now().strftime('%Y%m%d')  # date stamp used as a file-name prefix
+                    save_name = output_directory + f'/{date_str}_{title}'
+                    plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+                    plt.close()
+                else:
+                    plt.show()
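`calculate_complexity` fits the Lander-Waterman saturation model C(x) = C0 * (1 - exp(-x / C0)), where x is the number of reads sequenced and C0 is the estimated number of unique molecules in the library. A standalone numeric sketch of that fit on synthetic counts (independent of the package's `adata.uns` keys):

```python
import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, C0):
    # Expected number of unique molecules observed after x reads from a library of complexity C0
    return C0 * (1 - np.exp(-x / C0))

true_C0 = 2000.0
depths = np.array([250, 500, 1000, 2000, 4000], dtype=float)
rng = np.random.default_rng(1)
observed_unique = lander_waterman(depths, true_C0) + rng.normal(0, 10, size=depths.size)

popt, _ = curve_fit(lander_waterman, depths, observed_unique, p0=[observed_unique[-1]])
print(f"Estimated library complexity C0 = {popt[0]:.0f}")  # close to 2000
```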

smftools/preprocessing/calculate_complexity_II.py
@@ -0,0 +1,248 @@
+from typing import Optional
+def calculate_complexity_II(
+    adata,
+    output_directory='',
+    sample_col='Sample_names',
+    ref_col: Optional[str] = 'Reference_strand',
+    cluster_col='sequence__merged_cluster_id',
+    plot=True,
+    save_plot=False,
+    n_boot=30,
+    n_depths=12,
+    random_state=0,
+    csv_summary=True,
+    uns_flag='complexity_analysis_complete',
+    force_redo=False,
+    bypass=False
+):
+    """
+    Estimate and plot library complexity.
+
+    If ref_col is None, one complexity estimate is computed per sample.
+    If ref_col is provided (default 'Reference_strand'), complexity is computed for each (sample, ref) pair.
+
+    Results:
+    - adata.uns['Library_complexity_results'] : dict keyed by (sample,) or (sample, ref) -> dict with fields
+      C0, n_reads, n_unique, depths, mean_unique, ci_low, ci_high
+    - Also stores per-entity record in adata.uns[f'Library_complexity_{sanitized_name}'] (backwards compatible)
+    - Optionally saves PNGs and CSVs (curve points + fit summary)
+    """
+    import os
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from scipy.optimize import curve_fit
+    from datetime import datetime
+
+    # early exits
+    already = bool(adata.uns.get(uns_flag, False))
+    if (already and not force_redo):
+        return None
+    if bypass:
+        return None
+
+    rng = np.random.default_rng(random_state)
+
+    def lw(x, C0):
+        return C0 * (1.0 - np.exp(-x / C0))
+
+    def sanitize(name: str) -> str:
+        return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
+
+    # checks
+    for col in (sample_col, cluster_col):
+        if col not in adata.obs.columns:
+            raise KeyError(f"Required column '{col}' not found in adata.obs")
+    if ref_col is not None and ref_col not in adata.obs.columns:
+        raise KeyError(f"ref_col '{ref_col}' not found in adata.obs")
+
+    if save_plot or csv_summary:
+        os.makedirs(output_directory or ".", exist_ok=True)
+
+    # containers to collect CSV rows across all groups
+    fit_records = []
+    curve_records = []
+
+    # output dict stored centrally
+    results = {}
+
+    # build list of groups: either samples only, or (sample, ref) pairs
+    sseries = adata.obs[sample_col].astype("category")
+    samples = list(sseries.cat.categories)
+    if ref_col is None:
+        group_keys = [(s,) for s in samples]
+    else:
+        rseries = adata.obs[ref_col].astype("category")
+        references = list(rseries.cat.categories)
+        group_keys = []
+        # iterate only pairs that exist in data to avoid empty processing
+        for s in samples:
+            mask_s = (adata.obs[sample_col] == s)
+            # find references present for this sample
+            ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
+            # Use intersection of known reference categories and those present for sample
+            for r in ref_present:
+                group_keys.append((s, r))
+
+    # iterate groups
+    for g in group_keys:
+        if ref_col is None:
+            sample = g[0]
+            # filter mask
+            mask = (adata.obs[sample_col] == sample).values
+            group_label = f"{sample}"
+        else:
+            sample, ref = g
+            mask = (adata.obs[sample_col] == sample) & (adata.obs[ref_col] == ref)
+            group_label = f"{sample}__{ref}"
+
+        n_reads = int(mask.sum())
+        if n_reads < 2:
+            # store empty placeholders and continue
+            results[g] = {
+                "C0": np.nan,
+                "n_reads": int(n_reads),
+                "n_unique": 0,
+                "depths": np.array([], dtype=int),
+                "mean_unique": np.array([], dtype=float),
+                "ci_low": np.array([], dtype=float),
+                "ci_high": np.array([], dtype=float),
+            }
+            # also store back-compat key
+            adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+            continue
+
+        # cluster ids array for this group
+        clusters = adata.obs.loc[mask, cluster_col].to_numpy()
+        # observed unique molecules at full depth
+        observed_unique = int(pd.unique(clusters).size)
+
+        # choose subsampling depths
+        if n_depths < 2:
+            depths = np.array([n_reads], dtype=int)
+        else:
+            lo = max(10, int(0.05 * n_reads))
+            depths = np.unique(np.linspace(lo, n_reads, n_depths, dtype=int))
+        depths = depths[depths > 0]
+        depths = depths.astype(int)
+        if depths.size == 0:
+            depths = np.array([n_reads], dtype=int)
+
+        # bootstrap sampling: for each depth, sample without replacement (if possible)
+        idx_all = np.arange(n_reads)
+        boot_unique = np.zeros((len(depths), n_boot), dtype=float)
+        for di, d in enumerate(depths):
+            d_use = int(min(d, n_reads))
+            # if d_use == n_reads we can short-circuit and set boot results to full observed uniques
+            if d_use == n_reads:
+                # bootstraps are deterministic in this special case
+                uniq_val = float(observed_unique)
+                boot_unique[di, :] = uniq_val
+                continue
+            # otherwise run bootstraps
+            for b in range(n_boot):
+                take = rng.choice(idx_all, size=d_use, replace=False)
+                boot_unique[di, b] = np.unique(clusters[take]).size
+
+        mean_unique = boot_unique.mean(axis=1)
+        lo_ci = np.percentile(boot_unique, 2.5, axis=1)
+        hi_ci = np.percentile(boot_unique, 97.5, axis=1)
+
+        # fit Lander-Waterman to the mean curve (safe bounds)
+        C0_init = max(observed_unique, mean_unique[-1] if mean_unique.size else observed_unique)
+        try:
+            popt, _ = curve_fit(
+                lw,
+                xdata=depths.astype(float),
+                ydata=mean_unique.astype(float),
+                p0=[C0_init],
+                bounds=(1.0, 1e12),
+                maxfev=10000,
+            )
+            C0 = float(popt[0])
+        except Exception:
+            C0 = float(observed_unique)
+
+        # store results
+        results[g] = {
+            "C0": C0,
+            "n_reads": int(n_reads),
+            "n_unique": int(observed_unique),
+            "depths": depths,
+            "mean_unique": mean_unique,
+            "ci_low": lo_ci,
+            "ci_high": hi_ci,
+        }
+
+        # save per-group in adata.uns for backward compatibility
+        adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+
+        # prepare curve and fit records for CSV
+        fit_records.append({
+            "sample": sample,
+            "reference": ref if ref_col is not None else "",
+            "C0": float(C0),
+            "n_reads": int(n_reads),
+            "n_unique_observed": int(observed_unique),
+        })
+
+        x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
+        y_fit = lw(x_fit, C0)
+        for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
+            curve_records.append({
+                "sample": sample,
+                "reference": ref if ref_col is not None else "",
+                "type": "bootstrap",
+                "depth": int(d),
+                "mean_unique": float(mu),
+                "ci_low": float(lo),
+                "ci_high": float(hi),
+            })
+        for xf, yf in zip(x_fit, y_fit):
+            curve_records.append({
+                "sample": sample,
+                "reference": ref if ref_col is not None else "",
+                "type": "fit",
+                "depth": float(xf),
+                "mean_unique": float(yf),
+                "ci_low": np.nan,
+                "ci_high": np.nan,
+            })
+
+        # plotting for this group
+        if plot:
+            plt.figure(figsize=(6.5, 4.5))
+            plt.fill_between(depths, lo_ci, hi_ci, alpha=0.25, label="Bootstrap 95% CI")
+            plt.plot(depths, mean_unique, "o", label="Bootstrap mean")
+            plt.plot([n_reads], [observed_unique], "s", label="Observed (full)")
+            plt.plot(x_fit, y_fit, "-", label=f"LW fit C0≈{C0:,.0f}")
+            plt.xlabel("Total reads (subsampled depth)")
+            plt.ylabel("Unique molecules (clusters)")
+            title = f"Library Complexity — {sample}" + (f" / {ref}" if ref_col is not None else "")
+            plt.title(title)
+            plt.grid(True, alpha=0.3)
+            plt.legend()
+            plt.tight_layout()
+
+            if save_plot:
+                fname = f"complexity_{sanitize(group_label)}.png"
+                plt.savefig(os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight")
+                plt.close()
+            else:
+                plt.show()
+
+    # store central results dict
+    adata.uns["Library_complexity_results"] = results
+
+    # mark complexity analysis as complete
+    adata.uns[uns_flag] = True
+
+    # CSV outputs
+    if csv_summary and (fit_records or curve_records):
+        fit_df = pd.DataFrame(fit_records)
+        curve_df = pd.DataFrame(curve_records)
+        base = output_directory or "."
+        fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+        curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)
+
+    return results
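A hedged sketch of how `calculate_complexity_II` might be invoked. It assumes `adata` was produced by the package's load/preprocessing pipeline so that the default `Sample_names`, `Reference_strand`, and `sequence__merged_cluster_id` obs columns exist, and that the function is re-exported from `smftools.preprocessing`.

```python
# Hypothetical call; `adata` is assumed to be an AnnData with the obs columns
# named below already populated (e.g. by the duplicate-flagging step).
from smftools import preprocessing as pp

results = pp.calculate_complexity_II(
    adata,
    output_directory="complexity_out",
    sample_col="Sample_names",
    ref_col="Reference_strand",
    cluster_col="sequence__merged_cluster_id",
    plot=False,
    csv_summary=True,
)
# Returns a dict keyed by (sample, reference) unless the analysis was already flagged complete.
for key, rec in results.items():
    print(key, f"C0={rec['C0']:.0f}", f"unique={rec['n_unique']}/{rec['n_reads']}")
```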

smftools/preprocessing/calculate_consensus.py
@@ -0,0 +1,47 @@
+# calculate_consensus
+
+def calculate_consensus(adata, reference, sample=False, reference_column='Reference', sample_column='Sample'):
+    """
+    Takes an input AnnData object, the reference to subset on, and the sample name to subset on to calculate the consensus sequence of the read set.
+
+    Parameters:
+        adata (AnnData): The input adata to append consensus metadata to.
+        reference (str): The name of the reference to subset the adata on.
+        sample (bool | str): If False, uses all samples. If a string is passed, the adata is further subsetted to only analyze that sample.
+        reference_column (str): The name of the reference column (Default is 'Reference')
+        sample_column (str): The name of the sample column (Default is 'Sample')
+
+    Returns:
+        None
+
+    """
+    import numpy as np
+
+    # Subset the adata on the reference of interest. Optionally, subset additionally on a sample of interest.
+    record_subset = adata[adata.obs[reference_column] == reference].copy()
+    if sample:
+        record_subset = record_subset[record_subset.obs[sample_column] == sample].copy()
+    else:
+        pass
+
+    # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
+    layers = [layer for layer in record_subset.layers if '_binary_' in layer]
+    layer_map, layer_counts = {}, []
+    for i, layer in enumerate(layers):
+        # Gives an integer mapping to access which sequence base the binary layer is encoding
+        layer_map[i] = layer.split('_')[0]
+        # Get the positional counts from all reads for the given base identity.
+        layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+    # Combine the positional counts array derived from each binary base layer into an ndarray
+    count_array = np.array(layer_counts)
+    # Determine the row index that contains the largest count for each position and store this in an array.
+    nucleotide_indexes = np.argmax(count_array, axis=0)
+    # Map the base sequence derived from the row index array to obtain the consensus sequence as a list.
+    consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+
+    if sample:
+        adata.var[f'{reference}_consensus_from_{sample}'] = consensus_sequence_list
+    else:
+        adata.var[f'{reference}_consensus_across_samples'] = consensus_sequence_list
+
+    adata.uns[f'{reference}_consensus_sequence'] = consensus_sequence_list
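A hedged usage sketch for `calculate_consensus`. The reference name below is a placeholder, the import path is an assumption, and the `<base>_binary_encoding` layers plus the `Reference`/`Sample` obs columns are assumed to already be present on `adata`.

```python
# Hypothetical usage; reference name and import path are placeholders/assumptions.
from smftools import preprocessing as pp

pp.calculate_consensus(adata, reference="my_reference", sample=False)   # consensus across all samples
consensus = adata.uns["my_reference_consensus_sequence"]                # list of base characters, one per position
print("".join(consensus)[:50])
```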

smftools/preprocessing/calculate_coverage.py
@@ -0,0 +1,51 @@
+def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.00001, uns_flag='positional_coverage_calculated'):
+    """
+    Append position-level metadata regarding whether the position is informative within the given observation category.
+
+    Parameters:
+        adata (AnnData): An AnnData object
+        obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
+        position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
+        uns_flag (str): Key in adata.uns used to mark that positional coverage has already been calculated.
+    Modifies:
+        - Adds new columns to `adata.var` containing coverage statistics.
+    """
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+
+    # Only run if not already performed
+    already = bool(adata.uns.get(uns_flag, False))
+    if already:
+        # Coverage already calculated; nothing to do
+        return
+
+    categories = adata.obs[obs_column].cat.categories
+    n_categories_with_position = np.zeros(adata.shape[1])
+
+    # Loop over categories
+    for cat in categories:
+        print(f'Assessing positional coverage across samples for {cat} reference')
+
+        # Subset to current category
+        cat_mask = adata.obs[obs_column] == cat
+        temp_cat_adata = adata[cat_mask]
+
+        # Compute fraction of valid coverage
+        cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
+        cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0]  # Avoid extra computation
+
+        # Store coverage stats
+        adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
+
+        # Assign whether the position is covered based on threshold
+        adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+
+        # Sum the number of categories covering each position
+        n_categories_with_position += adata.var[f'position_in_{cat}'].values
+
+    # Store final category count
+    adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+
+    # mark as done
+    adata.uns[uns_flag] = True
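A hedged sketch of calling `calculate_coverage` and reading back the per-position columns it writes; the import path is an assumption, and `adata.obs['Reference_strand']` is assumed to be categorical.

```python
# Hypothetical usage; import path is an assumption.
from smftools import preprocessing as pp

pp.calculate_coverage(adata, obs_column="Reference_strand", position_nan_threshold=0.05)
n_categories = adata.obs["Reference_strand"].cat.categories.size
covered_everywhere = adata.var["N_Reference_strand_with_position"] == n_categories
print(int(covered_everywhere.sum()), "positions covered in every reference/strand category")
```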

smftools/preprocessing/calculate_pairwise_differences.py
@@ -0,0 +1,49 @@
+# calculate_pairwise_differences
+
+def calculate_pairwise_differences(arrays):
+    """
+    Calculate the pairwise differences for a list of h-stacked ndarrays, ignoring N positions.
+
+    Parameters:
+        arrays (list): A list of ndarrays.
+
+    Returns:
+        distance_matrix (ndarray): a 2D array containing the pairwise differences between all arrays.
+    """
+    import numpy as np
+    from tqdm import tqdm
+
+    num_arrays = len(arrays)
+
+    n_rows = 5  # each h-stacked array encodes 5 binary rows (A, C, G, T, N); the N row is last
+    reshaped_arrays = [array.reshape(n_rows, -1) for array in arrays]
+    N_masks = [array[-1].astype(bool) for array in reshaped_arrays]
+    reshaped_arrays_minus_N = [array[:-1].flatten() for array in reshaped_arrays]
+
+    # Precompute the repeated N masks to avoid repeated computations
+    repeated_N_masks = [np.tile(N_mask, (n_rows - 1)) for N_mask in N_masks]
+
+    # Initialize the distance matrix
+    distance_matrix = np.zeros((num_arrays, num_arrays), dtype=np.float32)
+
+    # Calculate pairwise distances with progress bar
+    for i in tqdm(range(num_arrays), desc="Calculating Pairwise Differences"):
+        array_i = reshaped_arrays_minus_N[i]
+        N_mask_i = repeated_N_masks[i]
+
+        for j in range(i + 1, num_arrays):
+            array_j = reshaped_arrays_minus_N[j]
+            N_mask_j = repeated_N_masks[j]
+
+            # Combined mask to ignore N positions
+            combined_mask = N_mask_i | N_mask_j
+
+            # Calculate the Hamming distance directly with boolean operations
+            differences = (array_i != array_j) & ~combined_mask
+            distance = np.sum(differences) / np.sum(~combined_mask)
+
+            # Store the symmetric distances
+            distance_matrix[i, j] = distance
+            distance_matrix[j, i] = distance
+
+    return distance_matrix
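A small self-contained sketch of `calculate_pairwise_differences` on toy h-stacked one-hot reads; the import path is an assumption, and the 5-row (A, C, G, T, N) layout mirrors what `binary_layers_to_ohe` produces with `stack='hstack'`.

```python
import numpy as np
from smftools.preprocessing import calculate_pairwise_differences  # assumed import path

rng = np.random.default_rng(0)

def random_read(n_pos=4):
    # 5 binary rows (A, C, G, T, N); one random A/C/G/T call per position, no Ns.
    ohe = np.zeros((5, n_pos))
    ohe[rng.integers(0, 4, size=n_pos), np.arange(n_pos)] = 1
    return ohe.flatten()  # h-stacked layout expected by the function

arrays = [random_read() for _ in range(3)]
dist = calculate_pairwise_differences(arrays)
print(dist)  # symmetric matrix of mismatch fractions over the non-N one-hot entries
```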
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
## calculate_pairwise_hamming_distances
|
|
2
|
+
|
|
3
|
+
## Conversion SMF Specific
|
|
4
|
+
def calculate_pairwise_hamming_distances(arrays):
|
|
5
|
+
"""
|
|
6
|
+
Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
arrays (str): A list of ndarrays.
|
|
10
|
+
|
|
11
|
+
Returns:
|
|
12
|
+
distance_matrix (ndarray): a 2D array containing the pairwise Hamming distances between all arrays.
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
import numpy as np
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
from scipy.spatial.distance import hamming
|
|
18
|
+
num_arrays = len(arrays)
|
|
19
|
+
# Initialize an empty distance matrix
|
|
20
|
+
distance_matrix = np.zeros((num_arrays, num_arrays))
|
|
21
|
+
# Calculate pairwise distances with progress bar
|
|
22
|
+
for i in tqdm(range(num_arrays), desc="Calculating Hamming Distances"):
|
|
23
|
+
for j in range(i + 1, num_arrays):
|
|
24
|
+
distance = hamming(arrays[i], arrays[j])
|
|
25
|
+
distance_matrix[i, j] = distance
|
|
26
|
+
distance_matrix[j, i] = distance
|
|
27
|
+
return distance_matrix
|
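A tiny self-contained sketch of `calculate_pairwise_hamming_distances`; the import path is an assumption. Note that scipy's `hamming()` returns the fraction of differing positions, not a raw count.

```python
import numpy as np
from smftools.preprocessing import calculate_pairwise_hamming_distances  # assumed import path

arrays = [
    np.array([0, 1, 1, 0]),
    np.array([0, 1, 0, 0]),
    np.array([1, 1, 1, 0]),
]
dist = calculate_pairwise_hamming_distances(arrays)
print(dist)  # e.g. dist[0, 1] == 0.25: one of four positions differs
```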