smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +5 -1
- smftools/_version.py +1 -1
- smftools/informatics/__init__.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/conversion_smf.py +63 -10
- smftools/informatics/direct_smf.py +66 -18
- smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
- smftools/informatics/helpers/__init__.py +16 -2
- smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
- smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
- smftools/informatics/helpers/canoncall.py +12 -3
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
- smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/extract_base_identities.py +33 -46
- smftools/informatics/helpers/extract_mods.py +55 -23
- smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/find_conversion_sites.py +33 -44
- smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
- smftools/informatics/helpers/modcall.py +13 -5
- smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
- smftools/informatics/helpers/ohe_batching.py +65 -41
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +45 -9
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/split_and_index_BAM.py +3 -8
- smftools/informatics/load_adata.py +58 -3
- smftools/plotting/__init__.py +15 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +205 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/preprocessing/__init__.py +6 -7
- smftools/preprocessing/append_C_context.py +22 -9
- smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
- smftools/preprocessing/binarize_on_Youden.py +35 -32
- smftools/preprocessing/binary_layers_to_ohe.py +13 -3
- smftools/preprocessing/calculate_complexity.py +3 -2
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
- smftools/preprocessing/calculate_coverage.py +26 -25
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_position_Youden.py +18 -7
- smftools/preprocessing/calculate_read_length_stats.py +39 -46
- smftools/preprocessing/clean_NaN.py +33 -25
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
- smftools/preprocessing/filter_reads_on_length.py +14 -4
- smftools/preprocessing/flag_duplicate_reads.py +149 -0
- smftools/preprocessing/invert_adata.py +18 -11
- smftools/preprocessing/load_sample_sheet.py +30 -16
- smftools/preprocessing/recipes.py +22 -20
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +105 -13
- smftools/tools/__init__.py +49 -0
- smftools/tools/apply_hmm.py +202 -0
- smftools/tools/apply_hmm_batched.py +241 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_distances.py +18 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/call_hmm_peaks.py +105 -0
- smftools/tools/classifiers.py +787 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/data/__init__.py +2 -0
- smftools/tools/data/anndata_data_module.py +90 -0
- smftools/tools/data/preprocessing.py +6 -0
- smftools/tools/display_hmm.py +18 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/hmm_readwrite.py +16 -0
- smftools/tools/inference/__init__.py +1 -0
- smftools/tools/inference/lightning_inference.py +41 -0
- smftools/tools/models/__init__.py +9 -0
- smftools/tools/models/base.py +14 -0
- smftools/tools/models/cnn.py +34 -0
- smftools/tools/models/lightning_base.py +41 -0
- smftools/tools/models/mlp.py +17 -0
- smftools/tools/models/positional.py +17 -0
- smftools/tools/models/rnn.py +16 -0
- smftools/tools/models/sklearn_models.py +40 -0
- smftools/tools/models/transformer.py +133 -0
- smftools/tools/models/wrappers.py +20 -0
- smftools/tools/nucleosome_hmm_refinement.py +104 -0
- smftools/tools/position_stats.py +239 -0
- smftools/tools/read_stats.py +70 -0
- smftools/tools/subset_adata.py +19 -23
- smftools/tools/train_hmm.py +78 -0
- smftools/tools/training/__init__.py +1 -0
- smftools/tools/training/train_lightning_model.py +47 -0
- smftools/tools/utils/__init__.py +2 -0
- smftools/tools/utils/device.py +10 -0
- smftools/tools/utils/grl.py +14 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
- smftools-0.1.7.dist-info/RECORD +136 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
- /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py}

@@ -1,6 +1,6 @@
 ## mark_duplicates
 
-def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names',
+def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
     """
     Marks duplicates in the adata object.
 
@@ -8,8 +8,9 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
         adata (AnnData): An adata object.
         layers (list): A list of strings representing the layers to use.
         obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
-        sample_col (str):
-
+        sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
+        method (str): method to use for calculating the distance metric
+        distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
 
     Returns:
         None
@@ -21,7 +22,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
     from scipy.signal import find_peaks
     import networkx as nx
     from .binary_layers_to_ohe import binary_layers_to_ohe
-    from .
+    from .calculate_pairwise_differences import calculate_pairwise_differences
     from .min_non_diagonal import min_non_diagonal
 
     categories = adata.obs[obs_column].cat.categories
@@ -29,49 +30,59 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
     # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+    cat_sample_dict = {}
     for cat in categories:
         cat_subset = adata[adata.obs[obs_column] == cat].copy()
         for sample in sample_names:
             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+            sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site'] == True].copy() # only uses C sites from the converted strand
             # Encode sequencing reads as a one-hot-encodings
-
+            print(f'One-hot encoding reads from {sample} on {cat}')
+            cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
             # Unpack the read names and one hot encodings into lists
             read_names = []
             ohe_list = []
-            for read_name, ohe in
+            for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
                 read_names.append(read_name)
                 ohe_list.append(ohe)
             # Calculate the pairwise hamming distances
-
-
+            if method == 'N_masked_distances':
+                print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+                distance_matrix = calculate_pairwise_differences(ohe_list)
+            else:
+                print(f'{method} for calculating differences is not available')
             n_reads = distance_matrix.shape[0]
             # Load the hamming matrix into a dataframe with index and column names as the read_names
             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
-
-
+            cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
             if n_reads > 1:
                 # Calculate the minimum non-self distance for every read in the reference and sample
                 min_distance_values = min_non_diagonal(distance_matrix)
                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
                 adata.obs.update(min_distance_df)
-
-                if
-
-                else:
-                    n_bins = 1
-                min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
-                if cat in hamming_distance_thresholds:
-                    adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
+
+                if cat in distance_thresholds:
+                    adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
                 else: # eventually this should be written to use known PCR duplicate controls for thresholding.
+                    # Generate a histogram of minimum non-self distances for each read
+                    if n_reads > 3:
+                        n_bins = n_reads // 4
+                    else:
+                        n_bins = 1
+                    min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
                     # Normalize the max value in any histogram bin to 1
                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
                     # Extract the bin index of peak centers in the histogram
-
-
-
-
-
-
+                    try:
+                        peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+                        first_peak_index = peak_centers[0]
+                        offset_index = first_peak_index-1
+                        # Use the distance corresponding to the first peak as the threshold distance in graph construction
+                        first_peak_distance = min_distance_bins[1][first_peak_index]
+                        offset_distance = min_distance_bins[1][offset_index]
+                    except:
+                        offset_distance = normalized_min_distance_counts[0]
                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
             else:
                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
@@ -83,7 +94,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
     for cat in categories:
         for sample in sample_names:
-            distance_df =
+            distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
             read_names = distance_df.index
             distance_matrix = distance_df.values
             n_reads = distance_matrix.shape[0]
@@ -106,7 +117,8 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
                 fraction_unique = cluster_count / n_reads
             else:
                 fraction_unique = 0
-            adata.uns[f'
+            adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+            adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
             # Update the adata object
             read_cluster_map = {}
             read_duplicate_map = {}
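
The rewritten thresholding path histograms each read's nearest-neighbor distance and, when no explicit threshold is supplied for a reference, takes the bin edge just before the first histogram peak as the duplicate-calling cutoff. A minimal, self-contained sketch of that idea on toy distances (not smftools' API; the `find_peaks` parameters are copied from the hunk above):

```python
import numpy as np
from scipy.signal import find_peaks

rng = np.random.default_rng(0)
# Toy nearest-neighbor Hamming distances: a duplicate mode near 0.01, a unique mode near 0.25
distances = np.clip(np.concatenate([rng.normal(0.01, 0.005, 50),
                                    rng.normal(0.25, 0.05, 500)]), 0, 1)

# Histogram with ~n_reads // 4 bins and the tallest bin normalized to 1, as in the hunk
counts, edges = np.histogram(distances, bins=max(len(distances) // 4, 1))
normalized = counts / counts.max()

peaks, _ = find_peaks(normalized, prominence=0.2, distance=5)
# Use the bin edge just below the first peak; fall back to the first edge if no peak is found
threshold = edges[max(peaks[0] - 1, 0)] if len(peaks) else edges[0]
print(f"duplicate-calling distance threshold ~ {threshold:.3f}")
```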

smftools/preprocessing/binarize_on_Youden.py

@@ -1,42 +1,45 @@
-## binarize_on_Youden
-
 def binarize_on_Youden(adata, obs_column='Reference'):
     """
-
+    Binarize SMF values based on position thresholds determined by calculate_position_Youden.
 
     Parameters:
-        adata (AnnData): The anndata object to binarize.
-        obs_column (str): The
-
-
+        adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+        obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+    Modifies:
+        Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
     """
     import numpy as np
-    import anndata as ad
-
-
+    import anndata as ad
+
+    # Initialize an empty matrix to store the binarized methylation values
+    binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
+
+    # Get unique categories
+    categories = adata.obs[obs_column].cat.categories
+
     for cat in categories:
-        #
-
-
-
-        #
-
-
+        # Select subset for this category
+        cat_mask = adata.obs[obs_column] == cat
+        cat_subset = adata[cat_mask]
+
+        # Extract the probability matrix
+        original_matrix = cat_subset.X.copy()
+
+        # Extract the thresholds for each position efficiently
+        thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+
+        # Identify NaN values
         nan_mask = np.isnan(original_matrix)
-
+
+        # Binarize based on threshold
         binarized_matrix = (original_matrix > thresholds).astype(float)
-
+
+        # Restore NaN values
         binarized_matrix[nan_mask] = np.nan
-
-
-
-
-
-
-        # If temp_adata is still None, initialize temp_adata with reference_subset
-        temp_adata = cat_subset.copy()
-
-        # Sort the temp adata on the index names of the primary adata
-        temp_adata = temp_adata[adata.obs_names].copy()
-        # Pull back the new binarized layers into the original adata object
-        adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
+
+        # Assign the binarized values back into the preallocated storage
+        binarized_methylation[cat_mask, :] = binarized_matrix
+
+    # Store the binarized matrix in a new layer
+    adata.layers['binarized_methylation'] = binarized_methylation
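
The refactor above preallocates one NaN-filled matrix and writes each category's rows back through a boolean mask, rather than copying and reordering per-category AnnData subsets. The per-position thresholding pattern itself reduces to a broadcast comparison; a toy sketch (hypothetical numbers, not smftools' API):

```python
import numpy as np

X = np.array([[0.9, 0.2, np.nan],
              [0.4, 0.8, 0.7]])
thresholds = np.array([0.5, 0.5, 0.6])  # one Youden-derived threshold per position (column)

nan_mask = np.isnan(X)                      # remember missing calls
binarized = (X > thresholds).astype(float)  # thresholds broadcast across rows
binarized[nan_mask] = np.nan                # restore missing calls

print(binarized)  # [[1. 0. nan], [0. 1. 1.]]
```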

smftools/preprocessing/binary_layers_to_ohe.py

@@ -1,11 +1,11 @@
 ## binary_layers_to_ohe
 
 ## Conversion SMF Specific
-def binary_layers_to_ohe(adata,
+def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
     """
     Parameters:
         adata (AnnData): Anndata object.
-
+        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
         stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
 
     Returns:
@@ -14,8 +14,18 @@ def binary_layers_to_ohe(adata, layers, stack='hstack'):
     """
     import numpy as np
     import anndata as ad
+
+    # Ensure that the N layer is last!
+    # Grab all binary layers that are not encoding N
+    ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+    # If there is a binary layer encoding N, hold it in N_binary_layer
+    N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+    # Add the N_binary_encoding layer to the end of the list of binary layers
+    all_binary_layers = ACGT_binary_layers + N_binary_layer
+    print(f'Found {all_binary_layers} layers in adata')
+
     # Extract the layers
-    layers = [adata.layers[layer_name] for layer_name in
+    layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
     n_reads = layers[0].shape[0]
     ohe_dict = {}
     for i in range(n_reads):
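
`binary_layers_to_ohe` now reorders its inputs so the N-encoding layer always sits last; the distance code introduced below relies on that to peel the N block off as a mask. A toy sketch of the resulting hstack layout (hypothetical 2-read, 3-position layers):

```python
import numpy as np

layers = {
    'A_binary_encoding': np.array([[1, 0, 0], [0, 1, 0]]),
    'C_binary_encoding': np.array([[0, 1, 0], [0, 0, 1]]),
    'G_binary_encoding': np.array([[0, 0, 0], [1, 0, 0]]),
    'T_binary_encoding': np.array([[0, 0, 0], [0, 0, 0]]),
    'N_binary_encoding': np.array([[0, 0, 1], [0, 0, 0]]),  # read 0 has an N at position 3
}
# Non-N layers first, N layer last, mirroring the ordering logic in the hunk
order = [name for name in layers if name != 'N_binary_encoding'] + ['N_binary_encoding']

ohe_dict = {f'read_{i}': np.hstack([layers[name][i] for name in order]) for i in range(2)}
print(ohe_dict['read_0'])  # 5 blocks of 3 positions flattened to length 15, N block last
```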

smftools/preprocessing/calculate_complexity.py

@@ -32,7 +32,8 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
 
     for cat in categories:
         for sample in sample_names:
-            unique_reads
+            unique_reads = adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}']
+            total_reads = adata.uns[f'total_reads_within_{cat}_{sample}']
             reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
             # Subsampling depths
             subsampling_depths = [total_reads // (i+1) for i in range(10)]
@@ -49,7 +50,7 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
             # Generate data for the complexity curve
             x_data = np.linspace(0, 5000, 100)
             y_data = lander_waterman(x_data, *popt)
-            adata.uns[f'
+            adata.uns[f'Library_complexity_of_{sample}_on_{cat}'] = popt[0]
             if plot:
                 import matplotlib.pyplot as plt
                 # Plot the complexity curve
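
`calculate_complexity` now pulls the unique-cluster count and total read count written by `mark_duplicates` out of `adata.uns` and fits a saturation curve, storing `popt[0]` as the library complexity. A sketch of such a fit; the exact `lander_waterman` form used by the package is not shown in the hunk, so a common Lander-Waterman-style choice, unique(x) = C0·(1 − exp(−x/C0)), is assumed here:

```python
import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, c0):
    # Expected unique molecules observed after sequencing x reads from a library of c0 molecules
    return c0 * (1.0 - np.exp(-x / c0))

true_c0 = 800
depths = np.linspace(1, 5000, 20)
unique = lander_waterman(depths, true_c0) + np.random.default_rng(1).normal(0, 5, depths.size)

popt, _ = curve_fit(lander_waterman, depths, unique, p0=[unique.max()])
print(f"estimated library complexity ~ {popt[0]:.0f} unique molecules")
```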

smftools/preprocessing/calculate_converted_read_methylation_stats.py

@@ -3,7 +3,7 @@
 ## Conversion SMF Specific
 # Read methylation QC
 
-def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col
+def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col):
     """
     Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
 
@@ -11,9 +11,6 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
         adata (AnnData): An adata object
         reference_column (str): String representing the name of the Reference column to use
         sample_names_col (str): String representing the name of the sample name column to use
-        output_directory (str): String representing the output directory to make and write out the histograms.
-        show_methylation_histogram (bool): Whether to display the histograms.
-        save_methylation_histogram (bool): Whether to save the histograms.
 
     Returns:
         None
@@ -21,8 +18,8 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
     import numpy as np
     import anndata as ad
     import pandas as pd
-
-
+
+    print('Calculating read level methylation statistics')
 
     references = set(adata.obs[reference_column])
     sample_names = set(adata.obs[sample_names_col])
@@ -53,44 +50,45 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
     pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
     adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
 
-    adata.uns['methylation_dict'] = {}
-    n_bins = 50
-    site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+    # Below should be a plotting function
+    # adata.uns['methylation_dict'] = {}
+    # n_bins = 50
+    # site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
 
-    for reference in references:
-        reference_adata = adata[adata.obs[reference_column] == reference].copy()
-        split_reference = reference.split('_')[0][1:]
-        for sample in sample_names:
-            sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
-            for site_type in site_types_to_analyze:
-                methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
-                max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
-                if not np.isnan(max_meth):
-                    n_bins = int(max_meth // 2)
-                else:
-                    n_bins = 1
-                mean = np.mean(methylation_data)
-                median = np.median(methylation_data)
-                stdev = np.std(methylation_data)
-                adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
-                if show_methylation_histogram or save_methylation_histogram:
-                    fig, ax = plt.subplots(figsize=(6, 4))
-                    count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
-                    plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
-                    plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
-                    plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
-                    plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
-                    plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
-                    plt.xlabel('Fraction methylated')
-                    plt.ylabel('Proportion')
-                    title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
-                    plt.title(title, pad=20)
-                    plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
-                    ax.spines['right'].set_visible(False)
-                    ax.spines['top'].set_visible(False)
-                    save_name = output_directory + f'/{readwrite.date_string()} {title}'
-                    if save_methylation_histogram:
-                        plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
-                        plt.close()
-                    else:
-                        plt.show()
+    # for reference in references:
+    #     reference_adata = adata[adata.obs[reference_column] == reference].copy()
+    #     split_reference = reference.split('_')[0][1:]
+    #     for sample in sample_names:
+    #         sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
+    #         for site_type in site_types_to_analyze:
+    #             methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
+    #             max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
+    #             if not np.isnan(max_meth):
+    #                 n_bins = int(max_meth // 2)
+    #             else:
+    #                 n_bins = 1
+    #             mean = np.mean(methylation_data)
+    #             median = np.median(methylation_data)
+    #             stdev = np.std(methylation_data)
+    #             adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
+    #             if show_methylation_histogram or save_methylation_histogram:
+    #                 fig, ax = plt.subplots(figsize=(6, 4))
+    #                 count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
+    #                 plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+    #                 plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
+    #                 plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
+    #                 plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
+    #                 plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
+    #                 plt.xlabel('Fraction methylated')
+    #                 plt.ylabel('Proportion')
+    #                 title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
+    #                 plt.title(title, pad=20)
+    #                 plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
+    #                 ax.spines['right'].set_visible(False)
+    #                 ax.spines['top'].set_visible(False)
+    #                 save_name = output_directory + f'/{readwrite.date_string()} {title}'
+    #                 if save_methylation_histogram:
+    #                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+    #                     plt.close()
+    #                 else:
+    #                     plt.show()
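
The QC call this function keeps is a per-read comparison of mean GpC methylation against mean methylation at other cytosines, which serves as the false-positive background. In miniature (toy values; column names follow the hunk above):

```python
import pandas as pd

obs = pd.DataFrame({
    'GpC_site_row_methylation_means': [0.45, 0.05, 0.30],
    'other_C_row_methylation_means':  [0.02, 0.10, 0.03],
})
# A read passes when its GpC signal exceeds its background methylation
obs['GpC_above_other_C'] = (obs['GpC_site_row_methylation_means']
                            > obs['other_C_row_methylation_means'])
print(obs['GpC_above_other_C'].tolist())  # [True, False, True]
```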

smftools/preprocessing/calculate_coverage.py

@@ -1,41 +1,42 @@
-
-
-def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
+def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.05):
     """
-    Append position
+    Append position-level metadata regarding whether the position is informative within the given observation category.
 
     Parameters:
         adata (AnnData): An AnnData object
         obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
         position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
 
-
-
+    Modifies:
+        - Adds new columns to `adata.var` containing coverage statistics.
     """
     import numpy as np
-    import anndata as ad
     import pandas as pd
-
+    import anndata as ad
+
     categories = adata.obs[obs_column].cat.categories
     n_categories_with_position = np.zeros(adata.shape[1])
+
     # Loop over categories
     for cat in categories:
-
-
-        #
+        print(f'Assessing positional coverage across samples for {cat} reference')
+
+        # Subset to current category
+        cat_mask = adata.obs[obs_column] == cat
+        temp_cat_adata = adata[cat_mask]
+
+        # Compute fraction of valid coverage
         cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
-
-
-        #
+        cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0] # Avoid extra computation
+
+        # Store coverage stats
         adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
-
-
-
-
-
-
-
-
-
-        # Final array with the sum at each position of the number of categories covering that position
-        adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+
+        # Assign whether the position is covered based on threshold
+        adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+
+        # Sum the number of categories covering each position
+        n_categories_with_position += adata.var[f'position_in_{cat}'].values
+
+    # Store final category count
+    adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
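
The rewritten coverage logic is one vectorized pass per category: count non-NaN entries down each column, convert to a fraction of reads, and threshold. A toy sketch (not smftools' API; a stricter 0.5 threshold is used here so one position fails):

```python
import numpy as np

X = np.array([[0.9, np.nan, 0.1],
              [0.2, np.nan, np.nan],
              [0.7, 0.3,    np.nan]])

valid_fraction = np.sum(~np.isnan(X), axis=0) / X.shape[0]  # fraction of covered reads per position
position_valid = valid_fraction >= 0.5                      # position_nan_threshold
print(valid_fraction)  # [1.         0.33333333 0.33333333]
print(position_valid)  # [ True False False]
```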

smftools/preprocessing/calculate_pairwise_differences.py

@@ -0,0 +1,49 @@
+# calculate_pairwise_differences
+
+def calculate_pairwise_differences(arrays):
+    """
+    Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
+
+    Parameters:
+        arrays (str): A list of ndarrays.
+
+    Returns:
+        distance_matrix (ndarray): a 2D array containing the pairwise differences between all arrays.
+    """
+    import numpy as np
+    from tqdm import tqdm
+
+    num_arrays = len(arrays)
+
+    n_rows = 5
+    reshaped_arrays = [array.reshape(n_rows, -1) for array in arrays]
+    N_masks = [array[-1].astype(bool) for array in reshaped_arrays]
+    reshaped_arrays_minus_N = [array[:-1].flatten() for array in reshaped_arrays]
+
+    # Precompute the repeated N masks to avoid repeated computations
+    repeated_N_masks = [np.tile(N_mask, (n_rows - 1)) for N_mask in N_masks]
+
+    # Initialize the distance matrix
+    distance_matrix = np.zeros((num_arrays, num_arrays), dtype=np.float32)
+
+    # Calculate pairwise distances with progress bar
+    for i in tqdm(range(num_arrays), desc="Calculating Pairwise Differences"):
+        array_i = reshaped_arrays_minus_N[i]
+        N_mask_i = repeated_N_masks[i]
+
+        for j in range(i + 1, num_arrays):
+            array_j = reshaped_arrays_minus_N[j]
+            N_mask_j = repeated_N_masks[j]
+
+            # Combined mask to ignore N positions
+            combined_mask = N_mask_i | N_mask_j
+
+            # Calculate the hamming distance directly with boolean operations
+            differences = (array_i != array_j) & ~combined_mask
+            distance = np.sum(differences) / np.sum(~combined_mask)
+
+            # Store the symmetric distances
+            distance_matrix[i, j] = distance
+            distance_matrix[j, i] = distance
+
+    return distance_matrix
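
For intuition about the new helper: each input is a length-5L hstacked one-hot read (A/C/G/T blocks, then the N block), and any position that is N in either read is dropped from both the numerator and the denominator of the Hamming fraction. A self-contained re-derivation of one pairwise comparison (the `one_hot` helper is hypothetical, for this example only; the masking logic mirrors the inner loop above):

```python
import numpy as np

def one_hot(read):  # hypothetical helper: 5 stacked binary rows (A, C, G, T, N), flattened
    return np.hstack([[1 if b == base else 0 for b in read] for base in 'ACGTN'])

def n_masked_hamming(a, b):  # same masking logic as calculate_pairwise_differences
    a5, b5 = a.reshape(5, -1), b.reshape(5, -1)
    mask = np.tile(a5[-1].astype(bool) | b5[-1].astype(bool), 4)  # N in either read
    diffs = (a5[:-1].flatten() != b5[:-1].flatten()) & ~mask
    return diffs.sum() / (~mask).sum()

arrays = [one_hot(r) for r in ['ACG', 'ACT', 'ANG']]
print(n_masked_hamming(arrays[0], arrays[1]))  # 2 flipped bits / 12 unmasked bits ~ 0.167
print(n_masked_hamming(arrays[0], arrays[2]))  # 0.0 -- the N column is masked out entirely
```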

smftools/preprocessing/calculate_position_Youden.py

@@ -1,7 +1,7 @@
 ## calculate_position_Youden
 
 ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
-def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.
+def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
     """
     Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
 
@@ -11,6 +11,8 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
         negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
         J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
         obs_column (str): The category to iterate over.
+        infer_on_perdentile (bool | int): If False, use defined postive and negative control samples. If an int (0 < int < 100) is passed, this uses the top and bottom int percentile of methylated reads based on metric in inference_variable column.
+        inference_variable (str): If infer_on_percentile has an integer value passed, use the AnnData observation column name passed by this string as the metric.
         save (bool): Whether to save the ROC plots.
         output_directory (str): String representing the path to the output directory to output the ROC curves.
 
@@ -27,15 +29,25 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
     categories = adata.obs[obs_column].cat.categories
     # Iterate over each category in the specified obs_column
     for cat in categories:
+        print(f"Calculating position Youden statistics for {cat}")
         # Subset to keep only reads associated with the category
-        cat_subset = adata[adata.obs[obs_column] == cat]
+        cat_subset = adata[adata.obs[obs_column] == cat]
         # Iterate over positive and negative control samples
         for control in control_samples:
             # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
             adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
-
-
-
+            if infer_on_percentile:
+                sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
+                if control == "positive":
+                    threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
+                    control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+                else:
+                    threshold = np.percentile(sorted_column, infer_on_percentile)
+                    control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+            else:
+                # get the current control subset on the given category
+                filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
+                control_subset = cat_subset[filtered_obs.index]
             # Iterate through every position in the control subset
             for position in range(control_subset.shape[1]):
                 # Get the coordinate name associated with that position
@@ -91,8 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
                     probability_thresholding_list[position] = (0.8, np.nan)
     title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
     plt.title(title)
-
-    save_name = output_directory + f'/{date_string} {title}'
+    save_name = output_directory + f'/{title}'
     if save:
         plt.savefig(save_name)
         plt.close()