smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools-0.1.6.dist-info/METADATA +127 -0
- smftools-0.1.6.dist-info/RECORD +4 -0
- smftools/__init__.py +0 -25
- smftools/_settings.py +0 -19
- smftools/_version.py +0 -1
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/__init__.py +0 -9
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +0 -27
- smftools/informatics/__init__.py +0 -12
- smftools/informatics/bam_conversion.py +0 -47
- smftools/informatics/bam_direct.py +0 -49
- smftools/informatics/basecalls_to_adata.py +0 -42
- smftools/informatics/fast5_to_pod5.py +0 -19
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
- smftools/informatics/helpers/__init__.py +0 -42
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
- smftools/informatics/helpers/archived/informatics.py +0 -260
- smftools/informatics/helpers/archived/load_adata.py +0 -516
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
- smftools/informatics/helpers/canoncall.py +0 -23
- smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
- smftools/informatics/helpers/count_aligned_reads.py +0 -39
- smftools/informatics/helpers/extract_base_identities.py +0 -43
- smftools/informatics/helpers/extract_mods.py +0 -51
- smftools/informatics/helpers/find_conversion_sites.py +0 -59
- smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
- smftools/informatics/helpers/get_native_references.py +0 -28
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/make_modbed.py +0 -27
- smftools/informatics/helpers/modQC.py +0 -27
- smftools/informatics/helpers/modcall.py +0 -26
- smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
- smftools/informatics/helpers/one_hot_encode.py +0 -19
- smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
- smftools/informatics/helpers/split_and_index_BAM.py +0 -29
- smftools/informatics/pod5_conversion.py +0 -53
- smftools/informatics/pod5_direct.py +0 -55
- smftools/informatics/pod5_to_adata.py +0 -40
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_pod5.py +0 -48
- smftools/plotting/__init__.py +0 -0
- smftools/preprocessing/__init__.py +0 -29
- smftools/preprocessing/append_C_context.py +0 -46
- smftools/preprocessing/archives/preprocessing.py +0 -614
- smftools/preprocessing/binarize_on_Youden.py +0 -42
- smftools/preprocessing/binary_layers_to_ohe.py +0 -30
- smftools/preprocessing/calculate_complexity.py +0 -71
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
- smftools/preprocessing/calculate_coverage.py +0 -41
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
- smftools/preprocessing/calculate_position_Youden.py +0 -104
- smftools/preprocessing/calculate_read_length_stats.py +0 -32
- smftools/preprocessing/clean_NaN.py +0 -38
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
- smftools/preprocessing/filter_reads_on_length.py +0 -39
- smftools/preprocessing/invert_adata.py +0 -22
- smftools/preprocessing/mark_duplicates.py +0 -119
- smftools/preprocessing/min_non_diagonal.py +0 -25
- smftools/preprocessing/remove_duplicates.py +0 -18
- smftools/readwrite.py +0 -106
- smftools/tools/__init__.py +0 -0
- smftools-0.1.1.dist-info/METADATA +0 -88
- smftools-0.1.1.dist-info/RECORD +0 -64
- {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
- {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,30 +0,0 @@
-## binary_layers_to_ohe
-
-## Conversion SMF Specific
-def binary_layers_to_ohe(adata, layers, stack='hstack'):
-    """
-    Parameters:
-        adata (AnnData): Anndata object.
-        layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix
-        stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
-
-    Returns:
-        ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
-    Input: An adata object and a list of layers containing a binary encoding.
-    """
-    import numpy as np
-    import anndata as ad
-    # Extract the layers
-    layers = [adata.layers[layer_name] for layer_name in layers]
-    n_reads = layers[0].shape[0]
-    ohe_dict = {}
-    for i in range(n_reads):
-        read_ohe = []
-        for layer in layers:
-            read_ohe.append(layer[i])
-        read_name = adata.obs_names[i]
-        if stack == 'hstack':
-            ohe_dict[read_name] = np.hstack(read_ohe)
-        elif stack == 'vstack':
-            ohe_dict[read_name] = np.vstack(read_ohe)
-    return ohe_dict
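For orientation, here is a minimal standalone sketch of the per-read stacking idea this removed module implements, using toy NumPy arrays in place of real adata.layers matrices (the array contents and read names are illustrative, not smftools data):

```python
import numpy as np

# Toy stand-ins for two binary adata.layers matrices (3 reads x 4 positions each).
layer_a = np.array([[0, 1, 0, 1],
                    [1, 1, 0, 0],
                    [0, 0, 0, 1]])
layer_b = np.array([[1, 0, 0, 0],
                    [0, 1, 1, 0],
                    [1, 1, 0, 0]])
read_names = ['read_0', 'read_1', 'read_2']

# hstack concatenates each read's rows end-to-end (one vector of length 8 per read);
# vstack would instead keep the layers as separate rows (a 2 x 4 array per read).
ohe_dict = {name: np.hstack([layer_a[i], layer_b[i]]) for i, name in enumerate(read_names)}
print(ohe_dict['read_0'])  # [0 1 0 1 1 0 0 0]
```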
smftools/preprocessing/calculate_complexity.py
@@ -1,71 +0,0 @@
-## calculate_complexity
-
-def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False, output_directory=''):
-    """
-    A complexity analysis of the library.
-
-    Parameters:
-        adata (AnnData): An adata object with mark_duplicates already run.
-        obs_column (str): String of the obs column to iterate over.
-        sample_col (str): String of the sample column to iterate over.
-        plot (bool): Whether to plot the complexity model.
-        save_plot (bool): Whether to save the complexity model.
-        output_directory (str): String representing the path to the output directory.
-
-    Returns:
-        None
-
-    """
-    import numpy as np
-    import pandas as pd
-    from scipy.optimize import curve_fit
-
-    def lander_waterman(x, C0):
-        return C0 * (1 - np.exp(-x / C0))
-
-    def count_unique_reads(reads, depth):
-        subsample = np.random.choice(reads, depth, replace=False)
-        return len(np.unique(subsample))
-
-    categories = adata.obs[obs_column].cat.categories
-    sample_names = adata.obs[sample_col].cat.categories
-
-    for cat in categories:
-        for sample in sample_names:
-            unique_reads, total_reads = adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'][0:2]
-            reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
-            # Subsampling depths
-            subsampling_depths = [total_reads // (i+1) for i in range(10)]
-            # Arrays to store results
-            subsampled_total_reads = []
-            subsampled_unique_reads = []
-            # Perform subsampling
-            for depth in subsampling_depths:
-                unique_count = count_unique_reads(reads, depth)
-                subsampled_total_reads.append(depth)
-                subsampled_unique_reads.append(unique_count)
-            # Fit the Lander-Waterman model to the data
-            popt, _ = curve_fit(lander_waterman, subsampled_total_reads, subsampled_unique_reads)
-            # Generate data for the complexity curve
-            x_data = np.linspace(0, 5000, 100)
-            y_data = lander_waterman(x_data, *popt)
-            adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
-            if plot:
-                import matplotlib.pyplot as plt
-                # Plot the complexity curve
-                plt.figure(figsize=(6, 4))
-                plt.plot(total_reads, unique_reads, 'o', label='Observed unique reads')
-                plt.plot(x_data, y_data, '-', label=f'Lander-Waterman fit\nEstimated C0 = {popt[0]:.2f}')
-                plt.xlabel('Total number of reads')
-                plt.ylabel('Number of unique reads')
-                title = f'Library Complexity Analysis for {sample} on {cat}'
-                plt.title(title)
-                plt.legend()
-                plt.grid(True)
-                if save_plot:
-                    date_string = date_string()
-                    save_name = output_directory + f'/{date_string}_{title}'
-                    plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
-                    plt.close()
-                else:
-                    plt.show()
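The core of this removed module is a Lander-Waterman fit: from (total reads, unique reads) pairs at several subsampling depths, curve_fit estimates C0, the number of unique molecules in the library. A minimal standalone sketch of that fitting step, on synthetic counts rather than smftools data (the true_C0 value and subsampling depths are invented for illustration):

```python
import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, C0):
    # Expected number of unique molecules observed after x reads
    # drawn from a library containing C0 unique molecules.
    return C0 * (1 - np.exp(-x / C0))

# Synthetic (total_reads, unique_reads) pairs for illustration only.
total = np.array([100, 250, 500, 1000, 2000], dtype=float)
true_C0 = 1500.0
rng = np.random.default_rng(0)
unique = lander_waterman(total, true_C0) + rng.normal(0, 5, size=total.size)

# Fit C0 from the subsampled counts; p0 seeds the optimizer with a rough guess.
popt, _ = curve_fit(lander_waterman, total, unique, p0=[unique.max()])
print(f"Estimated library complexity C0 ~ {popt[0]:.0f}")
```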
smftools/preprocessing/calculate_converted_read_methylation_stats.py
@@ -1,45 +0,0 @@
-## calculate_converted_read_methylation_stats
-
-## Conversion SMF Specific
-# Read methylation QC
-
-def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
-    """
-    Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
-
-    Parameters:
-        adata (AnnData): An AnnData object
-        obs_column (str): observation category of interest
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-
-    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-    categories = adata.obs[obs_column].cat.categories
-    for site_type in site_types:
-        adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
-        adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-        adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
-        adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-    for cat in categories:
-        cat_subset = adata[adata.obs[obs_column] == cat].copy()
-        for site_type in site_types:
-            print(f'Iterating over {cat}_{site_type}')
-            observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
-            number_valid_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
-            row_methylation_sums = np.nansum(observation_matrix, axis=1)
-            number_valid_positions_in_read[number_valid_positions_in_read == 0] = 1
-            fraction_valid_positions_in_range = number_valid_positions_in_read / np.max(number_valid_positions_in_read)
-            row_methylation_means = np.divide(row_methylation_sums, number_valid_positions_in_read)
-            temp_obs_data = pd.DataFrame({f'number_valid_{site_type}_in_read': number_valid_positions_in_read,
-                                          f'fraction_valid_{site_type}_in_range': fraction_valid_positions_in_range,
-                                          f'{site_type}_row_methylation_sums': row_methylation_sums,
-                                          f'{site_type}_row_methylation_means': row_methylation_means}, index=cat_subset.obs.index)
-            adata.obs.update(temp_obs_data)
-    # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
-    pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
-    adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
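The per-read statistics above boil down to counting non-NaN calls and taking NaN-aware sums along each row. A minimal standalone sketch of that step on a toy call matrix (values and shape invented for illustration; the real code works on obsm matrices keyed by reference and site type):

```python
import numpy as np

# Toy per-read methylation-call matrix (3 reads x 5 sites); NaN means no call at that site.
calls = np.array([[1.0,    0.0,    np.nan, 1.0,    0.0],
                  [np.nan, np.nan, 1.0,    1.0,    1.0],
                  [0.0,    0.0,    0.0,    np.nan, np.nan]])

valid_per_read = np.sum(~np.isnan(calls), axis=1)        # sites with a call in each read
methylated_per_read = np.nansum(calls, axis=1)           # methylated calls per read
valid_safe = np.where(valid_per_read == 0, 1, valid_per_read)  # guard against divide-by-zero
mean_methylation = methylated_per_read / valid_safe

print(valid_per_read)     # [4 3 3]
print(mean_methylation)   # [0.5 1.  0. ]
```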
smftools/preprocessing/calculate_coverage.py
@@ -1,41 +0,0 @@
-## calculate_coverage
-
-def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
-    """
-    Append position level metadata regarding whether the position is informative within the given observation category.
-
-    Parameters:
-        adata (AnnData): An AnnData object
-        obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
-        position_nan_threshold (float): A minimal threshold of coverage to call the position as valid.
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-
-    categories = adata.obs[obs_column].cat.categories
-    n_categories_with_position = np.zeros(adata.shape[1])
-    # Loop over categories
-    for cat in categories:
-        # Look at positional information for each reference
-        temp_cat_adata = adata[adata.obs[obs_column] == cat]
-        # Look at read coverage on the given category strand
-        cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
-        cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
-        cat_valid_fraction = cat_valid_coverage / (cat_valid_coverage + cat_invalid_coverage)
-        # Append metadata for category to the anndata object
-        adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
-        # Characterize if the position is in the given category or not
-        conditions = [
-            (adata.var[f'{cat}_valid_fraction'] >= position_nan_threshold),
-            (adata.var[f'{cat}_valid_fraction'] < position_nan_threshold)
-        ]
-        choices = [True, False]
-        adata.var[f'position_in_{cat}'] = np.select(conditions, choices, default=False)
-        n_categories_with_position += np.array(adata.var[f'position_in_{cat}'])
-
-    # Final array with the sum at each position of the number of categories covering that position
-    adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
smftools/preprocessing/calculate_pairwise_hamming_distances.py
@@ -1,27 +0,0 @@
-## calculate_pairwise_hamming_distances
-
-## Conversion SMF Specific
-def calculate_pairwise_hamming_distances(arrays):
-    """
-    Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
-
-    Parameters:
-        arrays (list): A list of ndarrays.
-
-    Returns:
-        distance_matrix (ndarray): a 2D array containing the pairwise Hamming distances between all arrays.
-
-    """
-    import numpy as np
-    import tqdm
-    from scipy.spatial.distance import hamming
-    num_arrays = len(arrays)
-    # Initialize an empty distance matrix
-    distance_matrix = np.zeros((num_arrays, num_arrays))
-    # Calculate pairwise distances with progress bar
-    for i in tqdm(range(num_arrays), desc="Calculating Hamming Distances"):
-        for j in range(i + 1, num_arrays):
-            distance = hamming(arrays[i], arrays[j])
-            distance_matrix[i, j] = distance
-            distance_matrix[j, i] = distance
-    return distance_matrix
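A minimal standalone sketch of the same symmetric distance-matrix construction, on three toy one-hot vectors (the progress bar is omitted; note that scipy's hamming() returns the fraction of positions that differ, not the raw count):

```python
import numpy as np
from scipy.spatial.distance import hamming

# Three toy one-hot-encoded reads (e.g. the hstacked vectors from binary_layers_to_ohe).
reads = [np.array([0, 1, 0, 1, 1, 0]),
         np.array([0, 1, 1, 1, 1, 0]),
         np.array([1, 0, 0, 0, 1, 1])]

n = len(reads)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        # Fill both halves of the symmetric matrix with the fractional Hamming distance.
        d = hamming(reads[i], reads[j])
        distance_matrix[i, j] = distance_matrix[j, i] = d

print(distance_matrix)
# reads 0 and 1 differ at 1 of 6 positions (0.167); reads 0 and 2 differ at 4 of 6 (0.667)
```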
smftools/preprocessing/calculate_position_Youden.py
@@ -1,104 +0,0 @@
-## calculate_position_Youden
-
-## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
-def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
-    """
-    Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
-
-    Parameters:
-        adata (AnnData): An AnnData object.
-        positive_control_sample (str): string representing the sample name corresponding to the Plus MTase control sample.
-        negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
-        J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
-        obs_column (str): The category to iterate over.
-        save (bool): Whether to save the ROC plots.
-        output_directory (str): String representing the path to the output directory to output the ROC curves.
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import pandas as pd
-    import anndata as ad
-    import matplotlib.pyplot as plt
-    from sklearn.metrics import roc_curve, roc_auc_score
-
-    control_samples = [positive_control_sample, negative_control_sample]
-    categories = adata.obs[obs_column].cat.categories
-    # Iterate over each category in the specified obs_column
-    for cat in categories:
-        # Subset to keep only reads associated with the category
-        cat_subset = adata[adata.obs[obs_column] == cat].copy()
-        # Iterate over positive and negative control samples
-        for control in control_samples:
-            # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
-            adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
-            # get the current control subset on the given category
-            filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
-            control_subset = cat_subset[filtered_obs.index].copy()
-            # Iterate through every position in the control subset
-            for position in range(control_subset.shape[1]):
-                # Get the coordinate name associated with that position
-                coordinate = control_subset.var_names[position]
-                # Get the array of methylation probabilities for each read in the subset at that position
-                position_data = control_subset.X[:, position]
-                # Get the indexes of everywhere that is not a nan value
-                nan_mask = ~np.isnan(position_data)
-                # Keep only the methylation data that has real values
-                position_data = position_data[nan_mask]
-                # Get the position data coverage
-                position_coverage = len(position_data)
-                # Get fraction coverage
-                fraction_coverage = position_coverage / control_subset.shape[0]
-                # Save the position and the position methylation data for the control subset
-                adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
-
-    for cat in categories:
-        fig, ax = plt.subplots(figsize=(6, 4))
-        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
-        plt.xlabel('False Positive Rate')
-        plt.ylabel('True Positive Rate')
-        ax.spines['right'].set_visible(False)
-        ax.spines['top'].set_visible(False)
-        n_passed_positions = 0
-        n_total_positions = 0
-        # Initialize a list that will hold the positional thresholds for the category
-        probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
-        for i, key in enumerate(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'].keys()):
-            position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
-            positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
-            fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
-            if fraction_coverage > 0.2:
-                try:
-                    negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
-                    # Combine the negative and positive control data
-                    data = np.concatenate([negative_position_array, positive_position_array])
-                    labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
-                    # Calculate the ROC curve
-                    fpr, tpr, thresholds = roc_curve(labels, data)
-                    # Calculate Youden's J statistic
-                    J = tpr - fpr
-                    optimal_idx = np.argmax(J)
-                    optimal_threshold = thresholds[optimal_idx]
-                    max_J = np.max(J)
-                    data_tuple = (optimal_threshold, max_J)
-                    probability_thresholding_list[position] = data_tuple
-                    n_total_positions += 1
-                    if max_J > J_threshold:
-                        n_passed_positions += 1
-                        plt.plot(fpr, tpr, label='ROC curve')
-                except:
-                    probability_thresholding_list[position] = (0.8, np.nan)
-        title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
-        plt.title(title)
-        date_string = date_string()
-        save_name = output_directory + f'/{date_string} {title}'
-        if save:
-            plt.savefig(save_name)
-            plt.close()
-        else:
-            plt.show()
-
-        adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
-        J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
-        adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
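The per-position decision above is a standard Youden's J calculation: build an ROC curve from the +MTase (positive) and -MTase (negative) control probabilities at one position, then take the threshold maximizing J = TPR - FPR. A minimal standalone sketch with synthetic control scores (the beta-distributed values are invented for illustration):

```python
import numpy as np
from sklearn.metrics import roc_curve

# Synthetic per-read methylation probabilities at one position:
# -MTase control scores cluster low, +MTase control scores cluster high.
rng = np.random.default_rng(1)
neg = rng.beta(2, 8, size=200)
pos = rng.beta(8, 2, size=200)

scores = np.concatenate([neg, pos])
labels = np.array([0] * len(neg) + [1] * len(pos))

fpr, tpr, thresholds = roc_curve(labels, scores)
J = tpr - fpr                      # Youden's J at every candidate threshold
best = np.argmax(J)
print(f"max J = {J[best]:.2f} at threshold {thresholds[best]:.2f}")
# A position passes QC when its max J exceeds J_threshold (0.4 by default above).
```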
smftools/preprocessing/calculate_read_length_stats.py
@@ -1,32 +0,0 @@
-## calculate_read_length_stats
-
-# Read length QC
-def calculate_read_length_stats(adata):
-    """
-    Append first valid position in a read and last valid position in the read. From this determine and append the read length.
-
-    Parameters:
-        adata (AnnData): An adata object
-
-    Returns:
-        upper_bound (int): last valid position in the dataset
-        lower_bound (int): first valid position in the dataset
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-    ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variables which hold the first and last valid positions in the entire dataset
-
-    # Add some basic observation-level (read-level) metadata to the anndata object
-    read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
-    read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
-    read_length = read_last_valid_position - read_first_valid_position + np.ones(len(read_first_valid_position))
-
-    adata.obs['first_valid_position'] = pd.Series(read_first_valid_position, index=adata.obs.index, dtype=int)
-    adata.obs['last_valid_position'] = pd.Series(read_last_valid_position, index=adata.obs.index, dtype=int)
-    adata.obs['read_length'] = pd.Series(read_length, index=adata.obs.index, dtype=int)
-
-    # Define variables to hold the first and last valid position in the dataset
-    upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
-    lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
-    return upper_bound, lower_bound
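The first/last valid positions come from an argmax-over-boolean trick: np.argmax on ~np.isnan returns the first True per row, and applying it to the column-reversed matrix gives the last. A minimal standalone sketch on a toy matrix (values invented for illustration):

```python
import numpy as np

# Toy matrix: NaN outside the aligned portion of each read (3 reads x 6 positions).
X = np.array([[np.nan, np.nan, 1.0, 0.0,    1.0,    np.nan],
              [0.0,    1.0,    1.0, np.nan, np.nan, np.nan],
              [np.nan, 1.0,    0.0, 0.0,    1.0,    1.0]])

valid = ~np.isnan(X)
# argmax returns the index of the first True per row -> first valid position.
first_valid = np.argmax(valid, axis=1)
# Reverse the columns to find the first valid position from the right,
# then convert back to an index in the original orientation.
last_valid = X.shape[1] - 1 - np.argmax(valid[:, ::-1], axis=1)
read_length = last_valid - first_valid + 1

print(first_valid)   # [2 0 1]
print(last_valid)    # [4 2 5]
print(read_length)   # [3 3 5]
```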
smftools/preprocessing/clean_NaN.py
@@ -1,38 +0,0 @@
-## clean_NaN
-from ..readwrite import adata_to_df
-
-# NaN handling
-def clean_NaN(adata, layer=None):
-    """
-    Append layers to adata that contain NaN cleaning strategies.
-
-    Parameters:
-        adata (AnnData): an adata object
-        layer (str): string representing the layer to fill NaN values in
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-    # Fill NaN with closest SMF value
-    df = adata_to_df(adata, layer=layer)
-    df = df.ffill(axis=1).bfill(axis=1)
-    adata.layers['fill_nans_closest'] = df.values
-
-    # Replace NaN values with 0, and 0 with minus 1
-    old_value, new_value = [0, -1]
-    df = adata_to_df(adata, layer=layer)
-    df = df.replace(old_value, new_value)
-    old_value, new_value = [np.nan, 0]
-    df = df.replace(old_value, new_value)
-    adata.layers['nan0_0minus1'] = df.values
-
-    # Replace NaN values with 1, and 1 with 2
-    old_value, new_value = [1, 2]
-    df = adata_to_df(adata, layer=layer)
-    df = df.replace(old_value, new_value)
-    old_value, new_value = [np.nan, 1]
-    df = df.replace(old_value, new_value)
-    adata.layers['nan1_12'] = df.values
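A minimal standalone sketch of two of the NaN-handling strategies above, applied to a toy pandas DataFrame instead of an adata layer (read names, positions, and values are invented for illustration):

```python
import numpy as np
import pandas as pd

# Toy read-by-position matrix with gaps (NaN = no call).
df = pd.DataFrame([[np.nan, 1.0,    np.nan, 0.0],
                   [0.0,    np.nan, 1.0,    np.nan]],
                  index=['read_0', 'read_1'], columns=['10', '11', '12', '13'])

# Strategy 1: fill each gap with the nearest call along the read (forward, then backward).
filled = df.ffill(axis=1).bfill(axis=1)

# Strategy 2: recode so that unmethylated (0) becomes -1 and NaN becomes 0,
# giving the three states -1 / 0 / 1 used by the 'nan0_0minus1' layer above.
recode = df.replace(0, -1).fillna(0)

print(filled.values)
print(recode.values)
```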
smftools/preprocessing/filter_converted_reads_on_methylation.py
@@ -1,27 +0,0 @@
-## filter_converted_reads_on_methylation
-
-## Conversion SMF Specific
-# Read methylation QC
-def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025):
-    """
-    Filter adata object using minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read.
-
-    Parameters:
-        adata (AnnData): An adata object.
-        valid_SMF_site_threshold (float): A minimum proportion of valid SMF sites that must be present in the read. Default is 0.8
-        min_SMF_threshold (float): A minimum read methylation level. Default is 0.025
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-
-    if valid_SMF_site_threshold:
-        # Keep reads that have over a given valid GpC site content
-        adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
-    if min_SMF_threshold:
-        # Keep reads with SMF methylation over background methylation.
-        adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
-        # Keep reads over a defined methylation threshold
-        adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
smftools/preprocessing/filter_reads_on_length.py
@@ -1,39 +0,0 @@
-## filter_reads_on_length
-
-def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700):
-    """
-    Filters the adata object to keep a defined coordinate window, as well as reads that are over a minimum threshold in length.
-
-    Parameters:
-        adata (AnnData): An adata object.
-        filter_on_coordinates (bool | list): If False, skips filtering. Otherwise, provide a list containing integers representing the lower and upper bound coordinates to filter on. Default is False.
-        min_read_length (int): The minimum read length to keep in the filtered dataset. Default is 2700.
-
-    Returns:
-        None
-    Input: Adata object. a list of lower and upper bound (set to False or None if not wanted), and a minimum read length integer.
-
-    """
-    import numpy as np
-    import anndata as ad
-    import pandas as pd
-    if filter_on_coordinates:
-        lower_bound, upper_bound = filter_on_coordinates
-        # Extract the position information from the adata object as an np array
-        var_names_arr = adata.var_names.astype(int).to_numpy()
-        # Find the upper bound coordinate that is closest to the specified value
-        closest_end_index = np.argmin(np.abs(var_names_arr - upper_bound))
-        upper_bound = int(adata.var_names[closest_end_index])
-        # Find the lower bound coordinate that is closest to the specified value
-        closest_start_index = np.argmin(np.abs(var_names_arr - lower_bound))
-        lower_bound = int(adata.var_names[closest_start_index])
-        # Get a list of positional indexes that encompass the lower and upper bounds of the dataset
-        position_list = list(range(lower_bound, upper_bound + 1))
-        position_list = [str(pos) for pos in position_list]
-        position_set = set(position_list)
-        print(f'Subsetting adata to keep data between coordinates {lower_bound} and {upper_bound}')
-        adata = adata[:, adata.var_names.isin(position_set)].copy()
-
-    if min_read_length:
-        print(f'Subsetting adata to keep reads longer than {min_read_length}')
-        adata = adata[adata.obs['read_length'] > min_read_length].copy()
smftools/preprocessing/invert_adata.py
@@ -1,22 +0,0 @@
-## invert_adata
-
-# Optional inversion of the adata
-def invert_adata(adata):
-    """
-    Inverts the adata object along the variable axis
-
-    Parameters:
-        adata (AnnData): An adata object.
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    # Reassign var_names with new names
-    old_var_names = adata.var_names.astype(int).to_numpy()
-    new_var_names = np.sort(old_var_names)[::-1].astype(str)
-    adata.var['Original_positional_coordinate'] = old_var_names.astype(str)
-    adata.var_names = new_var_names
-    # Sort the AnnData object based on the old var_names
-    adata = adata[:, old_var_names.astype(str)]
smftools/preprocessing/mark_duplicates.py
@@ -1,119 +0,0 @@
-## mark_duplicates
-
-def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
-    """
-    Marks duplicates in the adata object.
-
-    Parameters:
-        adata (AnnData): An adata object.
-        layers (list): A list of strings representing the layers to use.
-        obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
-        sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
-
-    Returns:
-        None
-    """
-
-    import numpy as np
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    from scipy.signal import find_peaks
-    import networkx as nx
-    from .binary_layers_to_ohe import binary_layers_to_ohe
-    from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
-    from .min_non_diagonal import min_non_diagonal
-
-    categories = adata.obs[obs_column].cat.categories
-    sample_names = adata.obs[sample_col].cat.categories
-
-    # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
-    adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-    for cat in categories:
-        cat_subset = adata[adata.obs[obs_column] == cat].copy()
-        for sample in sample_names:
-            sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
-            # Encode sequencing reads as a one-hot-encodings
-            adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
-            # Unpack the read names and one hot encodings into lists
-            read_names = []
-            ohe_list = []
-            for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
-                read_names.append(read_name)
-                ohe_list.append(ohe)
-            # Calculate the pairwise hamming distances
-            print(f'Calculating hamming distances for {sample} on {cat} allele')
-            distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
-            n_reads = distance_matrix.shape[0]
-            # Load the hamming matrix into a dataframe with index and column names as the read_names
-            distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
-            # Save the distance dataframe into an unstructured component of the adata object
-            adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
-            # Calculate the minimum non-self distance for every read in the reference and sample
-            min_distance_values = min_non_diagonal(distance_matrix)
-            min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
-            adata.obs.update(min_distance_df)
-            # Generate a histogram of minimum non-self distances for each read
-            min_distance_bins = plt.hist(min_distance_values, bins=n_reads//4)
-            # Normalize the max value in any histogram bin to 1
-            normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
-            # Extract the bin index of peak centers in the histogram
-            peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
-            first_peak_index = peak_centers[0]
-            offset_index = first_peak_index-1
-            # Use the distance corresponding to the first peak as the threshold distance in graph construction
-            first_peak_distance = min_distance_bins[1][first_peak_index]
-            offset_distance = min_distance_bins[1][offset_index]
-            adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
-
-    ## Detect likely duplicate reads and mark them in the adata object.
-    adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-    adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-    adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
-
-    for cat in categories:
-        for sample in sample_names:
-            distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
-            read_names = distance_df.index
-            distance_matrix = distance_df.values
-            n_reads = distance_matrix.shape[0]
-            distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
-            # Initialize the read distance graph
-            G = nx.Graph()
-            # Add each read as a node to the graph
-            G.add_nodes_from(range(n_reads))
-            # Add edges based on the threshold
-            for i in range(n_reads):
-                for j in range(i + 1, n_reads):
-                    if distance_matrix[i, j] <= distance_threshold:
-                        G.add_edge(i, j)
-            # Determine distinct clusters using connected components
-            clusters = list(nx.connected_components(G))
-            clusters = [list(cluster) for cluster in clusters]
-            # Get the number of clusters
-            cluster_count = len(clusters)
-            adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, cluster_count / n_reads, clusters]
-            # Update the adata object
-            read_cluster_map = {}
-            read_duplicate_map = {}
-            read_keep_map = {}
-            for i, cluster in enumerate(clusters):
-                for j, read_index in enumerate(cluster):
-                    read_name = read_names[read_index]
-                    read_cluster_map[read_name] = i
-                    if len(cluster) > 1:
-                        read_duplicate_map[read_name] = True
-                        if j == 0:
-                            read_keep_map[read_name] = True
-                        else:
-                            read_keep_map[read_name] = False
-                    elif len(cluster) == 1:
-                        read_duplicate_map[read_name] = False
-                        read_keep_map[read_name] = True
-            cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
-            duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
-            keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
-            df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
-            adata.obs.update(df_combined)
-            adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
-            adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
-            print(f'Hamming clusters for {sample} on {cat}\nThreshold: {first_peak_distance}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {cluster_count / n_reads}')
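The duplicate-detection step above builds a graph whose nodes are reads and whose edges connect reads closer than the derived Hamming threshold; connected components then define duplicate clusters. A minimal standalone sketch of that graph construction with networkx, using a toy distance matrix and a hand-picked threshold rather than the histogram-derived one:

```python
import numpy as np
import networkx as nx

# Toy pairwise Hamming distance matrix for 4 reads; reads 0 and 1 are near-identical.
D = np.array([[0.00, 0.02, 0.40, 0.45],
              [0.02, 0.00, 0.42, 0.44],
              [0.40, 0.42, 0.00, 0.38],
              [0.45, 0.44, 0.38, 0.00]])
threshold = 0.05  # illustrative; mark_duplicates derives this from the distance histogram

G = nx.Graph()
G.add_nodes_from(range(D.shape[0]))
for i in range(D.shape[0]):
    for j in range(i + 1, D.shape[0]):
        if D[i, j] <= threshold:
            G.add_edge(i, j)  # connect reads close enough to be duplicates

clusters = [sorted(c) for c in nx.connected_components(G)]
print(clusters)  # [[0, 1], [2], [3]] -> 3 unique molecules out of 4 reads
```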
smftools/preprocessing/min_non_diagonal.py
@@ -1,25 +0,0 @@
-## min_non_diagonal
-
-def min_non_diagonal(matrix):
-    """
-    Takes a matrix and returns the smallest value from each row with the diagonal masked.
-
-    Parameters:
-        matrix (ndarray): A 2D ndarray.
-
-    Returns:
-        min_values (list): A list of minimum values from each row of the matrix
-    """
-    import numpy as np
-
-    n = matrix.shape[0]
-    min_values = []
-    for i in range(n):
-        # Mask to exclude the diagonal element
-        row_mask = np.ones(n, dtype=bool)
-        row_mask[i] = False
-        # Extract the row excluding the diagonal element
-        row = matrix[i, row_mask]
-        # Find the minimum value in the row
-        min_values.append(np.min(row))
-    return min_values
smftools/preprocessing/remove_duplicates.py
@@ -1,18 +0,0 @@
-# remove_duplicates
-
-def remove_duplicates(adata):
-    """
-    Remove duplicates from the adata object
-
-    Parameters:
-        adata (Anndata): An adata object.
-
-    Returns:
-        None
-    """
-    import anndata as ad
-
-    initial_size = adata.shape[0]
-    adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
-    final_size = adata.shape[0]
-    print(f'Removed {initial_size-final_size} reads from the dataset')