smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +0 -2
- smftools/_settings.py +1 -1
- smftools/_version.py +1 -0
- smftools/datasets/datasets.py +11 -9
- smftools/informatics/__init__.py +8 -7
- smftools/informatics/bam_conversion.py +47 -0
- smftools/informatics/bam_direct.py +49 -0
- smftools/informatics/basecalls_to_adata.py +42 -0
- smftools/informatics/fast5_to_pod5.py +19 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
- smftools/informatics/helpers/__init__.py +4 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +52 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +10 -3
- smftools/informatics/helpers/canoncall.py +12 -1
- smftools/informatics/helpers/converted_BAM_to_adata.py +30 -13
- smftools/informatics/helpers/count_aligned_reads.py +12 -5
- smftools/informatics/helpers/extract_base_identities.py +13 -6
- smftools/informatics/helpers/extract_mods.py +17 -5
- smftools/informatics/helpers/find_conversion_sites.py +15 -9
- smftools/informatics/helpers/generate_converted_FASTA.py +49 -29
- smftools/informatics/helpers/get_native_references.py +10 -7
- smftools/informatics/helpers/make_dirs.py +9 -3
- smftools/informatics/helpers/make_modbed.py +10 -4
- smftools/informatics/helpers/modQC.py +10 -2
- smftools/informatics/helpers/modcall.py +13 -1
- smftools/informatics/helpers/modkit_extract_to_adata.py +25 -13
- smftools/informatics/helpers/one_hot_encode.py +8 -3
- smftools/informatics/helpers/separate_bam_by_bc.py +18 -5
- smftools/informatics/helpers/split_and_index_BAM.py +18 -10
- smftools/informatics/pod5_conversion.py +34 -7
- smftools/informatics/pod5_direct.py +31 -5
- smftools/informatics/pod5_to_adata.py +31 -8
- smftools/informatics/readwrite.py +13 -16
- smftools/informatics/subsample_pod5.py +48 -0
- smftools/preprocessing/__init__.py +0 -6
- smftools/preprocessing/append_C_context.py +15 -8
- smftools/preprocessing/binarize_on_Youden.py +8 -4
- smftools/preprocessing/binary_layers_to_ohe.py +9 -4
- smftools/preprocessing/calculate_complexity.py +26 -14
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +12 -5
- smftools/preprocessing/calculate_coverage.py +13 -7
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
- smftools/preprocessing/calculate_position_Youden.py +21 -12
- smftools/preprocessing/calculate_read_length_stats.py +11 -6
- smftools/preprocessing/clean_NaN.py +12 -5
- smftools/preprocessing/filter_converted_reads_on_methylation.py +12 -5
- smftools/preprocessing/filter_reads_on_length.py +13 -5
- smftools/preprocessing/invert_adata.py +9 -5
- smftools/preprocessing/mark_duplicates.py +20 -11
- smftools/preprocessing/min_non_diagonal.py +9 -4
- smftools/preprocessing/remove_duplicates.py +9 -3
- smftools/readwrite.py +13 -16
- smftools-0.1.1.dist-info/METADATA +88 -0
- smftools-0.1.1.dist-info/RECORD +64 -0
- smftools/informatics/helpers/align_BAM.py +0 -49
- smftools/informatics/helpers/load_experiment_config.py +0 -17
- smftools-0.1.0.dist-info/METADATA +0 -75
- smftools-0.1.0.dist-info/RECORD +0 -58
- /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
- /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
- /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
- {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py (the same +13/-16 diff applies to smftools/informatics/readwrite.py)

@@ -1,27 +1,12 @@
 ## readwrite ##
 
-# Basic I/O
-import os
-# Datetime
-from datetime import datetime
-# Data structures and basic operations
-import math
-import numpy as np
-import pandas as pd
-import anndata as ad
-import scipy.sparse as sp
-
-# Runtime warnings
-import warnings
-warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
 ######################################################################################################
 ## Datetime functionality
 def date_string():
     """
     Each time this is called, it returns the current date string
     """
+    from datetime import datetime
     current_date = datetime.now()
     date_string = current_date.strftime("%Y%m%d")
     date_string = date_string[2:]
@@ -31,6 +16,7 @@ def time_string():
     """
     Each time this is called, it returns the current time string
     """
+    from datetime import datetime
     current_time = datetime.now()
     return current_time.strftime("%H:%M:%S")
 ######################################################################################################
@@ -42,6 +28,9 @@ def adata_to_df(adata, layer=None):
     Input: An adata object with a specified layer.
     Output: A dataframe for the specific layer.
     """
+    import pandas as pd
+    import anndata as ad
+
     # Extract the data matrix from the given layer
     if layer:
         data_matrix = adata.layers[layer]
@@ -60,6 +49,7 @@ def save_matrix(matrix, save_name):
     Input: A numpy matrix and a save_name
     Output: A txt file representation of the data matrix
     """
+    import numpy as np
     np.savetxt(f'{save_name}.txt', matrix)
 
 def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
@@ -67,6 +57,13 @@ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
     Input: an output file path relative to the directory in which the function is called
     """
+    import os
+    import anndata as ad
+    # Runtime warnings
+    import warnings
+    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
+    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
+
     # List all files in the directory
     files = os.listdir(os.getcwd())
     # get current working directory
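The pattern above recurs throughout 0.1.1: module-level imports (and the anndata warnings filters) move into the bodies of the functions that use them, so importing the package no longer pays for every heavy dependency up front. A minimal sketch of the deferred-import style, using a hypothetical function rather than package code:

# Hypothetical illustration of the deferred-import style adopted in 0.1.1.
def summarize(matrix_path):
    """Load a whitespace-delimited matrix and return its column means."""
    import numpy as np  # resolved on first call; cached in sys.modules afterwards

    matrix = np.loadtxt(matrix_path)
    return matrix.mean(axis=0)

Repeat calls only pay a dictionary lookup in sys.modules, while optional dependencies are required only when the functions that need them actually run.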
smftools/informatics/subsample_pod5.py (new file)

@@ -0,0 +1,48 @@
+# subsample_pod5
+
+def subsample_pod5(pod5_path, read_name_path, output_directory):
+    """
+    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
+    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.
+
+    Parameters:
+        pod5_path (str): File path to the POD5 to subsample.
+        read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
+        output_directory (str): A file path to the directory to output the file.
+
+    Returns:
+        None
+    """
+    import pod5 as p5
+    import os
+
+    input_pod5_base = os.path.basename(pod5_path)
+
+    if type(read_name_path) == str:
+        input_read_name_base = os.path.basename(read_name_path)
+        output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
+        # extract read names into a list of strings
+        with open(read_name_path, 'r') as file:
+            read_names = [line.strip() for line in file]
+        with p5.Reader(pod5_path) as reader:
+            read_records = []
+            for read_record in reader.reads(selection=read_names):
+                read_records.append(read_record.to_read())
+
+    elif type(read_name_path) == int:
+        import random
+        output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
+        with p5.Reader(pod5_path) as reader:
+            all_read_records = []
+            for read_record in reader.reads():
+                all_read_records.append(read_record.to_read())
+        if read_name_path <= len(all_read_records):
+            read_records = random.sample(all_read_records, read_name_path)
+        else:
+            print('Trying to sample more reads than are contained in the input pod5, please try a lower value.')
+
+    output_pod5 = os.path.join(output_directory, output_base)
+
+    # Write the subsampled POD5
+    with p5.Writer(output_pod5) as writer:
+        writer.add_reads(read_records)
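A hedged usage sketch of the new helper. The file names, output directory, and the `from smftools.informatics import subsample_pod5` import path (the subpackage `__init__.py` also changed in this release) are assumptions:

# Assumed import path and file names, for illustration only.
from smftools.informatics import subsample_pod5

# Keep only the reads named in roi_read_names.txt (one read name per line):
subsample_pod5('reads.pod5', 'roi_read_names.txt', './subsampled')

# Or draw 1000 random reads from the input POD5:
subsample_pod5('reads.pod5', 1000, './subsampled')

One sharp edge visible in the diff: if the requested integer exceeds the number of reads in the POD5, the function only prints a warning, so read_records is never assigned and the final write raises a NameError.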
smftools/preprocessing/__init__.py

@@ -1,10 +1,8 @@
 from .append_C_context import append_C_context
 from .binarize_on_Youden import binarize_on_Youden
-from .binary_layers_to_ohe import binary_layers_to_ohe
 from .calculate_complexity import calculate_complexity
 from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
 from .calculate_coverage import calculate_coverage
-from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
 from .calculate_position_Youden import calculate_position_Youden
 from .calculate_read_length_stats import calculate_read_length_stats
 from .clean_NaN import clean_NaN
@@ -12,17 +10,14 @@ from .filter_converted_reads_on_methylation import filter_converted_reads_on_met
 from .filter_reads_on_length import filter_reads_on_length
 from .invert_adata import invert_adata
 from .mark_duplicates import mark_duplicates
-from .min_non_diagonal import min_non_diagonal
 from .remove_duplicates import remove_duplicates
 
 __all__ = [
     "append_C_context",
     "binarize_on_Youden",
-    "binary_layers_to_ohe",
     "calculate_complexity",
     "calculate_converted_read_methylation_stats",
     "calculate_coverage",
-    "calculate_pairwise_hamming_distances",
     "calculate_position_Youden",
     "calculate_read_length_stats",
     "clean_NaN",
@@ -30,6 +25,5 @@ __all__ = [
     "filter_reads_on_length",
     "invert_adata",
     "mark_duplicates",
-    "min_non_diagonal",
     "remove_duplicates"
 ]
smftools/preprocessing/append_C_context.py

@@ -1,22 +1,29 @@
 ## append_C_context
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 def append_C_context(adata, obs_column='Reference', use_consensus=False):
     """
+    Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
+
+    Parameters:
+        adata (AnnData): The input adata object.
+        obs_column (str): The observation column in which to stratify on. Default is 'Reference', which should not be changed for most purposes.
+        use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
     Input: An adata object, the obs_column of interst, and whether to use the consensus sequence from the category.
-
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
     categories = adata.obs[obs_column].cat.categories
-    if use_consensus:
-        sequence = adata.uns[f'{cat}_consensus_sequence']
-    else:
-        sequence = adata.uns[f'{cat}_FASTA_sequence']
     for cat in categories:
+        if use_consensus:
+            sequence = adata.uns[f'{cat}_consensus_sequence']
+        else:
+            sequence = adata.uns[f'{cat}_FASTA_sequence']
         boolean_dict = {}
         for site_type in site_types:
             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
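For readers unfamiliar with the site types listed above, a toy top-strand classifier (an illustration, not the package's implementation) shows the usual convention: a cytosine preceded by G is a GpC site, one followed by G is a CpG site, a cytosine in a GCG context is ambiguous between the two, and everything else is other_C:

import numpy as np

def classify_cytosine_context(sequence):
    """Toy classifier mirroring the site_types used above (top strand only)."""
    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
    calls = {s: np.full(len(sequence), False, dtype=bool) for s in site_types}
    for i, base in enumerate(sequence):
        if base != 'C':
            continue
        gpc = i > 0 and sequence[i - 1] == 'G'
        cpg = i + 1 < len(sequence) and sequence[i + 1] == 'G'
        if gpc and cpg:  # GCG: ambiguous between GpC and CpG methylation
            calls['ambiguous_GpC_site'][i] = True
            calls['ambiguous_CpG_site'][i] = True
        elif gpc:
            calls['GpC_site'][i] = True
        elif cpg:
            calls['CpG_site'][i] = True
        else:
            calls['other_C'][i] = True
    return calls

print(classify_cytosine_context('GCGACGTC'))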
smftools/preprocessing/binarize_on_Youden.py

@@ -1,13 +1,17 @@
 ## binarize_on_Youden
-import numpy as np
-import pandas as pd
-import anndata as ad
 
 def binarize_on_Youden(adata, obs_column='Reference'):
     """
+    Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
+
+    Parameters:
+        adata (AnnData): The anndata object to binarize. pp.calculate_position_Youden function has to be run first.
+        obs_column (str): The obs_column to stratify on. Needs to be the same as passed in pp.calculate_position_Youden.
     Input: adata object that has had calculate_position_Youden called on it.
-    Output:
+    Output: 
     """
+    import numpy as np
+    import anndata as ad
     temp_adata = None
     categories = adata.obs[obs_column].cat.categories
     for cat in categories:
smftools/preprocessing/binary_layers_to_ohe.py

@@ -1,14 +1,19 @@
 ## binary_layers_to_ohe
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 def binary_layers_to_ohe(adata, layers, stack='hstack'):
     """
+    Parameters:
+        adata (AnnData): Anndata object.
+        layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix
+        stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
+
+    Returns:
+        ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     Input: An adata object and a list of layers containing a binary encoding.
-    Output: A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     """
+    import numpy as np
+    import anndata as ad
     # Extract the layers
     layers = [adata.layers[layer_name] for layer_name in layers]
     n_reads = layers[0].shape[0]
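A minimal sketch of what an h-stacked one-hot encoding of two binary layers looks like for a single read; the helper name and the NaN-stays-all-zero convention are illustrative assumptions, not the package's code:

import numpy as np

def one_hot_stack(binary_rows, stack='hstack'):
    """Toy version: one-hot encode several binary vectors for a single read
    and stack them side by side (hstack) or on top of each other (vstack)."""
    encodings = []
    for row in binary_rows:
        ohe = np.zeros((2, row.size))
        ohe[0, row == 0] = 1  # first row marks unmethylated (0) positions
        ohe[1, row == 1] = 1  # second row marks methylated (1) positions; NaN columns stay all-zero
        encodings.append(ohe)
    return np.hstack(encodings) if stack == 'hstack' else np.vstack(encodings)

read_layers = [np.array([0, 1, np.nan, 1]), np.array([1, 0, 0, 1])]
print(one_hot_stack(read_layers).shape)  # (2, 8) for hstack; (4, 4) for vstack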
smftools/preprocessing/calculate_complexity.py

@@ -1,21 +1,32 @@
 ## calculate_complexity
-import numpy as np
-import pandas as pd
-from scipy.optimize import curve_fit
-import matplotlib.pyplot as plt
 
-def
-
+def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False, output_directory=''):
+    """
+    A complexity analysis of the library.
 
-
-
-
+    Parameters:
+        adata (AnnData): An adata object with mark_duplicates already run.
+        obs_column (str): String of the obs column to iterate over.
+        sample_col (str): String of the sample column to iterate over.
+        plot (bool): Whether to plot the complexity model.
+        save_plot (bool): Whether to save the complexity model.
+        output_directory (str): String representing the path to the output directory.
+
+    Returns:
+        None
 
-def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
-    """
-    Input: adata object with mark_duplicates already run.
-    Output: A complexity analysis of the library
     """
+    import numpy as np
+    import pandas as pd
+    from scipy.optimize import curve_fit
+
+    def lander_waterman(x, C0):
+        return C0 * (1 - np.exp(-x / C0))
+
+    def count_unique_reads(reads, depth):
+        subsample = np.random.choice(reads, depth, replace=False)
+        return len(np.unique(subsample))
+
     categories = adata.obs[obs_column].cat.categories
     sample_names = adata.obs[sample_col].cat.categories
 
@@ -40,6 +51,7 @@ def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names
         y_data = lander_waterman(x_data, *popt)
         adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
         if plot:
+            import matplotlib.pyplot as plt
             # Plot the complexity curve
             plt.figure(figsize=(6, 4))
             plt.plot(total_reads, unique_reads, 'o', label='Observed unique reads')
@@ -52,7 +64,7 @@ def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names
             plt.grid(True)
             if save_plot:
                 date_string = date_string()
-                save_name = output_directory + f'/{date_string}
+                save_name = output_directory + f'/{date_string}_{title}'
                 plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
                 plt.close()
             else:
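The 0.1.1 rewrite makes the model explicit: unique-read counts at subsampled depths x are fit to the Lander-Waterman saturation curve C0 * (1 - exp(-x / C0)), and the fitted C0 is stored in adata.uns as the library complexity estimate. A self-contained sketch of that fit on synthetic read identifiers (everything besides lander_waterman is illustrative):

import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, C0):
    return C0 * (1 - np.exp(-x / C0))

rng = np.random.default_rng(0)
reads = rng.integers(0, 500, size=5000)          # 5000 reads drawn from ~500 unique molecules
depths = np.linspace(100, 5000, 10, dtype=int)   # subsampling depths
unique = [len(np.unique(rng.choice(reads, d, replace=False))) for d in depths]

# Fit the saturation curve; C0 estimates the number of unique molecules.
popt, _ = curve_fit(lander_waterman, depths, unique, p0=[max(unique)])
print(f'Estimated library complexity C0 ~ {popt[0]:.0f} unique molecules')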
smftools/preprocessing/calculate_converted_read_methylation_stats.py

@@ -1,16 +1,23 @@
 ## calculate_converted_read_methylation_stats
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 
 def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
     """
-
-
+    Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
+
+    Parameters:
+        adata (AnnData): An AnnData object
+        obs_column (str): observation category of interest
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+
     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
     categories = adata.obs[obs_column].cat.categories
     for site_type in site_types:
smftools/preprocessing/calculate_coverage.py

@@ -1,15 +1,21 @@
 ## calculate_coverage
-from .. import readwrite
-import numpy as np
-import anndata as ad
-import pandas as pd
-
 
 def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
     """
-
-
+    Append position level metadata regarding whether the position is informative within the given observation category.
+
+    Parameters:
+        adata (AnnData): An AnnData object
+        obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
+        position_nan_threshold (float): A minimal threshold of coverage to call the position as valid.
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+
     categories = adata.obs[obs_column].cat.categories
     n_categories_with_position = np.zeros(adata.shape[1])
     # Loop over categories
smftools/preprocessing/calculate_pairwise_hamming_distances.py

@@ -1,15 +1,20 @@
 ## calculate_pairwise_hamming_distances
-import numpy as np
-import tqdm
-from scipy.spatial.distance import hamming
 
 ## Conversion SMF Specific
 def calculate_pairwise_hamming_distances(arrays):
     """
-    Calculate the pairwise Hamming distances for a list of ndarrays.
-
-
+    Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
+
+    Parameters:
+        arrays (str): A list of ndarrays.
+
+    Returns:
+        distance_matrix (ndarray): a 2D array containing the pairwise Hamming distances between all arrays.
+
     """
+    import numpy as np
+    import tqdm
+    from scipy.spatial.distance import hamming
     num_arrays = len(arrays)
     # Initialize an empty distance matrix
    distance_matrix = np.zeros((num_arrays, num_arrays))
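On toy arrays, the computation this helper performs reduces to the following; scipy's hamming returns the fraction of positions at which two vectors disagree:

import numpy as np
from scipy.spatial.distance import hamming

arrays = [np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]), np.array([1, 1, 1, 0])]
n = len(arrays)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = hamming(arrays[i], arrays[j])          # fraction of disagreeing positions
        distance_matrix[i, j] = distance_matrix[j, i] = d
print(distance_matrix)  # symmetric, zero diagonal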
smftools/preprocessing/calculate_position_Youden.py

@@ -1,20 +1,28 @@
 ## calculate_position_Youden
-import numpy as np
-import pandas as pd
-import anndata as ad
-import matplotlib.pyplot as plt
-from sklearn.metrics import roc_curve, roc_auc_score
-
-
 
 ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
 def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
     """
-
-
-
-
+    Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
+
+    Parameters:
+        adata (AnnData): An AnnData object.
+        positive_control_sample (str): string representing the sample name corresponding to the Plus MTase control sample.
+        negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
+        J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
+        obs_column (str): The category to iterate over.
+        save (bool): Whether to save the ROC plots.
+        output_directory (str): String representing the path to the output directory to output the ROC curves.
+
+    Returns:
+        None
     """
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import matplotlib.pyplot as plt
+    from sklearn.metrics import roc_curve, roc_auc_score
+
     control_samples = [positive_control_sample, negative_control_sample]
     categories = adata.obs[obs_column].cat.categories
     # Iterate over each category in the specified obs_column
@@ -89,7 +97,8 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
             plt.savefig(save_name)
             plt.close()
         else:
-            plt.show()
+            plt.show()
+
     adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
     J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
     adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
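For reference, Youden's J statistic is J = sensitivity + specificity - 1 = TPR - FPR; the ROC point that maximizes J gives the per-position binarization threshold, and positions whose J_max clears J_threshold are marked as passing QC. A small sketch using the same sklearn import, on toy control labels and methylation probabilities:

import numpy as np
from sklearn.metrics import roc_curve

labels = np.array([1, 1, 1, 0, 0, 1, 0, 0])            # +MTase (1) vs -MTase (0) control reads
scores = np.array([.9, .8, .7, .6, .55, .5, .3, .1])   # methylation probabilities at one position

fpr, tpr, thresholds = roc_curve(labels, scores)
J = tpr - fpr                                          # Youden's J at each candidate threshold
best = np.argmax(J)
print(f'J_max = {J[best]:.2f} at threshold {thresholds[best]:.2f}')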
smftools/preprocessing/calculate_read_length_stats.py

@@ -1,15 +1,20 @@
 ## calculate_read_length_stats
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 # Read length QC
 def calculate_read_length_stats(adata):
     """
-
-
-
+    Append first valid position in a read and last valid position in the read. From this determine and append the read length.
+
+    Parameters:
+        adata (AnnData): An adata object
+
+    Returns:
+        upper_bound (int): last valid position in the dataset
+        lower_bound (int): first valid position in the dataset
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
     ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
 
     # Add some basic observation-level (read-level) metadata to the anndata object
smftools/preprocessing/clean_NaN.py

@@ -1,14 +1,21 @@
 ## clean_NaN
-
-import anndata as ad
-import pandas as pd
+from ..readwrite import adata_to_df
 
 # NaN handling
 def clean_NaN(adata, layer=None):
     """
-
-
+    Append layers to adata that contain NaN cleaning strategies.
+
+    Parameters:
+        adata (AnnData): an adata object
+        layer (str): string representing the layer to fill NaN values in
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
     # Fill NaN with closest SMF value
     df = adata_to_df(adata, layer=layer)
     df = df.ffill(axis=1).bfill(axis=1)
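The fill strategy in context above is plain pandas; on a toy read it behaves like this:

import numpy as np
import pandas as pd

df = pd.DataFrame([[np.nan, 1.0, np.nan, 0.0, np.nan]])
print(df.ffill(axis=1).bfill(axis=1).to_numpy())
# [[1. 1. 1. 0. 0.]] -- each NaN takes the nearest preceding call; leading NaNs take the following one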
smftools/preprocessing/filter_converted_reads_on_methylation.py

@@ -1,15 +1,22 @@
 ## filter_converted_reads_on_methylation
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025):
     """
-
-
+    Filter adata object using minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read.
+
+    Parameters:
+        adata (AnnData): An adata object.
+        valid_SMF_site_threshold (float): A minimum proportion of valid SMF sites that must be present in the read. Default is 0.8
+        min_SMF_threshold (float): A minimum read methylation level. Default is 0.025
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+
     if valid_SMF_site_threshold:
         # Keep reads that have over a given valid GpC site content
         adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
smftools/preprocessing/filter_reads_on_length.py

@@ -1,14 +1,22 @@
 ## filter_reads_on_length
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700):
     """
+    Filters the adata object to keep a defined coordinate window, as well as reads that are over a minimum threshold in length.
+
+    Parameters:
+        adata (AnnData): An adata object.
+        filter_on_coordinates (bool | list): If False, skips filtering. Otherwise, provide a list containing integers representing the lower and upper bound coordinates to filter on. Default is False.
+        min_read_length (int): The minimum read length to keep in the filtered dataset. Default is 2700.
+
+    Returns:
+        None
     Input: Adata object. a list of lower and upper bound (set to False or None if not wanted), and a minimum read length integer.
-
+
     """
-
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
     if filter_on_coordinates:
         lower_bound, upper_bound = filter_on_coordinates
         # Extract the position information from the adata object as an np array
smftools/preprocessing/invert_adata.py

@@ -1,14 +1,18 @@
 ## invert_adata
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 # Optional inversion of the adata
 def invert_adata(adata):
     """
-
-
+    Inverts the adata object along the variable axis
+
+    Parameters:
+        adata (AnnData): An adata object.
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
     # Reassign var_names with new names
     old_var_names = adata.var_names.astype(int).to_numpy()
     new_var_names = np.sort(old_var_names)[::-1].astype(str)
smftools/preprocessing/mark_duplicates.py

@@ -1,19 +1,28 @@
 ## mark_duplicates
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-from scipy.signal import find_peaks
-import networkx as nx
-from .binary_layers_to_ohe import binary_layers_to_ohe
-from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
-from .min_non_diagonal import min_non_diagonal
-
 
 def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
     """
-
-
+    Marks duplicates in the adata object.
+
+    Parameters:
+        adata (AnnData): An adata object.
+        layers (list): A list of strings representing the layers to use.
+        obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
+        sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
+
+    Returns:
+        None
     """
+
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from scipy.signal import find_peaks
+    import networkx as nx
+    from .binary_layers_to_ohe import binary_layers_to_ohe
+    from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
+    from .min_non_diagonal import min_non_diagonal
+
     categories = adata.obs[obs_column].cat.categories
     sample_names = adata.obs[sample_col].cat.categories
 
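The relocated imports outline the approach: reads are one-hot encoded (binary_layers_to_ohe), pairwise Hamming distances are computed, a duplicate-calling cutoff is derived (find_peaks and min_non_diagonal appear here), and reads within the cutoff are grouped. One plausible grouping step, sketched with networkx on an assumed precomputed distance matrix and cutoff (both hypothetical inputs):

import numpy as np
import networkx as nx

# Hypothetical inputs: pairwise Hamming distances and a duplicate-calling cutoff.
distance_matrix = np.array([[0.00, 0.02, 0.40],
                            [0.02, 0.00, 0.50],
                            [0.40, 0.50, 0.00]])
cutoff = 0.05

G = nx.Graph()
G.add_nodes_from(range(len(distance_matrix)))
i_idx, j_idx = np.where(np.triu(distance_matrix < cutoff, k=1))
G.add_edges_from(zip(i_idx, j_idx))  # connect read pairs closer than the cutoff

# Each connected component is one putative molecule; keep one read per component.
print([sorted(c) for c in nx.connected_components(G)])  # [[0, 1], [2]]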
smftools/preprocessing/min_non_diagonal.py

@@ -1,12 +1,17 @@
 ## min_non_diagonal
-import numpy as np
 
 def min_non_diagonal(matrix):
     """
-    Takes a matrix and returns the smallest value from each row with the diagonal masked
-
-
+    Takes a matrix and returns the smallest value from each row with the diagonal masked.
+
+    Parameters:
+        matrix (ndarray): A 2D ndarray.
+
+    Returns:
+        min_values (list): A list of minimum values from each row of the matrix
     """
+    import numpy as np
+
     n = matrix.shape[0]
     min_values = []
     for i in range(n):
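A vectorized sketch equivalent to the helper above (an illustration, not the package's per-row loop):

import numpy as np

def min_non_diagonal_vectorized(matrix):
    """Smallest value in each row, ignoring the diagonal (toy equivalent)."""
    masked = matrix.astype(float).copy()
    np.fill_diagonal(masked, np.inf)  # mask self-distances
    return masked.min(axis=1)

m = np.array([[0, 3, 1], [2, 0, 5], [4, 6, 0]])
print(min_non_diagonal_vectorized(m))  # [1. 2. 4.]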
smftools/preprocessing/remove_duplicates.py

@@ -1,11 +1,17 @@
 # remove_duplicates
-import anndata as ad
 
 def remove_duplicates(adata):
     """
-
-
+    Remove duplicates from the adata object
+
+    Parameters:
+        adata (Anndata): An adata object.
+
+    Returns:
+        None
     """
+    import anndata as ad
+
     initial_size = adata.shape[0]
     adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
     final_size = adata.shape[0]