smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- smftools-0.1.6.dist-info/METADATA +127 -0
- smftools-0.1.6.dist-info/RECORD +4 -0
- smftools/__init__.py +0 -25
- smftools/_settings.py +0 -19
- smftools/_version.py +0 -1
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/__init__.py +0 -9
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +0 -27
- smftools/informatics/__init__.py +0 -12
- smftools/informatics/bam_conversion.py +0 -47
- smftools/informatics/bam_direct.py +0 -49
- smftools/informatics/basecalls_to_adata.py +0 -42
- smftools/informatics/fast5_to_pod5.py +0 -19
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
- smftools/informatics/helpers/__init__.py +0 -42
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
- smftools/informatics/helpers/archived/informatics.py +0 -260
- smftools/informatics/helpers/archived/load_adata.py +0 -516
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
- smftools/informatics/helpers/canoncall.py +0 -23
- smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
- smftools/informatics/helpers/count_aligned_reads.py +0 -39
- smftools/informatics/helpers/extract_base_identities.py +0 -43
- smftools/informatics/helpers/extract_mods.py +0 -51
- smftools/informatics/helpers/find_conversion_sites.py +0 -59
- smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
- smftools/informatics/helpers/get_native_references.py +0 -28
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/make_modbed.py +0 -27
- smftools/informatics/helpers/modQC.py +0 -27
- smftools/informatics/helpers/modcall.py +0 -26
- smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
- smftools/informatics/helpers/one_hot_encode.py +0 -19
- smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
- smftools/informatics/helpers/split_and_index_BAM.py +0 -29
- smftools/informatics/pod5_conversion.py +0 -53
- smftools/informatics/pod5_direct.py +0 -55
- smftools/informatics/pod5_to_adata.py +0 -40
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_pod5.py +0 -48
- smftools/plotting/__init__.py +0 -0
- smftools/preprocessing/__init__.py +0 -29
- smftools/preprocessing/append_C_context.py +0 -46
- smftools/preprocessing/archives/preprocessing.py +0 -614
- smftools/preprocessing/binarize_on_Youden.py +0 -42
- smftools/preprocessing/binary_layers_to_ohe.py +0 -30
- smftools/preprocessing/calculate_complexity.py +0 -71
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
- smftools/preprocessing/calculate_coverage.py +0 -41
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
- smftools/preprocessing/calculate_position_Youden.py +0 -104
- smftools/preprocessing/calculate_read_length_stats.py +0 -32
- smftools/preprocessing/clean_NaN.py +0 -38
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
- smftools/preprocessing/filter_reads_on_length.py +0 -39
- smftools/preprocessing/invert_adata.py +0 -22
- smftools/preprocessing/mark_duplicates.py +0 -119
- smftools/preprocessing/min_non_diagonal.py +0 -25
- smftools/preprocessing/remove_duplicates.py +0 -18
- smftools/readwrite.py +0 -106
- smftools/tools/__init__.py +0 -0
- smftools-0.1.1.dist-info/METADATA +0 -88
- smftools-0.1.1.dist-info/RECORD +0 -64
- {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
- {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/readwrite.py
@@ -1,106 +0,0 @@
-## readwrite ##
-
-######################################################################################################
-## Datetime functionality
-def date_string():
-    """
-    Each time this is called, it returns the current date string
-    """
-    from datetime import datetime
-    current_date = datetime.now()
-    date_string = current_date.strftime("%Y%m%d")
-    date_string = date_string[2:]
-    return date_string
-
-def time_string():
-    """
-    Each time this is called, it returns the current time string
-    """
-    from datetime import datetime
-    current_time = datetime.now()
-    return current_time.strftime("%H:%M:%S")
-######################################################################################################
-
-######################################################################################################
-## Numpy, Pandas, Anndata functionality
-def adata_to_df(adata, layer=None):
-    """
-    Input: An adata object with a specified layer.
-    Output: A dataframe for the specific layer.
-    """
-    import pandas as pd
-    import anndata as ad
-
-    # Extract the data matrix from the given layer
-    if layer:
-        data_matrix = adata.layers[layer]
-    else:
-        data_matrix = adata.X
-    # Extract observation (read) annotations
-    obs_df = adata.obs
-    # Extract variable (position) annotations
-    var_df = adata.var
-    # Convert data matrix and annotations to pandas DataFrames
-    df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-    return df
-
-def save_matrix(matrix, save_name):
-    """
-    Input: A numpy matrix and a save_name
-    Output: A txt file representation of the data matrix
-    """
-    import numpy as np
-    np.savetxt(f'{save_name}.txt', matrix)
-
-def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-    """
-    Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-    Input: an output file path relative to the directory in which the function is called
-    """
-    import os
-    import anndata as ad
-    # Runtime warnings
-    import warnings
-    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-    # List all files in the directory
-    files = os.listdir(os.getcwd())
-    # get current working directory
-    cwd = os.getcwd()
-    suffix = file_suffix
-    # Filter file names that contain the search string in their filename and keep them in a list
-    hdfs = [hdf for hdf in files if suffix in hdf]
-    # Sort file list by names and print the list of file names
-    hdfs.sort()
-    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-    # Iterate over all of the hdf5 files and concatenate them.
-    final_adata = None
-    for hdf in hdfs:
-        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-        temp_adata = ad.read_h5ad(hdf)
-        if final_adata:
-            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-        else:
-            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = temp_adata
-    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-    final_adata.write_h5ad(output_file, compression='gzip')
-
-    # Delete the individual h5ad files and only keep the final concatenated file
-    if delete_inputs:
-        files = os.listdir(os.getcwd())
-        hdfs = [hdf for hdf in files if suffix in hdf]
-        if output_file in hdfs:
-            hdfs.remove(output_file)
-        # Iterate over the files and delete them
-        for hdf in hdfs:
-            try:
-                os.remove(hdf)
-                print(f"Deleted file: {hdf}")
-            except OSError as e:
-                print(f"Error deleting file {hdf}: {e}")
-    else:
-        print('Keeping input files')
-######################################################################################################
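For context on what was removed: these helpers were self-contained I/O utilities. Below is a minimal sketch of how they could have been driven under smftools 0.1.1; the file names are hypothetical, and the `smftools.informatics.readwrite` import path is inferred from the file list above.

# Hypothetical usage under smftools 0.1.1 (module removed by 0.1.6).
import anndata as ad
from smftools.informatics.readwrite import adata_to_df, concatenate_h5ads

# Merge every *.h5ad.gz in the current working directory into one file;
# delete_inputs=False keeps the per-sample inputs on disk.
concatenate_h5ads('merged_smf.h5ad.gz', file_suffix='h5ad.gz', delete_inputs=False)

# Pull the data back out as a reads-by-positions pandas DataFrame.
adata = ad.read_h5ad('merged_smf.h5ad.gz')
df = adata_to_df(adata, layer=None)  # layer=None falls back to adata.X

Note that `concatenate_h5ads` selects inputs by substring match on the suffix, which is why the deletion branch re-lists the directory and removes `output_file` from the candidates before deleting; the `cwd` variable is computed but never used.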
smftools/informatics/subsample_pod5.py
@@ -1,48 +0,0 @@
-# subsample_pod5
-
-def subsample_pod5(pod5_path, read_name_path, output_directory):
-    """
-    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
-    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.
-
-    Parameters:
-        pod5_path (str): File path to the POD5 to subsample.
-        read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
-        output_directory (str): A file path to the directory to output the file.
-
-    Returns:
-        None
-    """
-    import pod5 as p5
-    import os
-
-    input_pod5_base = os.path.basename(pod5_path)
-
-    if type(read_name_path) == str:
-        input_read_name_base = os.path.basename(read_name_path)
-        output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
-        # extract read names into a list of strings
-        with open(read_name_path, 'r') as file:
-            read_names = [line.strip() for line in file]
-        with p5.Reader(pod5_path) as reader:
-            read_records = []
-            for read_record in reader.reads(selection=read_names):
-                read_records.append(read_record.to_read())
-
-    elif type(read_name_path) == int:
-        import random
-        output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
-        with p5.Reader(pod5_path) as reader:
-            all_read_records = []
-            for read_record in reader.reads():
-                all_read_records.append(read_record.to_read())
-            if read_name_path <= len(all_read_records):
-                read_records = random.sample(all_read_records, read_name_path)
-            else:
-                print('Trying to sample more reads than are contained in the input pod5, please try a lower value.')
-
-    output_pod5 = os.path.join(output_directory, output_base)
-
-    # Write the subsampled POD5
-    with p5.Writer(output_pod5) as writer:
-        writer.add_reads(read_records)
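A short usage sketch for this deleted helper, with hypothetical file paths; the two call forms mirror the str and int branches above.

# Hypothetical usage of the deleted subsample_pod5 helper (smftools 0.1.1).
from smftools.informatics.subsample_pod5 import subsample_pod5

# Keep only the reads named in on_target_reads.txt (one read name per line):
subsample_pod5('run.pod5', 'on_target_reads.txt', 'subsampled/')

# Or draw 500 reads at random from the same POD5:
subsample_pod5('run.pod5', 500, 'subsampled/')

One behavior worth noting in the deleted implementation: when the requested integer exceeds the number of reads in the POD5, the function only prints a warning, so the final `p5.Writer` block then raises a `NameError` because `read_records` was never assigned. Any `read_name_path` type other than str or int likewise leaves both `output_base` and `read_records` unbound.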
smftools/plotting/__init__.py
DELETED (file without changes)
smftools/preprocessing/__init__.py
@@ -1,29 +0,0 @@
-from .append_C_context import append_C_context
-from .binarize_on_Youden import binarize_on_Youden
-from .calculate_complexity import calculate_complexity
-from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
-from .calculate_coverage import calculate_coverage
-from .calculate_position_Youden import calculate_position_Youden
-from .calculate_read_length_stats import calculate_read_length_stats
-from .clean_NaN import clean_NaN
-from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
-from .filter_reads_on_length import filter_reads_on_length
-from .invert_adata import invert_adata
-from .mark_duplicates import mark_duplicates
-from .remove_duplicates import remove_duplicates
-
-__all__ = [
-    "append_C_context",
-    "binarize_on_Youden",
-    "calculate_complexity",
-    "calculate_converted_read_methylation_stats",
-    "calculate_coverage",
-    "calculate_position_Youden",
-    "calculate_read_length_stats",
-    "clean_NaN",
-    "filter_converted_reads_on_methylation",
-    "filter_reads_on_length",
-    "invert_adata",
-    "mark_duplicates",
-    "remove_duplicates"
-]
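This `__init__.py` only flattened the per-module functions into the `smftools.preprocessing` namespace. A sketch of the call pattern it enabled, using the one signature shown in full in the next hunk; the input file is hypothetical:

# Hypothetical call pattern enabled by the re-exports above (smftools 0.1.1).
import anndata as ad
import smftools.preprocessing as pp

adata = ad.read_h5ad('smf_experiment.h5ad.gz')  # hypothetical input file
# Equivalent to smftools.preprocessing.append_C_context.append_C_context(...);
# assumes adata.uns holds '<Reference>_FASTA_sequence' entries, as the function expects.
pp.append_C_context(adata, obs_column='Reference', use_consensus=False)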
smftools/preprocessing/append_C_context.py
@@ -1,46 +0,0 @@
-## append_C_context
-
-## Conversion SMF Specific
-# Read methylation QC
-def append_C_context(adata, obs_column='Reference', use_consensus=False):
-    """
-    Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
-
-    Parameters:
-        adata (AnnData): The input adata object.
-        obs_column (str): The observation column in which to stratify on. Default is 'Reference', which should not be changed for most purposes.
-        use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-    Input: An adata object, the obs_column of interst, and whether to use the consensus sequence from the category.
-
-    Returns:
-        None
-    """
-    import numpy as np
-    import anndata as ad
-    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-    categories = adata.obs[obs_column].cat.categories
-    for cat in categories:
-        if use_consensus:
-            sequence = adata.uns[f'{cat}_consensus_sequence']
-        else:
-            sequence = adata.uns[f'{cat}_FASTA_sequence']
-        boolean_dict = {}
-        for site_type in site_types:
-            boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-        # Iterate through the sequence and apply the criteria
-        for i in range(1, len(sequence) - 1):
-            if sequence[i] == 'C':
-                if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                    boolean_dict[f'{cat}_GpC_site'][i] = True
-                elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_ambiguous_GpC_site'][i] = True
-                elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_CpG_site'][i] = True
-                elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_ambiguous_CpG_site'][i] = True
-                elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                    boolean_dict[f'{cat}_other_C'][i] = True
-        for site_type in site_types:
-            adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-            adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
-
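The neighbor rules above decide how each cytosine is binned, and are easiest to see on a toy sequence. A standalone sketch (not package code) of the same decision table:

# Standalone sketch of the cytosine-context rules in the deleted function.
def classify_c(seq, i):
    # A cytosine at i is classified by the bases flanking it.
    if seq[i] != 'C':
        return None
    prev_g = seq[i - 1] == 'G'
    next_g = seq[i + 1] == 'G'
    if prev_g and not next_g:
        return 'GpC_site'    # G-C-[not G]
    if not prev_g and next_g:
        return 'CpG_site'    # [not G]-C-G
    if prev_g and next_g:
        return 'ambiguous'   # G-C-G: both GpC and CpG context apply
    return 'other_C'         # [not G]-C-[not G]

seq = 'AGCTACGTGCGA'
print([(i, classify_c(seq, i)) for i in range(1, len(seq) - 1) if seq[i] == 'C'])
# [(2, 'GpC_site'), (5, 'CpG_site'), (9, 'ambiguous')]

Note that in the deleted implementation the fourth branch repeats the `ambiguous_GpC_site` condition (G on both sides), so it can never fire: no position is ever flagged `ambiguous_CpG_site`, and every G-C-G cytosine lands in the GpC-ambiguous bin. Terminal positions are also skipped, since the loop runs over `range(1, len(sequence) - 1)`.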