smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/readwrite.py (removed; 106 lines)

@@ -1,106 +0,0 @@
-## readwrite ##
-
-######################################################################################################
-## Datetime functionality
-def date_string():
-    """
-    Each time this is called, it returns the current date string
-    """
-    from datetime import datetime
-    current_date = datetime.now()
-    date_string = current_date.strftime("%Y%m%d")
-    date_string = date_string[2:]
-    return date_string
-
-def time_string():
-    """
-    Each time this is called, it returns the current time string
-    """
-    from datetime import datetime
-    current_time = datetime.now()
-    return current_time.strftime("%H:%M:%S")
-######################################################################################################
-
-######################################################################################################
-## Numpy, Pandas, Anndata functionality
-def adata_to_df(adata, layer=None):
-    """
-    Input: An adata object with a specified layer.
-    Output: A dataframe for the specific layer.
-    """
-    import pandas as pd
-    import anndata as ad
-
-    # Extract the data matrix from the given layer
-    if layer:
-        data_matrix = adata.layers[layer]
-    else:
-        data_matrix = adata.X
-    # Extract observation (read) annotations
-    obs_df = adata.obs
-    # Extract variable (position) annotations
-    var_df = adata.var
-    # Convert data matrix and annotations to pandas DataFrames
-    df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-    return df
-
-def save_matrix(matrix, save_name):
-    """
-    Input: A numpy matrix and a save_name
-    Output: A txt file representation of the data matrix
-    """
-    import numpy as np
-    np.savetxt(f'{save_name}.txt', matrix)
-
-def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-    """
-    Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-    Input: an output file path relative to the directory in which the function is called
-    """
-    import os
-    import anndata as ad
-    # Runtime warnings
-    import warnings
-    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-    # List all files in the directory
-    files = os.listdir(os.getcwd())
-    # get current working directory
-    cwd = os.getcwd()
-    suffix = file_suffix
-    # Filter file names that contain the search string in their filename and keep them in a list
-    hdfs = [hdf for hdf in files if suffix in hdf]
-    # Sort file list by names and print the list of file names
-    hdfs.sort()
-    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-    # Iterate over all of the hdf5 files and concatenate them.
-    final_adata = None
-    for hdf in hdfs:
-        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-        temp_adata = ad.read_h5ad(hdf)
-        if final_adata:
-            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-        else:
-            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = temp_adata
-    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-    final_adata.write_h5ad(output_file, compression='gzip')
-
-    # Delete the individual h5ad files and only keep the final concatenated file
-    if delete_inputs:
-        files = os.listdir(os.getcwd())
-        hdfs = [hdf for hdf in files if suffix in hdf]
-        if output_file in hdfs:
-            hdfs.remove(output_file)
-        # Iterate over the files and delete them
-        for hdf in hdfs:
-            try:
-                os.remove(hdf)
-                print(f"Deleted file: {hdf}")
-            except OSError as e:
-                print(f"Error deleting file {hdf}: {e}")
-    else:
-        print('Keeping input files')
-######################################################################################################
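The helpers removed above were thin wrappers around anndata, pandas, and numpy. As a rough usage sketch only, this is how they could be called against the 0.2.1 layout; the file names, the layer argument, and the import path are illustrative assumptions, not taken from the package documentation:

```python
# Sketch against smftools 0.2.1, where these helpers lived in
# smftools/informatics/readwrite.py. All paths below are placeholders.
from smftools.informatics.readwrite import adata_to_df, save_matrix, concatenate_h5ads
import anndata as ad

adata = ad.read_h5ad("sample1.h5ad.gz")

# Reads-by-positions DataFrame from one layer (falls back to adata.X when layer=None).
df = adata_to_df(adata, layer=None)

# Plain-text dump of the matrix, written to "matrix_out.txt".
save_matrix(df.to_numpy(), "matrix_out")

# Merge every file matching the suffix in the current working directory into one
# gzip-compressed h5ad, then delete the per-sample inputs (the default behavior).
concatenate_h5ads("combined.h5ad.gz", file_suffix="h5ad.gz", delete_inputs=True)
```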
smftools/informatics/subsample_fasta_from_bed.py (removed; 47 lines)

@@ -1,47 +0,0 @@
-# subsample_fasta_from_bed
-
-def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
-    """
-    Take a genome-wide FASTA file and a bed file containing coordinate windows of interest. Outputs a subsampled FASTA.
-
-    Parameters:
-        input_FASTA (str): String representing the path to the input FASTA file.
-        input_bed (str): String representing the path to the input BED file.
-        output_directory (str): String representing the path to the output directory for the new FASTA file.
-        output_FASTA (str): Name of the output FASTA.
-
-    Returns:
-        None
-    """
-    from pyfaidx import Fasta
-    import os
-
-    # Load the FASTA file using pyfaidx
-    fasta = Fasta(input_FASTA)
-
-    output_FASTA_path = os.path.join(output_directory, output_FASTA)
-
-    # Open the BED file
-    with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
-        for line in bed:
-            # Each line in BED file contains: chrom, start, end (and possibly more columns)
-            fields = line.strip().split()
-            n_fields = len(fields)
-            chrom = fields[0]
-            start = int(fields[1])  # BED is 0-based
-            end = int(fields[2])  # BED is 0-based and end is exclusive
-            if n_fields > 3:
-                description = " ".join(fields[3:])
-
-            # Check if the chromosome exists in the FASTA file
-            if chrom in fasta:
-                # pyfaidx is 1-based, so convert coordinates accordingly
-                sequence = fasta[chrom][start:end].seq
-                # Write the sequence to the output FASTA file
-                if n_fields > 3:
-                    out_fasta.write(f">{chrom}:{start}-{end} {description}\n")
-                else:
-                    out_fasta.write(f">{chrom}:{start}-{end}\n")
-                out_fasta.write(f"{sequence}\n")
-            else:
-                print(f"Warning: {chrom} not found in the FASTA file")