smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# subsample_fasta_from_bed
|
|
2
|
+
|
|
3
|
+
def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
    """
    Take a genome-wide FASTA file and a BED file containing coordinate windows of interest. Outputs a subsampled FASTA.

    Every usable BED record produces one FASTA entry. The header is
    ">{chrom}:{start}-{end}" using the BED coordinates unchanged; when the record has
    more than three columns, the extra columns are joined with single spaces and
    appended to the header as a description.

    Blank lines, '#' comment lines and 'track' / 'browser' header lines are ignored.
    Records with fewer than three columns or non-integer coordinates are skipped with a
    warning rather than aborting the whole run. Records whose chromosome is absent from
    the FASTA are skipped with a warning.

    Parameters:
        input_FASTA (str): String representing the path to the input FASTA file.
        input_bed (str): String representing the path to the input BED file.
        output_directory (str): String representing the path to the output directory for the new FASTA file.
        output_FASTA (str): Name of the output FASTA.

    Returns:
        None
    """
    from pyfaidx import Fasta
    import os

    output_FASTA_path = os.path.join(output_directory, output_FASTA)

    # Load the FASTA file using pyfaidx
    fasta = Fasta(input_FASTA)
    try:
        with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
            for line_number, line in enumerate(bed, start=1):
                # Each BED record contains: chrom, start, end (and possibly more columns)
                fields = line.split()

                # Ignore blank lines and the header/comment lines BED files may carry.
                if not fields or fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                    continue

                if len(fields) < 3:
                    print(f"Warning: skipping malformed BED line {line_number}: {line.strip()}")
                    continue

                chrom = fields[0]
                try:
                    start = int(fields[1])  # BED is 0-based
                    end = int(fields[2])  # BED is 0-based and end is exclusive
                except ValueError:
                    print(f"Warning: skipping BED line {line_number} with non-integer coordinates: {line.strip()}")
                    continue

                # Check if the chromosome exists in the FASTA file
                if chrom not in fasta:
                    print(f"Warning: {chrom} not found in the FASTA file")
                    continue

                # pyfaidx slicing follows Python's 0-based, end-exclusive convention,
                # which matches BED, so the coordinates are used as-is.
                sequence = fasta[chrom][start:end].seq

                # Write the record to the output FASTA file
                header = f">{chrom}:{start}-{end}"
                if len(fields) > 3:
                    header += " " + " ".join(fields[3:])
                out_fasta.write(f"{header}\n")
                out_fasta.write(f"{sequence}\n")
    finally:
        # Release the FASTA/index file handles even if reading or writing fails.
        fasta.close()
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# subsample_pod5
|
|
2
|
+
|
|
3
|
+
def subsample_pod5(pod5_path, read_name_path, output_directory):
    """
    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.

    The output file is written into output_directory and is named after the input:
    "<pod5 base>_<read name file base>_subsampled.pod5" when selecting by name, or
    "<pod5 base>_<N>_randomly_subsampled.pod5" when subsampling randomly. When pod5_path
    is a directory, the pod5 base is "input_pod5s".

    Parameters:
        pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
        read_name_path (str | int): File path to a text file of read names. One read name per line (blank lines are ignored).
            If an int value is passed, a random subset of that many reads is taken instead. For a directory input the
            files are visited in shuffled order and reads are accumulated only until that many have been collected, so
            the subset is random across files rather than uniform over all reads.
        output_directory (str): A file path to the directory to output the file.

    Returns:
        None

    Raises:
        TypeError: If read_name_path is neither a str nor an int.
        ValueError: If read_name_path is a negative int.
    """
    import os
    import random

    # --- Validate the arguments before touching any pod5 data ---
    if os.path.isdir(pod5_path):
        pod5_path_is_dir = True
    elif os.path.exists(pod5_path):
        pod5_path_is_dir = False
    else:
        print('Error: pod5_path passed does not exist')
        return None

    select_by_name = isinstance(read_name_path, str)
    # bool is a subclass of int but is not a meaningful read count, so reject it.
    select_randomly = isinstance(read_name_path, int) and not isinstance(read_name_path, bool)
    if not (select_by_name or select_randomly):
        raise TypeError(
            f'read_name_path must be a str path or an int read count, got {type(read_name_path).__name__}'
        )
    if select_randomly and read_name_path < 0:
        raise ValueError('read_name_path must be non-negative when given as an int')

    # Imported after the cheap checks so invalid arguments are reported without needing pod5.
    import pod5 as p5

    if pod5_path_is_dir:
        input_pod5_base = 'input_pod5s.pod5'
        pod5_files = sorted(
            os.path.join(pod5_path, file_name)
            for file_name in os.listdir(pod5_path)
            if file_name.endswith('.pod5')
        )
        print(f'Found input pod5s: {pod5_files}')
    else:
        input_pod5_base = os.path.basename(pod5_path)
        pod5_files = [pod5_path]

    pod5_stem = input_pod5_base.split('.pod5')[0]

    if select_by_name:
        input_read_name_base = os.path.basename(read_name_path)
        output_base = pod5_stem + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'

        # extract read names into a list of strings, dropping blank lines
        with open(read_name_path, 'r') as file:
            read_names = [name for name in (line.strip() for line in file) if name]

        print(f'Looking for read_ids: {read_names}')
        read_records = []

        for input_pod5 in pod5_files:
            with p5.Reader(input_pod5) as reader:
                try:
                    # missing_ok=True: a file is not expected to hold every requested read.
                    for read_record in reader.reads(selection=read_names, missing_ok=True):
                        read_records.append(read_record.to_read())
                        print(f'Found read in {input_pod5}: {read_record.read_id}')
                except Exception as exc:
                    # Best effort: report the failing file and keep going with the rest.
                    print(f'Skipping pod5 {input_pod5}, could not find reads: {exc}')

    else:
        output_base = pod5_stem + f'_{read_name_path}_randomly_subsampled.pod5'
        all_read_records = []

        if pod5_path_is_dir:
            # Shuffle the list of input pod5 paths
            random.shuffle(pod5_files)
            for input_pod5 in pod5_files:
                # Stop opening further files once enough reads have been accumulated.
                if len(all_read_records) >= read_name_path:
                    break
                print(f'Opening pod5 file {input_pod5}')
                with p5.Reader(input_pod5) as reader:
                    for read_record in reader.reads():
                        all_read_records.append(read_record.to_read())
                        if len(all_read_records) >= read_name_path:
                            break
        else:
            # get all read records from the input pod5
            with p5.Reader(pod5_path) as reader:
                for read_record in reader.reads():
                    all_read_records.append(read_record.to_read())

        if read_name_path <= len(all_read_records):
            # if the subsampling amount does not exceed the records collected, randomly subsample the reads
            read_records = random.sample(all_read_records, read_name_path)
        else:
            print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
            read_records = all_read_records

    output_pod5 = os.path.join(output_directory, output_base)

    # Write the subsampled POD5
    with p5.Writer(output_pod5) as writer:
        writer.add_reads(read_records)
|