smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from ..config import LoadExperimentConfig
|
|
2
|
+
from ..readwrite import make_dirs
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pod5 as p5
|
|
9
|
+
|
|
10
|
+
from typing import Union, List
|
|
11
|
+
|
|
12
|
+
def basecall_pod5s(config_path):
    """
    Basecall from pod5s given a config file.

    Parameters:
        config_path (str): File path to the basecall configuration file.

    Returns:
        None

    Raises:
        FileNotFoundError: If the configured input_data_path does not exist.
    """
    # Default params
    bam_suffix = '.bam'  # If different, change from here.

    # Load experiment config parameters into a dict.
    experiment_config = LoadExperimentConfig(config_path)
    var_dict = experiment_config.var_dict

    # These variables fall back to default_value if they are empty in the
    # experiment config or fully omitted from it.
    default_value = None

    # General config variable init
    input_data_path = Path(var_dict.get('input_data_path', default_value))  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
    output_directory = Path(var_dict.get('output_directory', default_value))  # Path to the output directory to make for the analysis. Necessary.
    model = var_dict.get('model', default_value)  # needed for dorado basecaller
    model_dir = Path(var_dict.get('model_dir', default_value))  # model directory
    barcode_kit = var_dict.get('barcode_kit', default_value)  # needed for dorado basecaller
    barcode_both_ends = var_dict.get('barcode_both_ends', default_value)  # dorado demultiplexing
    trim = var_dict.get('trim', default_value)  # dorado adapter and barcode removal
    device = var_dict.get('device', 'auto')

    # Modified basecalling specific variable init
    filter_threshold = var_dict.get('filter_threshold', default_value)
    m6A_threshold = var_dict.get('m6A_threshold', default_value)
    m5C_threshold = var_dict.get('m5C_threshold', default_value)
    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
    thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
    mod_list = var_dict.get('mod_list', default_value)

    # Make initial output directory
    make_dirs([output_directory])

    # Get the input filetype
    if input_data_path.is_file():
        input_data_filetype = input_data_path.suffixes[0]
        input_is_pod5 = input_data_filetype in ['.pod5', '.p5']
        input_is_fast5 = input_data_filetype in ['.fast5', '.f5']
    elif input_data_path.is_dir():
        # BUGFIX: iterdir() yields Path objects, so the old substring test
        # ('.pod5' in file) raised TypeError; it is also a one-shot generator,
        # so the second scan always saw it exhausted and input_is_fast5 was
        # always falsy. Materialize once and compare file suffixes instead.
        input_files = list(input_data_path.iterdir())
        input_is_pod5 = any(f.suffix in ('.pod5', '.p5') for f in input_files)
        input_is_fast5 = any(f.suffix in ('.fast5', '.f5') for f in input_files)
    else:
        # BUGFIX: previously fell through to a NameError on input_is_fast5.
        raise FileNotFoundError(f'input_data_path does not exist: {input_data_path}')

    # If the input files are not pod5 files, and they are fast5 files,
    # convert the files to a single pod5 file before proceeding.
    if input_is_fast5 and not input_is_pod5:
        output_pod5 = output_directory / 'FAST5s_to_POD5.pod5'
        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
        fast5_to_pod5(input_data_path, output_pod5)
        # Reassign the input path to point at the new pod5 file.
        input_data_path = output_pod5

    # BUGFIX: `model` arrives from the config as a plain string, which has no
    # .name attribute. Wrapping in Path makes both str and Path configs work.
    model_basename = Path(model).name.replace('.', '_')

    if mod_list:
        mod_string = "_".join(mod_list)
        bam = output_directory / f"{model_basename}_{mod_string}_calls"
        # NOTE(review): modcall/canoncall are assumed to be in module scope — confirm import.
        modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        bam = output_directory / f"{model_basename}_canonical_basecalls"
        canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
|
+
def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 files (single file, list of files, or directory)
    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.

    Raises:
        FileNotFoundError: If a directory input contains no FAST5 files, or
            if the input path is neither a file, a directory, nor a list.
        subprocess.CalledProcessError: If the pod5 CLI exits non-zero.
    """
    destination = str(output_pod5)  # the CLI wants a plain string

    # Resolve the input into a flat list of FAST5 source paths.
    if isinstance(fast5_dir, (list, tuple)):
        # Explicit list/tuple of FAST5 files.
        sources = [str(Path(entry)) for entry in fast5_dir]
    else:
        candidate = Path(fast5_dir)
        if candidate.is_file():
            # Single FAST5 file.
            sources = [str(candidate)]
        elif candidate.is_dir():
            # Directory: gather *.fast5 deterministically (sorted).
            sources = sorted(str(f) for f in candidate.glob("*.fast5"))
            if not sources:
                raise FileNotFoundError(f"No FAST5 files found in {candidate}")
        else:
            raise FileNotFoundError(f"Input path invalid: {fast5_dir}")

    # One CLI invocation covers every input shape.
    subprocess.run(
        ["pod5", "convert", "fast5", *sources, "--output", destination],
        check=True,
    )
124
|
+
|
|
125
|
+
def subsample_pod5(pod5_path, read_name_path, output_directory):
    """
    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.

    Parameters:
        pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
        read_name_path (str | int): File path to a text file of read names (one per line). If an int is passed, that many reads are randomly subsampled instead.
        output_directory (str): A file path to the directory to output the file.

    Returns:
        None

    Raises:
        TypeError: If read_name_path is neither a str nor an int.
    """
    import random

    # Work out whether the input is a directory of pod5s or a single file.
    if os.path.isdir(pod5_path):
        pod5_path_is_dir = True
        input_pod5_base = 'input_pod5s.pod5'
        files = os.listdir(pod5_path)
        pod5_files = [os.path.join(pod5_path, file) for file in files if '.pod5' in file]
        pod5_files.sort()
        print(f'Found input pod5s: {pod5_files}')
    elif os.path.exists(pod5_path):
        pod5_path_is_dir = False
        input_pod5_base = os.path.basename(pod5_path)
    else:
        print('Error: pod5_path passed does not exist')
        return None

    if isinstance(read_name_path, str):
        # Subsample by an explicit list of read names.
        input_read_name_base = os.path.basename(read_name_path)
        output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'

        # extract read names into a list of strings
        with open(read_name_path, 'r') as file:
            read_names = [line.strip() for line in file]

        print(f'Looking for read_ids: {read_names}')
        read_records = []

        # BUGFIX: the single-file branch previously printed an undefined
        # `input_pod5` variable (a NameError silently swallowed by a bare
        # except). Iterating one unified source list fixes that, and
        # missing_ok=True lets each file contribute only the reads it has.
        sources = pod5_files if pod5_path_is_dir else [pod5_path]
        for input_pod5 in sources:
            with p5.Reader(input_pod5) as reader:
                try:
                    for read_record in reader.reads(selection=read_names, missing_ok=True):
                        read_records.append(read_record.to_read())
                        print(f'Found read in {input_pod5}: {read_record.read_id}')
                except Exception:
                    # Narrowed from a bare except: keep best-effort behavior
                    # but stop masking SystemExit/KeyboardInterrupt.
                    print('Skipping pod5, could not find reads')

    elif isinstance(read_name_path, int):
        # Randomly subsample read_name_path reads.
        output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
        all_read_records = []

        if pod5_path_is_dir:
            # Shuffle the list of input pod5 paths so the sample is not
            # biased toward the first file alphabetically.
            random.shuffle(pod5_files)
            for input_pod5 in pod5_files:
                print(f'Opening pod5 file {input_pod5}')
                # BUGFIX: previously opened p5.Reader(pod5_path) — the
                # directory — instead of the current pod5 file.
                with p5.Reader(input_pod5) as reader:
                    for read_record in reader.reads():
                        all_read_records.append(read_record.to_read())
                        # When enough reads are accumulated, stop.
                        if len(all_read_records) >= read_name_path:
                            break
                # BUGFIX: also stop opening further files once we have enough.
                if len(all_read_records) >= read_name_path:
                    break
        else:
            with p5.Reader(pod5_path) as reader:
                # get all read records from the input pod5
                for read_record in reader.reads():
                    all_read_records.append(read_record.to_read())

        if read_name_path <= len(all_read_records):
            # Randomly subsample the requested number of reads.
            read_records = random.sample(all_read_records, read_name_path)
        else:
            print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
            read_records = all_read_records

    else:
        # BUGFIX: previously fell through and crashed with a NameError on
        # read_records at the writer below.
        raise TypeError(f'read_name_path must be a str or int, got {type(read_name_path).__name__}')

    output_pod5 = os.path.join(output_directory, output_base)

    # Write the subsampled POD5
    with p5.Writer(output_pod5) as writer:
        writer.add_reads(read_records)
|
|
@@ -9,10 +9,13 @@ def run_multiqc(input_dir, output_dir):
|
|
|
9
9
|
Returns:
|
|
10
10
|
- None: The function executes MultiQC and prints the status.
|
|
11
11
|
"""
|
|
12
|
-
import
|
|
12
|
+
from ..readwrite import make_dirs
|
|
13
13
|
import subprocess
|
|
14
14
|
# Ensure the output directory exists
|
|
15
|
-
|
|
15
|
+
make_dirs(output_dir)
|
|
16
|
+
|
|
17
|
+
input_dir = str(input_dir)
|
|
18
|
+
output_dir = str(output_dir)
|
|
16
19
|
|
|
17
20
|
# Construct MultiQC command
|
|
18
21
|
command = ["multiqc", input_dir, "-o", output_dir]
|
|
@@ -166,7 +166,7 @@ def plot_spatial_autocorr_grid(
|
|
|
166
166
|
ax.set_xlabel("Lag (bp)", fontsize=7)
|
|
167
167
|
ax.tick_params(axis='both', which='major', labelsize=6)
|
|
168
168
|
ax.grid(True, alpha=0.22)
|
|
169
|
-
col_idx += 1
|
|
169
|
+
#col_idx += 1
|
|
170
170
|
continue
|
|
171
171
|
|
|
172
172
|
# mask low-support lags if counts available
|
|
@@ -417,7 +417,6 @@ def plot_spatial_autocorr_grid(
|
|
|
417
417
|
|
|
418
418
|
return saved_pages
|
|
419
419
|
|
|
420
|
-
|
|
421
420
|
def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=160, show=False):
|
|
422
421
|
"""
|
|
423
422
|
Plot NRL and SNR vs window center from the dataframe returned by rolling_autocorr_metrics.
|
|
@@ -608,4 +607,3 @@ def plot_rolling_grid(
|
|
|
608
607
|
pages_by_metric[metric] = saved_pages
|
|
609
608
|
|
|
610
609
|
return pages_by_metric
|
|
611
|
-
|