smftools 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/METADATA +44 -11
- smftools-0.1.6.dist-info/RECORD +4 -0
- smftools/__init__.py +0 -25
- smftools/_settings.py +0 -20
- smftools/_version.py +0 -1
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +0 -5
- smftools/datasets/__init__.py +0 -9
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +0 -28
- smftools/informatics/__init__.py +0 -14
- smftools/informatics/archived/bam_conversion.py +0 -59
- smftools/informatics/archived/bam_direct.py +0 -63
- smftools/informatics/archived/basecalls_to_adata.py +0 -71
- smftools/informatics/conversion_smf.py +0 -79
- smftools/informatics/direct_smf.py +0 -89
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
- smftools/informatics/helpers/__init__.py +0 -60
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -48
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -73
- smftools/informatics/helpers/archived/informatics.py +0 -260
- smftools/informatics/helpers/archived/load_adata.py +0 -516
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
- smftools/informatics/helpers/canoncall.py +0 -25
- smftools/informatics/helpers/complement_base_list.py +0 -21
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -54
- smftools/informatics/helpers/converted_BAM_to_adata.py +0 -233
- smftools/informatics/helpers/count_aligned_reads.py +0 -43
- smftools/informatics/helpers/extract_base_identities.py +0 -57
- smftools/informatics/helpers/extract_mods.py +0 -51
- smftools/informatics/helpers/extract_readnames_from_BAM.py +0 -22
- smftools/informatics/helpers/find_conversion_sites.py +0 -61
- smftools/informatics/helpers/generate_converted_FASTA.py +0 -98
- smftools/informatics/helpers/get_chromosome_lengths.py +0 -32
- smftools/informatics/helpers/get_native_references.py +0 -28
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/make_modbed.py +0 -27
- smftools/informatics/helpers/modQC.py +0 -27
- smftools/informatics/helpers/modcall.py +0 -28
- smftools/informatics/helpers/modkit_extract_to_adata.py +0 -518
- smftools/informatics/helpers/ohe_batching.py +0 -52
- smftools/informatics/helpers/one_hot_encode.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -52
- smftools/informatics/helpers/separate_bam_by_bc.py +0 -43
- smftools/informatics/helpers/split_and_index_BAM.py +0 -41
- smftools/informatics/load_adata.py +0 -127
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/informatics/subsample_pod5.py +0 -104
- smftools/plotting/__init__.py +0 -0
- smftools/preprocessing/__init__.py +0 -34
- smftools/preprocessing/append_C_context.py +0 -69
- smftools/preprocessing/archives/preprocessing.py +0 -614
- smftools/preprocessing/binarize_on_Youden.py +0 -42
- smftools/preprocessing/binary_layers_to_ohe.py +0 -30
- smftools/preprocessing/calculate_complexity.py +0 -71
- smftools/preprocessing/calculate_consensus.py +0 -47
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -96
- smftools/preprocessing/calculate_coverage.py +0 -41
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
- smftools/preprocessing/calculate_position_Youden.py +0 -104
- smftools/preprocessing/calculate_read_length_stats.py +0 -86
- smftools/preprocessing/clean_NaN.py +0 -38
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -29
- smftools/preprocessing/filter_reads_on_length.py +0 -41
- smftools/preprocessing/invert_adata.py +0 -23
- smftools/preprocessing/load_sample_sheet.py +0 -24
- smftools/preprocessing/make_dirs.py +0 -21
- smftools/preprocessing/mark_duplicates.py +0 -134
- smftools/preprocessing/min_non_diagonal.py +0 -25
- smftools/preprocessing/recipes.py +0 -125
- smftools/preprocessing/remove_duplicates.py +0 -21
- smftools/readwrite.py +0 -106
- smftools/tools/__init__.py +0 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/cluster.py +0 -0
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/subset_adata.py +0 -32
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
## get_native_references
|
|
2
|
-
|
|
3
|
-
# Direct methylation specific
|
|
4
|
-
def get_native_references(fasta_file):
    """
    Makes a dictionary keyed by record id which points to the record length and record sequence.

    Parameters:
        fasta_file (str): A string representing the path to the FASTA file for the experiment.

    Returns:
        record_dict (dict): Maps each FASTA record id to a two-item list of
            [sequence_length, sequence], where sequence is the upper-cased
            sequence string of that record. If an id occurs more than once,
            the last record with that id is the one kept.
    """
    from .. import readwrite
    from Bio import SeqIO

    record_dict = {}
    print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Upper-case the sequence string before storing it
            sequence = str(record.seq).upper()
            record_dict[record.id] = [len(sequence), sequence]
    return record_dict
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
# index_fasta
|
|
2
|
-
|
|
3
|
-
def index_fasta(fasta):
    """
    Run ``samtools faidx`` on a FASTA file to index it.

    Parameters:
        fasta (str): Path to the input fasta to make an index file for.

    Returns:
        None
    """
    import subprocess

    faidx_command = ["samtools", "faidx", fasta]
    # NOTE: the exit status of samtools is not inspected, so a failed
    # indexing run does not raise here.
    subprocess.run(faidx_command)
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
## make_dirs
|
|
2
|
-
|
|
3
|
-
# General
|
|
4
|
-
def make_dirs(directories):
    """
    Takes a list of file paths and makes new directories if the directory does not already exist.

    Missing parent directories are created as well, so nested output paths do
    not require the caller to create each level first.

    Parameters:
        directories (list): A list of directories to make

    Returns:
        None

    Raises:
        FileExistsError: If a path exists but is not a directory.
    """
    import os

    for directory in directories:
        if os.path.isdir(directory):
            print(f"Directory '{directory}' already exists.")
            continue
        # makedirs instead of mkdir: creates intermediate directories, and
        # exist_ok=True tolerates the directory appearing between the isdir
        # check above and this call.
        os.makedirs(directory, exist_ok=True)
        print(f"Directory '{directory}' created successfully.")
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
## make_modbed
|
|
2
|
-
|
|
3
|
-
# Direct SMF
|
|
4
|
-
def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
    """
    Run ``modkit pileup`` on the aligned, sorted, non-split BAM file, partitioned on the BC tag.

    Parameters:
        aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
        thresholds (list): Four values, unpacked in this order: filter threshold, m6A threshold, m5C threshold, hm5C threshold.
        mod_bed_dir (str): Directory handed to modkit as its output argument. The current working directory of the process is also changed to this path.

    Returns:
        None
    """
    import os
    import subprocess

    # The working directory is switched before anything else happens and is
    # not restored afterwards.
    os.chdir(mod_bed_dir)
    filter_cutoff, m6a_cutoff, m5c_cutoff, hm5c_cutoff = thresholds

    pileup_args = ["modkit", "pileup", aligned_sorted_output, mod_bed_dir]
    pileup_args += ["--partition-tag", "BC", "--only-tabs"]
    pileup_args += ["--filter-threshold", f"{filter_cutoff}"]
    # One --mod-thresholds flag per modification code, in the order m, a, h.
    for mod_code, cutoff in (("m", m5c_cutoff), ("a", m6a_cutoff), ("h", hm5c_cutoff)):
        pileup_args += ["--mod-thresholds", f"{mod_code}:{cutoff}"]

    subprocess.run(pileup_args)
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
## modQC
|
|
2
|
-
|
|
3
|
-
# Direct SMF
|
|
4
|
-
def modQC(aligned_sorted_output, thresholds):
    """
    Run modkit QC commands on the overall BAM file: ``modkit sample-probs`` followed by ``modkit summary``.

    It is generally good to look at these outputs on positive and negative controls.

    Parameters:
        aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligner.
        thresholds (list): Four values, unpacked in this order: filter threshold, m6A threshold, m5C threshold, hm5C threshold.

    Returns:
        None
    """
    import subprocess

    filter_cutoff, m6a_cutoff, m5c_cutoff, hm5c_cutoff = thresholds

    # First report the probability distribution, with no thresholds supplied.
    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])

    # Then summarise using the supplied thresholds.
    summary_args = ["modkit", "summary", aligned_sorted_output]
    summary_args += ["--filter-threshold", f"{filter_cutoff}"]
    for mod_code, cutoff in (("m", m5c_cutoff), ("a", m6a_cutoff), ("h", hm5c_cutoff)):
        summary_args += ["--mod-thresholds", f"{mod_code}:{cutoff}"]
    subprocess.run(summary_args)
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
## modcall
|
|
2
|
-
|
|
3
|
-
# Direct methylation specific
|
|
4
|
-
def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
    """
    Wrapper function for dorado modified base calling.

    Parameters:
        model (str): a string representing the file path to the dorado basecalling model.
        pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
        barcode_kit (str): A string representing the barcoding kit used in the experiment.
        mod_list (list): A list of modification types, appended after the --modified-bases flag.
        bam (str): File path to the BAM file to output, without the suffix.
        bam_suffix (str): The suffix appended to bam to form the output file path.

    Returns:
        None
            The standard output of the dorado basecaller is written to the file at bam + bam_suffix.
    """
    import subprocess

    bam_path = bam + bam_suffix
    dorado_args = [
        "dorado", "basecaller", model, pod5_dir,
        "--kit-name", barcode_kit,
        "-Y",
        "--modified-bases", *mod_list,
    ]
    print(f'Running: {" ".join(dorado_args)}')
    # dorado's stdout is redirected straight into the output file.
    with open(bam_path, "w") as bam_handle:
        subprocess.run(dorado_args, stdout=bam_handle)
|