smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_settings.py +3 -2
- smftools/_version.py +1 -1
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/datasets.py +8 -7
- smftools/informatics/__init__.py +7 -5
- smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
- smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/conversion_smf.py +79 -0
- smftools/informatics/direct_smf.py +89 -0
- smftools/informatics/fast5_to_pod5.py +8 -6
- smftools/informatics/helpers/__init__.py +18 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
- smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
- smftools/informatics/helpers/canoncall.py +2 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
- smftools/informatics/helpers/count_aligned_reads.py +13 -9
- smftools/informatics/helpers/extract_base_identities.py +34 -20
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +11 -9
- smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/modcall.py +3 -1
- smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
- smftools/informatics/helpers/ohe_batching.py +52 -0
- smftools/informatics/helpers/one_hot_encode.py +10 -8
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
- smftools/informatics/helpers/split_and_index_BAM.py +16 -4
- smftools/informatics/load_adata.py +127 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +69 -13
- smftools/preprocessing/__init__.py +6 -1
- smftools/preprocessing/append_C_context.py +37 -14
- smftools/preprocessing/calculate_complexity.py +2 -2
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
- smftools/preprocessing/calculate_coverage.py +2 -2
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
- smftools/preprocessing/calculate_read_length_stats.py +56 -2
- smftools/preprocessing/clean_NaN.py +2 -2
- smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
- smftools/preprocessing/filter_reads_on_length.py +4 -2
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/preprocessing/load_sample_sheet.py +24 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/mark_duplicates.py +34 -19
- smftools/preprocessing/recipes.py +125 -0
- smftools/preprocessing/remove_duplicates.py +7 -4
- smftools/tools/apply_HMM.py +1 -0
- smftools/tools/cluster.py +0 -0
- smftools/tools/read_HMM.py +1 -0
- smftools/tools/subset_adata.py +32 -0
- smftools/tools/train_HMM.py +43 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
- smftools-0.1.3.dist-info/RECORD +84 -0
- smftools/informatics/basecalls_to_adata.py +0 -42
- smftools/informatics/pod5_conversion.py +0 -53
- smftools/informatics/pod5_direct.py +0 -55
- smftools/informatics/pod5_to_adata.py +0 -40
- smftools-0.1.1.dist-info/RECORD +0 -64
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -57,23 +57,42 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
|
|
|
57
57
|
from Bio import SeqIO
|
|
58
58
|
from Bio.SeqRecord import SeqRecord
|
|
59
59
|
from Bio.Seq import Seq
|
|
60
|
+
import gzip
|
|
60
61
|
modified_records = []
|
|
61
62
|
unconverted = modification_types[0]
|
|
62
63
|
# Iterate over each record in the input FASTA
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
64
|
+
if '.gz' in input_fasta:
|
|
65
|
+
with gzip.open(input_fasta, 'rt') as handle:
|
|
66
|
+
for record in SeqIO.parse(handle, 'fasta'):
|
|
67
|
+
record_description = record.description
|
|
68
|
+
# Iterate over each modification type of interest
|
|
69
|
+
for modification_type in modification_types:
|
|
70
|
+
# Iterate over the strands of interest
|
|
71
|
+
for i, strand in enumerate(strands):
|
|
72
|
+
if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
|
|
73
|
+
pass
|
|
74
|
+
else:
|
|
75
|
+
# Add the modified record to the list of modified records
|
|
76
|
+
print(f'converting {modification_type} on the {strand} strand of record {record}')
|
|
77
|
+
new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
|
|
78
|
+
new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
|
|
79
|
+
modified_records.append(new_record)
|
|
80
|
+
else:
|
|
81
|
+
for record in SeqIO.parse(input_fasta, 'fasta'):
|
|
82
|
+
record_description = record.description
|
|
83
|
+
# Iterate over each modification type of interest
|
|
84
|
+
for modification_type in modification_types:
|
|
85
|
+
# Iterate over the strands of interest
|
|
86
|
+
for i, strand in enumerate(strands):
|
|
87
|
+
if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
|
|
88
|
+
pass
|
|
89
|
+
else:
|
|
90
|
+
# Add the modified record to the list of modified records
|
|
91
|
+
print(f'converting {modification_type} on the {strand} strand of record {record}')
|
|
92
|
+
new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
|
|
93
|
+
new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
|
|
94
|
+
modified_records.append(new_record)
|
|
95
|
+
|
|
77
96
|
with open(output_fasta, 'w') as output_handle:
|
|
78
97
|
# write out the concatenated FASTA file of modified sequences
|
|
79
98
|
SeqIO.write(modified_records, output_handle, 'fasta')
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# get_chromosome_lengths
|
|
2
|
+
|
|
3
|
+
def get_chromosome_lengths(fasta):
    """
    Generates a file containing chromosome lengths within an input FASTA.

    The output file is written next to the FASTA and is named after the part of the
    FASTA basename that precedes its first '.fa', with '.chrom.sizes' appended.
    Each output line holds the first two tab-separated columns of the FASTA index
    ('<fasta>.fai'). If the output file already exists it is reused untouched.

    Parameters:
        fasta (str): Path to the input fasta

    Returns:
        None

    Raises:
        FileNotFoundError: If no FASTA index is available after attempting to create one.
    """
    import os

    # Make a fasta index file if one isn't already available
    index_path = f'{fasta}.fai'
    if os.path.exists(index_path):
        print(f'Using existing fasta index file: {index_path}')
    else:
        # Imported lazily: the indexing helper is only needed when the index is missing.
        from .index_fasta import index_fasta
        index_fasta(fasta)

    parent_dir = os.path.dirname(fasta)
    fasta_basename = os.path.basename(fasta)
    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
    chrom_path = os.path.join(parent_dir, chrom_basename)

    # Make a chromosome length file
    if os.path.exists(chrom_path):
        print(f'Using existing chrom length index file: {chrom_path}')
        return

    # Read the index BEFORE creating the output file. If the index is missing or
    # unreadable we fail here, instead of leaving behind an empty chrom.sizes file
    # that later calls would silently accept as "existing".
    with open(index_path) as index_file:
        rows = ['\t'.join(line.rstrip('\n').split('\t')[:2]) for line in index_file]

    # Keep only the first two index columns per line (same result as `cut -f1,2`).
    with open(chrom_path, 'w') as outfile:
        outfile.writelines(f'{row}\n' for row in rows)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# index_fasta
|
|
2
|
+
|
|
3
|
+
def index_fasta(fasta):
    """
    Index a FASTA file by invoking 'samtools faidx' on it.

    The command's exit status is not inspected.

    Parameters:
        fasta (str): Path to the input fasta to make an index file for.

    Returns:
        None
    """
    import subprocess

    # Argument list form: the path is passed as a single argv entry, no shell involved.
    faidx_command = ["samtools", "faidx", fasta]
    subprocess.run(faidx_command)
|
|
@@ -21,6 +21,8 @@ def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
|
|
|
21
21
|
output = bam + bam_suffix
|
|
22
22
|
command = [
|
|
23
23
|
"dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
|
|
24
|
-
"--modified-bases"
|
|
24
|
+
"--modified-bases"]
|
|
25
|
+
command += mod_list
|
|
26
|
+
print(f'Running: {" ".join(command)}')
|
|
25
27
|
with open(output, "w") as outfile:
|
|
26
28
|
subprocess.run(command, stdout=outfile)
|