PyPI - smftools - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)

smftools/_settings.py +3 -2
smftools/_version.py +1 -1
smftools/datasets/F1_sample_sheet.csv +5 -0
smftools/datasets/datasets.py +8 -7
smftools/informatics/__init__.py +7 -5
smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
smftools/informatics/archived/basecalls_to_adata.py +71 -0
smftools/informatics/conversion_smf.py +79 -0
smftools/informatics/direct_smf.py +89 -0
smftools/informatics/fast5_to_pod5.py +8 -6
smftools/informatics/helpers/__init__.py +18 -0
smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
smftools/informatics/helpers/bed_to_bigwig.py +39 -0
smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
smftools/informatics/helpers/canoncall.py +2 -0
smftools/informatics/helpers/complement_base_list.py +21 -0
smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
smftools/informatics/helpers/count_aligned_reads.py +13 -9
smftools/informatics/helpers/extract_base_identities.py +34 -20
smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
smftools/informatics/helpers/find_conversion_sites.py +11 -9
smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
smftools/informatics/helpers/index_fasta.py +12 -0
smftools/informatics/helpers/modcall.py +3 -1
smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
smftools/informatics/helpers/ohe_batching.py +52 -0
smftools/informatics/helpers/one_hot_encode.py +10 -8
smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
smftools/informatics/helpers/split_and_index_BAM.py +16 -4
smftools/informatics/load_adata.py +127 -0
smftools/informatics/subsample_fasta_from_bed.py +47 -0
smftools/informatics/subsample_pod5.py +69 -13
smftools/preprocessing/__init__.py +6 -1
smftools/preprocessing/append_C_context.py +37 -14
smftools/preprocessing/calculate_complexity.py +2 -2
smftools/preprocessing/calculate_consensus.py +47 -0
smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
smftools/preprocessing/calculate_coverage.py +2 -2
smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
smftools/preprocessing/calculate_read_length_stats.py +56 -2
smftools/preprocessing/clean_NaN.py +2 -2
smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
smftools/preprocessing/filter_reads_on_length.py +4 -2
smftools/preprocessing/invert_adata.py +1 -0
smftools/preprocessing/load_sample_sheet.py +24 -0
smftools/preprocessing/make_dirs.py +21 -0
smftools/preprocessing/mark_duplicates.py +34 -19
smftools/preprocessing/recipes.py +125 -0
smftools/preprocessing/remove_duplicates.py +7 -4
smftools/tools/apply_HMM.py +1 -0
smftools/tools/cluster.py +0 -0
smftools/tools/read_HMM.py +1 -0
smftools/tools/subset_adata.py +32 -0
smftools/tools/train_HMM.py +43 -0
{smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
smftools-0.1.3.dist-info/RECORD +84 -0
smftools/informatics/basecalls_to_adata.py +0 -42
smftools/informatics/pod5_conversion.py +0 -53
smftools/informatics/pod5_direct.py +0 -55
smftools/informatics/pod5_to_adata.py +0 -40
smftools-0.1.1.dist-info/RECORD +0 -64
{smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
{smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/helpers/generate_converted_FASTA.py CHANGED Viewed

@@ -57,23 +57,42 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
     from Bio import SeqIO
     from Bio.SeqRecord import SeqRecord
     from Bio.Seq import Seq
+    import gzip
     modified_records = []
     unconverted = modification_types[0]
     # Iterate over each record in the input FASTA
-    for record in SeqIO.parse(input_fasta, 'fasta'):
-        record_description = record.description
-        # Iterate over each modification type of interest
-        for modification_type in modification_types:
-            # Iterate over the strands of interest
-            for i, strand in enumerate(strands):
-                if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
-                    pass
-                else:
-                    # Add the modified record to the list of modified records
-                    print(f'converting {modification_type} on the {strand} strand of record {record}')
-                    new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
-                    new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
-                    modified_records.append(new_record)
+    if '.gz' in input_fasta:
+        with gzip.open(input_fasta, 'rt') as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                record_description = record.description
+                # Iterate over each modification type of interest
+                for modification_type in modification_types:
+                    # Iterate over the strands of interest
+                    for i, strand in enumerate(strands):
+                        if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
+                            pass
+                        else:
+                            # Add the modified record to the list of modified records
+                            print(f'converting {modification_type} on the {strand} strand of record {record}')
+                            new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
+                            new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
+                            modified_records.append(new_record)
+    else:
+        for record in SeqIO.parse(input_fasta, 'fasta'):
+            record_description = record.description
+            # Iterate over each modification type of interest
+            for modification_type in modification_types:
+                # Iterate over the strands of interest
+                for i, strand in enumerate(strands):
+                    if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
+                        pass
+                    else:
+                        # Add the modified record to the list of modified records
+                        print(f'converting {modification_type} on the {strand} strand of record {record}')
+                        new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
+                        new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
+                        modified_records.append(new_record)
     with open(output_fasta, 'w') as output_handle:
         # write out the concatenated FASTA file of modified sequences
         SeqIO.write(modified_records, output_handle, 'fasta')

smftools/informatics/helpers/get_chromosome_lengths.py ADDED Viewed

@@ -0,0 +1,32 @@
+# get_chromosome_lengths
+def get_chromosome_lengths(fasta):
+    """
+    Generates a file containing chromosome lengths within an input FASTA.
+    Parameters:
+        fasta (str): Path to the input fasta
+    """
+    import os
+    import subprocess
+    from .index_fasta import index_fasta
+    # Make a fasta index file if one isn't already available
+    index_path = f'{fasta}.fai'
+    if os.path.exists(index_path):
+        print(f'Using existing fasta index file: {index_path}')
+    else:
+        index_fasta(fasta)
+    parent_dir = os.path.dirname(fasta)
+    fasta_basename = os.path.basename(fasta)
+    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
+    chrom_path = os.path.join(parent_dir, chrom_basename)
+    # Make a chromosome length file
+    if os.path.exists(chrom_path):
+        print(f'Using existing chrom length index file: {chrom_path}')
+    else:
+        with open(chrom_path, 'w') as outfile:
+            command = ["cut", "-f1,2", index_path]
+            subprocess.run(command, stdout=outfile)

smftools/informatics/helpers/index_fasta.py ADDED Viewed

@@ -0,0 +1,12 @@
+# index_fasta
+def index_fasta(fasta):
+    """
+    Generate a FASTA index file for an input fasta.
+    Parameters:
+        fasta (str): Path to the input fasta to make an index file for.
+    """
+    import subprocess
+    subprocess.run(["samtools", "faidx", fasta])

smftools/informatics/helpers/modcall.py CHANGED Viewed

@@ -21,6 +21,8 @@ def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
     output = bam + bam_suffix
     command = [
     "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
-    "--modified-bases", ",".join(mod_list)]  # Join MOD_LIST elements with commas
+    "--modified-bases"]
+    command += mod_list
+    print(f'Running: {" ".join(command)}')
     with open(output, "w") as outfile:
         subprocess.run(command, stdout=outfile)

smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl