smftools 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     import glob
     import zipfile
 
-    os.chdir(mod_tsv_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     for input_file in bam_files:
         print(input_file)
         # Extract the file basename
-        file_name = os.path.basename(input_file)
+        file_name = input_file.name
         if skip_unclassified and "unclassified" in file_name:
             print("Skipping modkit extract on unclassified reads")
         else:
             # Construct the output TSV file path
-            output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
-            output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
-            if os.path.exists(f"{output_tsv}.gz"):
-                print(f"{output_tsv}.gz already exists, skipping modkit extract")
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
             else:
                 print(f"Extracting modification data from {input_file}")
                 if modkit_summary:
                     # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                 else:
                     pass
                 # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
                         "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 else:
                     extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"m:{m5C_threshold}",
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 subprocess.run(extract_command)
                 # Zip the output TSV
                 print(f'zipping {output_tsv}')
                 if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                 else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                    subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
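For reference, the pathlib-based output-path handling this hunk moves toward can be exercised on its own. A minimal sketch, using hypothetical directory and file names rather than anything from the package:

from pathlib import Path

mod_tsv_dir = Path("mod_tsvs")                        # hypothetical output directory for extract TSVs
input_file = Path("split_bams/sample_barcode01.bam")  # hypothetical demultiplexed BAM

# One way to derive "<bam stem>_extract.tsv" and its gzipped counterpart with pathlib
output_tsv = mod_tsv_dir / f"{input_file.stem}_extract.tsv"
output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")
print(output_tsv, "already compressed:", output_tsv_gz.exists())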
smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
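The gzip-aware open dispatch in the last context line above is a small pattern worth seeing in isolation. A sketch assuming a hypothetical input file name (not part of the package):

import gzip

def open_text(path: str):
    # gzip.open in text mode for .gz inputs, plain open otherwise
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "r")

# Hypothetical usage: count FASTA records in a possibly gzipped file
with open_text("example.fa.gz") as handle:
    n_records = sum(1 for line in handle if line.startswith(">"))
print(n_records)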
smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path = f'{fasta}.fai'
-    if os.path.exists(index_path):
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir = os.path.dirname(fasta)
-    fasta_basename = os.path.basename(fasta)
-    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
-    chrom_path = os.path.join(parent_dir, chrom_basename)
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if os.path.exists(chrom_path):
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
smftools/informatics/archived/helpers/archived/index_fasta.py
@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta)) # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
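An illustrative call of the helper defined above, assuming index_fasta is in scope and a reference FASTA named reference.fa exists (both are assumptions for the example, not package defaults):

from pathlib import Path

# Writes reference.fa.fai via pysam.faidx plus a reference.chrom.sizes file, and returns the latter
chrom_sizes = index_fasta(Path("reference.fa"), write_chrom_sizes=True)
print(chrom_sizes.read_text())  # tab-separated "<chrom>\t<length>" lines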
smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
smftools/informatics/{helpers → archived/helpers/archived}/modQC.py
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py
@@ -1,24 +1,5 @@
 # plot_bed_histograms
 
-def plot_bed_histograms(bed_file, plotting_directory, fasta):
-    """
-    Plots read length, coverage, mapq, read quality stats for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive metrics from.
-        plot_directory (str): Path to the directory to write out historgrams.
-        fasta (str): Path to FASTA corresponding to bed
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    # plot_bed_histograms.py
-
 def plot_bed_histograms(
     bed_file,
     plotting_directory,
smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py
@@ -15,13 +15,14 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base = os.path.basename(input_bam)
-    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
@@ -32,8 +33,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
                 #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
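The per-read barcode lookup that drives the split (read.get_tag plus the KeyError fallback) can be tried in isolation. A short sketch with a hypothetical BAM path:

import pysam

# Hypothetical input BAM; reads without a "BC" tag raise KeyError and are skipped
with pysam.AlignmentFile("demuxed.bam", "rb") as bam:
    for read in bam:
        try:
            bc_tag = read.get_tag("BC")
        except KeyError:
            continue
        print(read.query_name, bc_tag)
        break  # one example read is enough here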
smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py
@@ -12,21 +12,21 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
        None
        Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import subprocess
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix = readwrite.date_string()
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
+        pysam.index(input_file)
 
     return bam_files
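The samtools-to-pysam switch in this hunk comes down to calling pysam's bundled index wrapper directly instead of shelling out. A minimal sketch with a hypothetical path:

import pysam

bam_path = "split_bams/sample_barcode01.bam"  # hypothetical coordinate-sorted split BAM
pysam.index(bam_path)                         # writes sample_barcode01.bam.bai next to the BAM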
smftools/informatics/archived/subsample_fasta_from_bed.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+from pyfaidx import Fasta
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA)) # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1]) # BED is 0-based
+            end = int(fields[2]) # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
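An illustrative invocation of the new helper, assuming the function shown above is in scope and using hypothetical file names:

subsample_fasta_from_bed(
    input_FASTA="genome.fa",
    input_bed="regions_of_interest.bed",
    output_directory="subsampled_fastas",
    output_FASTA="regions_of_interest.fa",
)
# Writes subsampled_fastas/regions_of_interest.fa with one record per BED interval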