smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py

@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, List, Any, Tuple, Union, Optional
+import re
+from itertools import zip_longest
+
+import pysam
+from tqdm import tqdm
+
+
+def concatenate_fastqs_to_bam(
+    fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
+    output_bam: Union[str, Path],
+    barcode_tag: str = "BC",
+    barcode_map: Optional[Dict[Union[str, Path], str]] = None,
+    add_read_group: bool = True,
+    rg_sample_field: Optional[str] = None,
+    progress: bool = True,
+    auto_pair: bool = True,
+) -> Dict[str, Any]:
+    """
+    Concatenate FASTQ(s) into an **unaligned** BAM. Supports single-end and paired-end.
+
+    Parameters
+    ----------
+    fastq_files : list[Path|str] or list[(Path|str, Path|str)]
+        Either explicit pairs (R1,R2) or a flat list of FASTQs (auto-paired if auto_pair=True).
+    output_bam : Path|str
+        Output BAM path (parent directory will be created).
+    barcode_tag : str
+        SAM tag used to store barcode on each read (default 'BC').
+    barcode_map : dict or None
+        Optional mapping {path: barcode} to override automatic filename-based barcode extraction.
+    add_read_group : bool
+        If True, add @RG header lines (ID = barcode) and set each read's RG tag.
+    rg_sample_field : str or None
+        If set, include SM=<value> in @RG.
+    progress : bool
+        Show tqdm progress bars.
+    auto_pair : bool
+        Auto-pair R1/R2 based on filename patterns if given a flat list.
+
+    Returns
+    -------
+    dict
+        {'total_reads','per_file','paired_pairs_written','singletons_written','barcodes'}
+    """
+
+    # ---------- helpers (Pathlib-only) ----------
+    def _strip_fastq_ext(p: Path) -> str:
+        """
+        Remove common FASTQ multi-suffixes; return stem-like name.
+        """
+        name = p.name
+        lowers = name.lower()
+        for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
+            if lowers.endswith(ext):
+                return name[: -len(ext)]
+        return p.stem  # fallback: remove last suffix only
+
+    def _extract_barcode_from_filename(p: Path) -> str:
+        stem = _strip_fastq_ext(p)
+        if "_" in stem:
+            token = stem.split("_")[-1]
+            if token:
+                return token
+        return stem
+
+    def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
+        # return (prefix, readnum) if matches; else (None, None)
+        patterns = [
+            r"(?i)(.*?)[._-]r?([12])$",  # prefix_R1 / prefix.r2 / prefix-1
+            r"(?i)(.*?)[._-]read[_-]?([12])$",  # prefix_read1
+        ]
+        for pat in patterns:
+            m = re.match(pat, stem)
+            if m:
+                return m.group(1), int(m.group(2))
+        return None, None
+
+    def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
+        pref_map: Dict[str, Dict[int, Path]] = {}
+        unpaired: List[Path] = []
+        for pth in paths:
+            stem = _strip_fastq_ext(pth)
+            pref, num = _classify_read_token(stem)
+            if pref is None:
+                unpaired.append(pth)
+            else:
+                entry = pref_map.setdefault(pref, {})
+                entry[num] = pth
+        pairs: List[Tuple[Path, Path]] = []
+        leftovers: List[Path] = []
+        for d in pref_map.values():
+            if 1 in d and 2 in d:
+                pairs.append((d[1], d[2]))
+            else:
+                leftovers.extend(d.values())
+        leftovers.extend(unpaired)
+        return pairs, leftovers
+
+    def _fastq_iter(p: Path):
+        # pysam.FastxFile handles compressed extensions transparently
+        with pysam.FastxFile(str(p)) as fx:
+            for rec in fx:
+                yield rec  # rec.name, rec.sequence, rec.quality
+
+    def _make_unaligned_segment(
+        name: str,
+        seq: str,
+        qual: Optional[str],
+        bc: str,
+        read1: bool,
+        read2: bool,
+    ) -> pysam.AlignedSegment:
+        a = pysam.AlignedSegment()
+        a.query_name = name
+        a.query_sequence = seq
+        if qual is not None:
+            a.query_qualities = pysam.qualitystring_to_array(qual)
+        a.is_unmapped = True
+        a.is_paired = read1 or read2
+        a.is_read1 = read1
+        a.is_read2 = read2
+        a.mate_is_unmapped = a.is_paired
+        a.reference_id = -1
+        a.reference_start = -1
+        a.next_reference_id = -1
+        a.next_reference_start = -1
+        a.template_length = 0
+        a.set_tag(barcode_tag, str(bc), value_type="Z")
+        if add_read_group:
+            a.set_tag("RG", str(bc), value_type="Z")
+        return a
+
+    # ---------- normalize inputs to Path ----------
+    def _to_path_pair(x) -> Tuple[Path, Path]:
+        a, b = x
+        return Path(a), Path(b)
+
+    explicit_pairs: List[Tuple[Path, Path]] = []
+    singles: List[Path] = []
+
+    if not isinstance(fastq_files, (list, tuple)):
+        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
+
+    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
+        explicit_pairs = [_to_path_pair(x) for x in fastq_files]
+    else:
+        flat_paths = [Path(x) for x in fastq_files if x is not None]
+        if auto_pair:
+            explicit_pairs, leftovers = _pair_by_filename(flat_paths)
+            singles = leftovers
+        else:
+            singles = flat_paths
+
+    output_bam = Path(output_bam)
+    output_bam.parent.mkdir(parents=True, exist_ok=True)
+
+    # ---------- barcodes ----------
+    barcode_map = {Path(k): v for k, v in (barcode_map or {}).items()}
+    per_path_barcode: Dict[Path, str] = {}
+    barcodes_in_order: List[str] = []
+
+    for r1, r2 in explicit_pairs:
+        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
+        per_path_barcode[r1] = bc
+        per_path_barcode[r2] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+    for pth in singles:
+        bc = barcode_map.get(pth) or _extract_barcode_from_filename(pth)
+        per_path_barcode[pth] = bc
+        if bc not in barcodes_in_order:
+            barcodes_in_order.append(bc)
+
+    # ---------- BAM header ----------
+    header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
+    if add_read_group:
+        header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
+    header.setdefault("PG", []).append(
+        {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
+    )
+
+    # ---------- counters ----------
+    per_file_counts: Dict[Path, int] = {}
+    total_written = 0
+    paired_pairs_written = 0
+    singletons_written = 0
+
+    # ---------- write BAM ----------
+    with pysam.AlignmentFile(str(output_bam), "wb", header=header) as bam_out:
+        # Paired
+        it_pairs = explicit_pairs
+        if progress and it_pairs:
+            it_pairs = tqdm(it_pairs, desc="Paired FASTQ→BAM")
+        for r1_path, r2_path in it_pairs:
+            if not (r1_path.exists() and r2_path.exists()):
+                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
+            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
+
+            it1 = _fastq_iter(r1_path)
+            it2 = _fastq_iter(r2_path)
+
+            for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
+                def _clean(n: Optional[str]) -> Optional[str]:
+                    if n is None:
+                        return None
+                    return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
+
+                name = (
+                    _clean(getattr(rec1, "name", None))
+                    or _clean(getattr(rec2, "name", None))
+                    or getattr(rec1, "name", None)
+                    or getattr(rec2, "name", None)
+                )
+
+                if rec1 is not None:
+                    a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
+                    bam_out.write(a1)
+                    per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
+                    total_written += 1
+                if rec2 is not None:
+                    a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
+                    bam_out.write(a2)
+                    per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
+                    total_written += 1
+
+                if rec1 is not None and rec2 is not None:
+                    paired_pairs_written += 1
+                else:
+                    if rec1 is not None:
+                        singletons_written += 1
+                    if rec2 is not None:
+                        singletons_written += 1
+
+        # Singles
+        it_singles = singles
+        if progress and it_singles:
+            it_singles = tqdm(it_singles, desc="Single FASTQ→BAM")
+        for pth in it_singles:
+            if not pth.exists():
+                raise FileNotFoundError(pth)
+            bc = per_path_barcode.get(pth, "barcode")
+            for rec in _fastq_iter(pth):
+                a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
+                bam_out.write(a)
+                per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
+                total_written += 1
+                singletons_written += 1
+
+    return {
+        "total_reads": total_written,
+        "per_file": {str(k): v for k, v in per_file_counts.items()},
+        "paired_pairs_written": paired_pairs_written,
+        "singletons_written": singletons_written,
+        "barcodes": barcodes_in_order,
+    }
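Below is a brief usage sketch for the new `concatenate_fastqs_to_bam` helper. The import path assumes the archived module is importable from its new location in the wheel, and the file names and barcode shown are hypothetical:

```python
from pathlib import Path

# Hypothetical import path, mirroring the file's new location in the wheel
from smftools.informatics.archived.helpers.archived.concatenate_fastqs_to_bam import (
    concatenate_fastqs_to_bam,
)

# A flat list is auto-paired on _R1/_R2 filename tokens; explicit (R1, R2) tuples also work.
stats = concatenate_fastqs_to_bam(
    fastq_files=[Path("run/sample1_R1.fastq.gz"), Path("run/sample1_R2.fastq.gz")],
    output_bam="run/unaligned.bam",
    barcode_map={Path("run/sample1_R1.fastq.gz"): "barcode01"},  # optional override
    rg_sample_field="sample1",
)
print(stats["total_reads"], stats["paired_pairs_written"], stats["barcodes"])
```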
smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py

@@ -14,7 +14,7 @@ def count_aligned_reads(bam_file):
         record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
 
     """
-    from
+    from ... import readwrite
     import pysam
     from tqdm import tqdm
     from collections import defaultdict
@@ -25,7 +25,7 @@ def count_aligned_reads(bam_file):
     # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
     record_counts = defaultdict(int)
 
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped + bam.unmapped
         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
         for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py

@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-
-
-
-
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
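The demux rewrite replaces the old in-place `bam_files.sort()` with a single `sorted()` over a `Path.glob`, which discovers and orders the split BAMs in one expression while skipping unclassified reads. A standalone sketch of the same pattern, with an illustrative directory and suffix:

```python
from pathlib import Path

split_dir = Path("demux_output")  # hypothetical dorado demux output directory
bam_suffix = ".bam"

# Keep only regular files whose final suffix matches, and drop
# dorado's unclassified-reads BAM; sorted() gives a deterministic order.
bam_files = sorted(
    p for p in split_dir.glob(f"*{bam_suffix}")
    if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
)
```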
smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py

@@ -27,7 +27,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
         ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):
smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py

@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
    import glob
    import zipfile
 
-    os.chdir(mod_tsv_dir)
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
    if threads:
        threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
    for input_file in bam_files:
        print(input_file)
        # Extract the file basename
-        file_name =
+        file_name = input_file.name
        if skip_unclassified and "unclassified" in file_name:
            print("Skipping modkit extract on unclassified reads")
        else:
            # Construct the output TSV file path
-
-
-            if
-            print(f"{
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
            else:
                print(f"Extracting modification data from {input_file}")
                if modkit_summary:
                    # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                else:
                    pass
                # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
                        "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                else:
                    extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"m:{m5C_threshold}",
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                subprocess.run(extract_command)
                # Zip the output TSV
                print(f'zipping {output_tsv}')
                if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
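For reference, a minimal sketch of the per-BAM output naming this loop aims at, assuming `Path` inputs rather than the `str`/`Path` mix in the diff above; the parentheses make the string concatenation happen before the `Path` join, and `with_suffix` builds the gzipped name:

```python
from pathlib import Path

mod_tsv_dir = Path("mod_tsvs")           # hypothetical TSV output directory
input_file = Path("demux/sample1.bam")   # one demuxed BAM from split_dir

# <mod_tsv_dir>/<bam stem>_extract.tsv, and the file pigz -f will leave behind
output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")

if output_tsv_gz.exists():
    print(f"{output_tsv_gz} already exists, skipping modkit extract")
```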
smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py

@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py

@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path =
-    if
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir =
-    fasta_basename =
-    chrom_basename =
-    chrom_path =
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
smftools/informatics/archived/helpers/archived/index_fasta.py

@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
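A quick usage sketch for the new `index_fasta` helper; the FASTA path is illustrative and the import path assumes the module's new archived location:

```python
from smftools.informatics.archived.helpers.archived.index_fasta import index_fasta

# Writes reference.fasta.fai via pysam.faidx, then derives reference.chrom.sizes
chrom_sizes = index_fasta("reference.fasta")
print(chrom_sizes)  # reference.chrom.sizes

# Index only, returning the .fai path instead
fai = index_fasta("reference.fasta", write_chrom_sizes=False)
```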
smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py

@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
smftools/informatics/{helpers → archived/helpers/archived}/modQC.py

@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py

@@ -1,24 +1,5 @@
 # plot_bed_histograms
 
-def plot_bed_histograms(bed_file, plotting_directory, fasta):
-    """
-    Plots read length, coverage, mapq, read quality stats for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive metrics from.
-        plot_directory (str): Path to the directory to write out historgrams.
-        fasta (str): Path to FASTA corresponding to bed
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    # plot_bed_histograms.py
-
 def plot_bed_histograms(
     bed_file,
     plotting_directory,
smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py

@@ -15,13 +15,14 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base =
-    bam_base_minus_suffix =
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
@@ -32,8 +33,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
                 #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path =
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py

@@ -12,21 +12,21 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
         None
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix =
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-
+        pysam.index(input_file)
 
     return bam_files
smftools/informatics/archived/subsample_fasta_from_bed.py

@@ -0,0 +1,49 @@
+from pathlib import Path
+from pyfaidx import Fasta
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])  # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
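Finally, a usage sketch for the archived subsampler, assuming a standard BED with optional name columns; all paths here are hypothetical:

```python
from smftools.informatics.archived.subsample_fasta_from_bed import subsample_fasta_from_bed

# Writes windows/regions.fasta with one record per BED interval,
# headers formatted as >chrom:start-end [optional description]
subsample_fasta_from_bed(
    input_FASTA="genome.fa",
    input_bed="regions.bed",
    output_directory="windows",
    output_FASTA="regions.fasta",
)
```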