PyPI - smftools - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137) hide show

smftools/__init__.py +6 -8
smftools/_settings.py +4 -6
smftools/_version.py +1 -1
smftools/cli/helpers.py +54 -0
smftools/cli/hmm_adata.py +937 -256
smftools/cli/load_adata.py +448 -268
smftools/cli/preprocess_adata.py +469 -263
smftools/cli/spatial_adata.py +536 -319
smftools/cli_entry.py +97 -182
smftools/config/__init__.py +1 -1
smftools/config/conversion.yaml +17 -6
smftools/config/deaminase.yaml +12 -10
smftools/config/default.yaml +142 -33
smftools/config/direct.yaml +11 -3
smftools/config/discover_input_files.py +19 -5
smftools/config/experiment_config.py +594 -264
smftools/constants.py +37 -0
smftools/datasets/__init__.py +2 -8
smftools/datasets/datasets.py +32 -18
smftools/hmm/HMM.py +2128 -1418
smftools/hmm/__init__.py +2 -9
smftools/hmm/archived/call_hmm_peaks.py +121 -0
smftools/hmm/call_hmm_peaks.py +299 -91
smftools/hmm/display_hmm.py +19 -6
smftools/hmm/hmm_readwrite.py +13 -4
smftools/hmm/nucleosome_hmm_refinement.py +102 -14
smftools/informatics/__init__.py +30 -7
smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
smftools/informatics/archived/print_bam_query_seq.py +7 -1
smftools/informatics/bam_functions.py +397 -175
smftools/informatics/basecalling.py +51 -9
smftools/informatics/bed_functions.py +90 -57
smftools/informatics/binarize_converted_base_identities.py +18 -7
smftools/informatics/complement_base_list.py +7 -6
smftools/informatics/converted_BAM_to_adata.py +265 -122
smftools/informatics/fasta_functions.py +161 -83
smftools/informatics/h5ad_functions.py +196 -30
smftools/informatics/modkit_extract_to_adata.py +609 -270
smftools/informatics/modkit_functions.py +85 -44
smftools/informatics/ohe.py +44 -21
smftools/informatics/pod5_functions.py +112 -73
smftools/informatics/run_multiqc.py +20 -14
smftools/logging_utils.py +51 -0
smftools/machine_learning/__init__.py +2 -7
smftools/machine_learning/data/anndata_data_module.py +143 -50
smftools/machine_learning/data/preprocessing.py +2 -1
smftools/machine_learning/evaluation/__init__.py +1 -1
smftools/machine_learning/evaluation/eval_utils.py +11 -14
smftools/machine_learning/evaluation/evaluators.py +46 -33
smftools/machine_learning/inference/__init__.py +1 -1
smftools/machine_learning/inference/inference_utils.py +7 -4
smftools/machine_learning/inference/lightning_inference.py +9 -13
smftools/machine_learning/inference/sklearn_inference.py +6 -8
smftools/machine_learning/inference/sliding_window_inference.py +35 -25
smftools/machine_learning/models/__init__.py +10 -5
smftools/machine_learning/models/base.py +28 -42
smftools/machine_learning/models/cnn.py +15 -11
smftools/machine_learning/models/lightning_base.py +71 -40
smftools/machine_learning/models/mlp.py +13 -4
smftools/machine_learning/models/positional.py +3 -2
smftools/machine_learning/models/rnn.py +3 -2
smftools/machine_learning/models/sklearn_models.py +39 -22
smftools/machine_learning/models/transformer.py +68 -53
smftools/machine_learning/models/wrappers.py +2 -1
smftools/machine_learning/training/__init__.py +2 -2
smftools/machine_learning/training/train_lightning_model.py +29 -20
smftools/machine_learning/training/train_sklearn_model.py +9 -15
smftools/machine_learning/utils/__init__.py +1 -1
smftools/machine_learning/utils/device.py +7 -4
smftools/machine_learning/utils/grl.py +3 -1
smftools/metadata.py +443 -0
smftools/plotting/__init__.py +19 -5
smftools/plotting/autocorrelation_plotting.py +145 -44
smftools/plotting/classifiers.py +162 -72
smftools/plotting/general_plotting.py +422 -197
smftools/plotting/hmm_plotting.py +42 -13
smftools/plotting/position_stats.py +147 -87
smftools/plotting/qc_plotting.py +20 -12
smftools/preprocessing/__init__.py +10 -12
smftools/preprocessing/append_base_context.py +115 -80
smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
smftools/preprocessing/binarize.py +21 -4
smftools/preprocessing/binarize_on_Youden.py +129 -31
smftools/preprocessing/binary_layers_to_ohe.py +17 -11
smftools/preprocessing/calculate_complexity_II.py +86 -59
smftools/preprocessing/calculate_consensus.py +28 -19
smftools/preprocessing/calculate_coverage.py +50 -25
smftools/preprocessing/calculate_pairwise_differences.py +2 -1
smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
smftools/preprocessing/calculate_position_Youden.py +118 -54
smftools/preprocessing/calculate_read_length_stats.py +52 -23
smftools/preprocessing/calculate_read_modification_stats.py +91 -57
smftools/preprocessing/clean_NaN.py +38 -28
smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
smftools/preprocessing/flag_duplicate_reads.py +689 -272
smftools/preprocessing/invert_adata.py +26 -11
smftools/preprocessing/load_sample_sheet.py +40 -22
smftools/preprocessing/make_dirs.py +8 -3
smftools/preprocessing/min_non_diagonal.py +2 -1
smftools/preprocessing/recipes.py +56 -23
smftools/preprocessing/reindex_references_adata.py +103 -0
smftools/preprocessing/subsample_adata.py +33 -16
smftools/readwrite.py +331 -82
smftools/schema/__init__.py +11 -0
smftools/schema/anndata_schema_v1.yaml +227 -0
smftools/tools/__init__.py +3 -4
smftools/tools/archived/classifiers.py +163 -0
smftools/tools/archived/subset_adata_v1.py +10 -1
smftools/tools/archived/subset_adata_v2.py +12 -1
smftools/tools/calculate_umap.py +54 -15
smftools/tools/cluster_adata_on_methylation.py +115 -46
smftools/tools/general_tools.py +70 -25
smftools/tools/position_stats.py +229 -98
smftools/tools/read_stats.py +50 -29
smftools/tools/spatial_autocorrelation.py +365 -192
smftools/tools/subset_adata.py +23 -21
{smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
smftools-0.2.5.dist-info/RECORD +181 -0
smftools-0.2.3.dist-info/RECORD +0 -173
/smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
/smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
/smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
/smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
/smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
/smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
/smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/basecalling.py CHANGED Viewed

@@ -1,7 +1,17 @@
 import subprocess
-from pathlib import Path
-def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+def canoncall(
+    model_dir,
+    model,
+    pod5_dir,
+    barcode_kit,
+    bam,
+    bam_suffix,
+    barcode_both_ends=True,
+    trim=False,
+    device="auto",
+):
     """
     Wrapper function for dorado canonical base calling.
@@ -15,13 +25,24 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
         barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
         trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
         device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
     Returns:
         None
             Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
     """
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    command = [
+        "dorado",
+        "basecaller",
+        "--models-directory",
+        model_dir,
+        "--kit-name",
+        barcode_kit,
+        "--device",
+        device,
+        "--batchsize",
+        "0",
+    ]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
     if not trim:
@@ -32,7 +53,19 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
     with open(output, "w") as outfile:
         subprocess.run(command, stdout=outfile)
-def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+def modcall(
+    model_dir,
+    model,
+    pod5_dir,
+    barcode_kit,
+    mod_list,
+    bam,
+    bam_suffix,
+    barcode_both_ends=True,
+    trim=False,
+    device="auto",
+):
     """
     Wrapper function for dorado modified base calling.
@@ -47,14 +80,23 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
         barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
         trim (bool): Whether to trim barcodes, adapters, and primers from read ends
         device (str): Device to use for basecalling. auto, metal, cpu, cuda.
     Returns:
         None
             Outputs a BAM file holding the modified base calls output by the dorado basecaller.
     """
     import subprocess
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+    command = [
+        "dorado",
+        "basecaller",
+        "--models-directory",
+        model_dir,
+        "--kit-name",
+        barcode_kit,
+        "--modified-bases",
+    ]
     command += mod_list
     command += ["--device", device, "--batchsize", "0"]
     if barcode_both_ends:
@@ -62,6 +104,6 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
     if not trim:
         command.append("--no-trim")
     command += [model, pod5_dir]
-    print(f'Running: {" ".join(command)}')
+    print(f"Running: {' '.join(command)}")
     with open(output, "w") as outfile:
-        subprocess.run(command, stdout=outfile)
+        subprocess.run(command, stdout=outfile)

smftools/informatics/bed_functions.py CHANGED Viewed

@@ -1,20 +1,22 @@
-from pathlib import Path
+import concurrent.futures
 import os
-import subprocess
-from typing import List, Optional, Union
-import pysam
-import pybedtools
-import pyBigWig
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import concurrent.futures
-from concurrent.futures import ProcessPoolExecutor
+import pybedtools
+import pyBigWig
+import pysam
-import matplotlib.pyplot as plt
+from smftools.logging_utils import get_logger
 from ..readwrite import make_dirs
+logger = get_logger(__name__)
 def _bed_to_bigwig(fasta: str, bed: str) -> str:
     """
     BED → bedGraph → bigWig
@@ -33,14 +35,14 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     bigwig = parent / f"{stem}.bw"
     # 1) Compute coverage → bedGraph
-    print(f"[pybedtools] generating coverage bedgraph from {bed}")
+    logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
     bt = pybedtools.BedTool(str(bed))
     # bedtools genomecov -bg
     coverage = bt.genome_coverage(bg=True, genome=str(fai))
     coverage.saveas(str(bedgraph))
     # 2) Convert bedGraph → BigWig via pyBigWig
-    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+    logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
     # read chrom sizes from the FASTA .fai index
     chrom_sizes = {}
@@ -61,9 +63,10 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     bw.close()
-    print(f"BigWig written: {bigwig}")
+    logger.debug(f"BigWig written: {bigwig}")
     return str(bigwig)
 def _plot_bed_histograms(
     bed_file,
     plotting_directory,
@@ -71,9 +74,9 @@ def _plot_bed_histograms(
     *,
     bins=60,
     clip_quantiles=(0.0, 0.995),
-    cov_bin_size=1000,       # coverage bin size in bp
-    rows_per_fig=6,          # paginate if many chromosomes
-    include_mapq_quality=True,   # add MAPQ + avg read quality columns to grid
+    cov_bin_size=1000,  # coverage bin size in bp
+    rows_per_fig=6,  # paginate if many chromosomes
+    include_mapq_quality=True,  # add MAPQ + avg read quality columns to grid
     coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
 ):
     """
@@ -113,19 +116,30 @@ def _plot_bed_histograms(
     os.makedirs(plotting_directory, exist_ok=True)
     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
-    print(f"[plot_bed_histograms] Loading: {bed_file}")
+    logger.debug(f"[plot_bed_histograms] Loading: {bed_file}")
     # Load BED-like table
-    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
-    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
-        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
-        'mapq': float, 'avg_q': float
-    })
+    cols = ["chrom", "start", "end", "read_len", "qname", "mapq", "avg_q"]
+    df = pd.read_csv(
+        bed_file,
+        sep="\t",
+        header=None,
+        names=cols,
+        dtype={
+            "chrom": str,
+            "start": int,
+            "end": int,
+            "read_len": int,
+            "qname": str,
+            "mapq": float,
+            "avg_q": float,
+        },
+    )
     # Drop unaligned records (chrom == '*') if present
-    df = df[df['chrom'] != '*'].copy()
+    df = df[df["chrom"] != "*"].copy()
     if df.empty:
-        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        logger.debug("[plot_bed_histograms] No aligned reads found; nothing to plot.")
         return
     # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
@@ -135,15 +149,16 @@ def _plot_bed_histograms(
     if coordinate_mode == "one_based":
         # convert to 0-based half-open [start0, end0)
-        start0 = df['start'].to_numpy() - 1
-        end0   = df['end'].to_numpy()   # inclusive in input -> +1 already handled by not subtracting
+        start0 = df["start"].to_numpy() - 1
+        end0 = df["end"].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
     else:
         # already 0-based half-open (assumption)
-        start0 = df['start'].to_numpy()
-        end0   = df['end'].to_numpy()
+        start0 = df["start"].to_numpy()
+        end0 = df["end"].to_numpy()
     # Clip helper for hist tails
     def _clip_series(s, q=(0.0, 0.995)):
+        """Clip a Series to quantile bounds for plotting."""
         if q is None:
             return s.to_numpy()
         lo = s.quantile(q[0]) if q[0] is not None else s.min()
@@ -157,42 +172,42 @@ def _plot_bed_histograms(
         ref_lengths = dict(zip(ref_names, fa.lengths))
     # Keep only chroms present in FASTA and with at least one read
-    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    chroms = [c for c in df["chrom"].unique() if c in ref_lengths]
     # Order chromosomes by FASTA order
     chrom_order = [c for c in ref_names if c in chroms]
     if not chrom_order:
-        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        logger.debug(
+            "[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting."
+        )
         return
     # Pagination
     def _sanitize(name: str) -> str:
+        """Sanitize a string for use in filenames."""
         return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
     cols_per_fig = 4 if include_mapq_quality else 2
     for start_idx in range(0, len(chrom_order), rows_per_fig):
-        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        chunk = chrom_order[start_idx : start_idx + rows_per_fig]
         nrows = len(chunk)
         ncols = cols_per_fig
         fig, axes = plt.subplots(
-            nrows=nrows, ncols=ncols,
-            figsize=(4.0 * ncols, 2.6 * nrows),
-            dpi=160,
-            squeeze=False
+            nrows=nrows, ncols=ncols, figsize=(4.0 * ncols, 2.6 * nrows), dpi=160, squeeze=False
         )
         for r, chrom in enumerate(chunk):
             chrom_len = ref_lengths[chrom]
-            mask = (df['chrom'].to_numpy() == chrom)
+            mask = df["chrom"].to_numpy() == chrom
             # Slice per-chrom arrays for speed
             s0 = start0[mask]
             e0 = end0[mask]
-            len_arr = df.loc[mask, 'read_len']
-            mapq_arr = df.loc[mask, 'mapq']
-            q_arr = df.loc[mask, 'avg_q']
+            len_arr = df.loc[mask, "read_len"]
+            mapq_arr = df.loc[mask, "mapq"]
+            q_arr = df.loc[mask, "avg_q"]
             # --- Col 1: Read length histogram (clipped) ---
             ax = axes[r, 0]
@@ -222,7 +237,7 @@ def _plot_bed_histograms(
             # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
             for lo, hi in zip(b_lo, b_hi):
-                cov[lo:hi + 1] += 1
+                cov[lo : hi + 1] += 1
             x_mid = (edges[:-1] + edges[1:]) / 2.0
             ax.plot(x_mid, cov)
@@ -237,7 +252,12 @@ def _plot_bed_histograms(
                 # --- Col 3: MAPQ ---
                 ax = axes[r, 2]
                 # Clip MAPQ upper tail if needed (usually 60)
-                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                ax.hist(
+                    _clip_series(mapq_arr.fillna(0), clip_quantiles),
+                    bins=bins,
+                    edgecolor="black",
+                    alpha=0.7,
+                )
                 if r == 0:
                     ax.set_title("MAPQ")
                 ax.set_xlabel("MAPQ")
@@ -245,7 +265,12 @@ def _plot_bed_histograms(
                 # --- Col 4: Avg base quality ---
                 ax = axes[r, 3]
-                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                ax.hist(
+                    _clip_series(q_arr.fillna(np.nan), clip_quantiles),
+                    bins=bins,
+                    edgecolor="black",
+                    alpha=0.7,
+                )
                 if r == 0:
                     ax.set_title("Avg base qual")
                 ax.set_xlabel("Phred")
@@ -254,7 +279,8 @@ def _plot_bed_histograms(
         fig.suptitle(
             f"{bed_basename} — per-chromosome QC "
             f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
-            y=0.995, fontsize=11
+            y=0.995,
+            fontsize=11,
         )
         fig.tight_layout(rect=[0, 0, 1, 0.98])
@@ -263,7 +289,8 @@ def _plot_bed_histograms(
         plt.savefig(out_png, bbox_inches="tight")
         plt.close(fig)
-    print("[plot_bed_histograms] Done.")
+    logger.debug("[plot_bed_histograms] Done.")
 def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
@@ -287,9 +314,9 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     bed_dir = out_dir / "beds"
     make_dirs([plotting_dir, bed_dir])
-    bed_output = bed_dir /  str(aligned_BAM.name).replace(".bam", "_bed.bed")
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
-    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+    logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
         for read in bam.fetch(until_eof=True):
@@ -317,20 +344,24 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
-    print(f"BED-like file created: {bed_output}")
+    logger.debug(f"BED-like file created: {bed_output}")
     def split_bed(bed):
         """Splits into aligned and unaligned reads (chrom == '*')."""
         bed = str(bed)
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
-        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+        with (
+            open(bed, "r") as infile,
+            open(aligned, "w") as aligned_out,
+            open(unaligned, "w") as unaligned_out,
+        ):
             for line in infile:
                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
         os.remove(bed)
         return aligned
-    print(f"Splitting: {bed_output}")
+    logger.debug(f"Splitting: {bed_output}")
     aligned_bed = split_bed(bed_output)
     with ProcessPoolExecutor() as executor:
@@ -340,7 +371,8 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
             futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
         concurrent.futures.wait(futures)
-    print("Processing completed successfully.")
+    logger.debug("Processing completed successfully.")
 def extract_read_lengths_from_bed(file_path):
     """
@@ -352,15 +384,16 @@ def extract_read_lengths_from_bed(file_path):
         read_dict (dict)
     """
     import pandas as pd
-    columns = ['chrom', 'start', 'end', 'length', 'name']
-    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+    columns = ["chrom", "start", "end", "length", "name"]
+    df = pd.read_csv(file_path, sep="\t", header=None, names=columns, comment="#")
     read_dict = {}
     for _, row in df.iterrows():
-        chrom = row['chrom']
-        start = row['start']
-        end = row['end']
-        name = row['name']
-        length = row['length']
+        chrom = row["chrom"]
+        start = row["start"]
+        end = row["end"]
+        name = row["name"]
+        length = row["length"]
         read_dict[name] = length
-    return read_dict
+    return read_dict

smftools/informatics/binarize_converted_base_identities.py CHANGED Viewed

@@ -1,4 +1,13 @@
-def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read={}, on_missing="nan"):
+def binarize_converted_base_identities(
+    base_identities,
+    strand,
+    modification_type,
+    bam,
+    device="cpu",
+    deaminase_footprinting=False,
+    mismatch_trend_per_read={},
+    on_missing="nan",
+):
     """
     Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
@@ -10,7 +19,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
         deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
         mismatch_trend_per_read (dict): For deaminase footprinting, indicates the type of conversion relative to the top strand reference for each read. (C->T or G->A if bottom strand was converted)
         on_missing (str): Error handling if a read is missing
     Returns:
         dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
         If deaminase_footprinting, 1 represents deaminated sites, while 0 represents non-deaminated sites.
@@ -64,14 +73,16 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
     # Non-deaminase mapping (bisulfite-style for 5mC; 6mA mapping is protocol dependent)
     bin_maps = {
-        ("top", "5mC"):    {"C": 1.0, "T": 0.0},
+        ("top", "5mC"): {"C": 1.0, "T": 0.0},
         ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
-        ("top", "6mA"):    {"A": 1.0, "G": 0.0},
+        ("top", "6mA"): {"A": 1.0, "G": 0.0},
         ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
     }
     key = (strand, modification_type)
     if key not in bin_maps:
-        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+        raise ValueError(
+            f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'"
+        )
     base_map = bin_maps[key]
@@ -110,7 +121,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
     #         binarized_base_identities[key] = binarized
     #     return binarized_base_identities
     # else:
     #     binarization_maps = {
     #         ('top', '5mC'): {'C': 1, 'T': 0},
@@ -152,7 +163,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
     # # Fetch the appropriate mapping
     # base_map = binarization_maps[(strand, modification_type)]
     # # Convert mapping to tensor
     # base_keys = list(base_map.keys())
     # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)

smftools/informatics/complement_base_list.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # complement_base_list
 def complement_base_list(sequence):
     """
     Takes a list of DNA base identities and returns their complement.
@@ -11,11 +12,11 @@ def complement_base_list(sequence):
         complement (list): A list of complementary DNA bases.
     """
     complement_mapping = {
-        'A': 'T',
-        'T': 'A',
-        'C': 'G',
-        'G': 'C',
-        'N': 'N'  # Handling ambiguous bases like 'N'
+        "A": "T",
+        "T": "A",
+        "C": "G",
+        "G": "C",
+        "N": "N",  # Handling ambiguous bases like 'N'
     }
-    return [complement_mapping[base] for base in sequence]
+    return [complement_mapping[base] for base in sequence]

smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl