smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. smftools/__init__.py +9 -4
  2. smftools/_version.py +1 -1
  3. smftools/cli.py +184 -0
  4. smftools/config/__init__.py +1 -0
  5. smftools/config/conversion.yaml +33 -0
  6. smftools/config/deaminase.yaml +56 -0
  7. smftools/config/default.yaml +253 -0
  8. smftools/config/direct.yaml +17 -0
  9. smftools/config/experiment_config.py +1191 -0
  10. smftools/hmm/HMM.py +1576 -0
  11. smftools/hmm/__init__.py +20 -0
  12. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  13. smftools/hmm/call_hmm_peaks.py +106 -0
  14. smftools/{tools → hmm}/display_hmm.py +3 -3
  15. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  16. smftools/{tools → hmm}/train_hmm.py +1 -1
  17. smftools/informatics/__init__.py +0 -2
  18. smftools/informatics/archived/deaminase_smf.py +132 -0
  19. smftools/informatics/fast5_to_pod5.py +4 -1
  20. smftools/informatics/helpers/__init__.py +3 -4
  21. smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
  22. smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
  23. smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
  24. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
  25. smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
  26. smftools/informatics/helpers/discover_input_files.py +100 -0
  27. smftools/informatics/helpers/extract_base_identities.py +29 -3
  28. smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
  29. smftools/informatics/helpers/find_conversion_sites.py +5 -4
  30. smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
  31. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  32. smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
  33. smftools/informatics/helpers/split_and_index_BAM.py +1 -5
  34. smftools/load_adata.py +1346 -0
  35. smftools/machine_learning/__init__.py +12 -0
  36. smftools/machine_learning/data/__init__.py +2 -0
  37. smftools/machine_learning/data/anndata_data_module.py +234 -0
  38. smftools/machine_learning/evaluation/__init__.py +2 -0
  39. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  40. smftools/machine_learning/evaluation/evaluators.py +223 -0
  41. smftools/machine_learning/inference/__init__.py +3 -0
  42. smftools/machine_learning/inference/inference_utils.py +27 -0
  43. smftools/machine_learning/inference/lightning_inference.py +68 -0
  44. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  45. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  46. smftools/machine_learning/models/base.py +295 -0
  47. smftools/machine_learning/models/cnn.py +138 -0
  48. smftools/machine_learning/models/lightning_base.py +345 -0
  49. smftools/machine_learning/models/mlp.py +26 -0
  50. smftools/{tools → machine_learning}/models/positional.py +3 -2
  51. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  52. smftools/machine_learning/models/sklearn_models.py +273 -0
  53. smftools/machine_learning/models/transformer.py +303 -0
  54. smftools/machine_learning/training/__init__.py +2 -0
  55. smftools/machine_learning/training/train_lightning_model.py +135 -0
  56. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  57. smftools/plotting/__init__.py +4 -1
  58. smftools/plotting/autocorrelation_plotting.py +611 -0
  59. smftools/plotting/general_plotting.py +566 -89
  60. smftools/plotting/hmm_plotting.py +260 -0
  61. smftools/plotting/qc_plotting.py +270 -0
  62. smftools/preprocessing/__init__.py +13 -8
  63. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  64. smftools/preprocessing/append_base_context.py +122 -0
  65. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  66. smftools/preprocessing/calculate_complexity_II.py +248 -0
  67. smftools/preprocessing/calculate_coverage.py +10 -1
  68. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  69. smftools/preprocessing/clean_NaN.py +17 -1
  70. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  71. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  72. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  73. smftools/preprocessing/invert_adata.py +12 -5
  74. smftools/preprocessing/load_sample_sheet.py +19 -4
  75. smftools/readwrite.py +849 -43
  76. smftools/tools/__init__.py +3 -32
  77. smftools/tools/calculate_umap.py +5 -5
  78. smftools/tools/general_tools.py +3 -3
  79. smftools/tools/position_stats.py +468 -106
  80. smftools/tools/read_stats.py +115 -1
  81. smftools/tools/spatial_autocorrelation.py +562 -0
  82. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
  83. smftools-0.2.1.dist-info/RECORD +161 -0
  84. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  85. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  86. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  87. smftools/informatics/load_adata.py +0 -182
  88. smftools/preprocessing/append_C_context.py +0 -82
  89. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  90. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  91. smftools/preprocessing/filter_reads_on_length.py +0 -51
  92. smftools/tools/call_hmm_peaks.py +0 -105
  93. smftools/tools/data/__init__.py +0 -2
  94. smftools/tools/data/anndata_data_module.py +0 -90
  95. smftools/tools/evaluation/__init__.py +0 -0
  96. smftools/tools/inference/__init__.py +0 -1
  97. smftools/tools/inference/lightning_inference.py +0 -41
  98. smftools/tools/models/base.py +0 -14
  99. smftools/tools/models/cnn.py +0 -34
  100. smftools/tools/models/lightning_base.py +0 -41
  101. smftools/tools/models/mlp.py +0 -17
  102. smftools/tools/models/sklearn_models.py +0 -40
  103. smftools/tools/models/transformer.py +0 -133
  104. smftools/tools/training/__init__.py +0 -1
  105. smftools/tools/training/train_lightning_model.py +0 -47
  106. smftools-0.1.7.dist-info/RECORD +0 -136
  107. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  108. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  109. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  110. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  111. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  112. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  113. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  114. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  115. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  116. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  117. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  118. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  119. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  120. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/converted_BAM_to_adata_II.py
@@ -11,7 +11,11 @@ import traceback
  import gzip
  import torch
 
- from .. import readwrite
+ import shutil
+ from pathlib import Path
+ from typing import Union, Iterable, Optional
+
+ from ... import readwrite
  from .binarize_converted_base_identities import binarize_converted_base_identities
  from .find_conversion_sites import find_conversion_sites
  from .count_aligned_reads import count_aligned_reads
@@ -22,7 +26,17 @@ from .ohe_batching import ohe_batching
  if __name__ == "__main__":
  multiprocessing.set_start_method("forkserver", force=True)
 
- def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device='cpu', num_threads=8):
+ def converted_BAM_to_adata_II(converted_FASTA,
+ split_dir,
+ mapping_threshold,
+ experiment_name,
+ conversions,
+ bam_suffix,
+ device='cpu',
+ num_threads=8,
+ deaminase_footprinting=False,
+ delete_intermediates=True
+ ):
  """
  Converts BAM files into an AnnData object by binarizing modified base identities.
 
@@ -31,9 +45,10 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
  split_dir (str): Directory containing converted BAM files.
  mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
  experiment_name (str): Name for the output AnnData object.
- conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+ conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
  bam_suffix (str): File suffix for BAM files.
  num_threads (int): Number of parallel processing threads.
+ deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
  Returns:
  str: Path to the final AnnData object.
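For orientation, a caller sketch under the new signature shown above; judging from the return statements changed further down in this file, the function now returns a (AnnData, path) tuple, with the AnnData slot left as None when an existing .h5ad.gz is reused. All argument values here are made-up examples, not package defaults.

    final_adata, final_adata_path = converted_BAM_to_adata_II(
        converted_FASTA="refs/converted.fasta",   # hypothetical path
        split_dir="results/split_bams",           # hypothetical path
        mapping_threshold=0.8,
        experiment_name="exp1",
        conversions=["unconverted", "5mC"],
        bam_suffix=".bam",
        device="cpu",
        num_threads=8,
        deaminase_footprinting=False,
        delete_intermediates=True,
    )
    if final_adata is None:
        print("Reusing existing AnnData at", final_adata_path)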
@@ -48,14 +63,15 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
  print(f"Using device: {device}")
 
  ## Set Up Directories and File Paths
- parent_dir = os.path.dirname(split_dir)
- h5_dir = os.path.join(parent_dir, 'h5ads')
- tmp_dir = os.path.join(parent_dir, 'tmp')
+ #parent_dir = os.path.dirname(split_dir)
+ h5_dir = os.path.join(split_dir, 'h5ads')
+ tmp_dir = os.path.join(split_dir, 'tmp')
+ final_adata = None
  final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{os.path.basename(split_dir)}.h5ad.gz')
 
  if os.path.exists(final_adata_path):
  print(f"{final_adata_path} already exists. Using existing AnnData object.")
- return final_adata_path
+ return final_adata, final_adata_path
 
  make_dirs([h5_dir, tmp_dir])
 
@@ -66,32 +82,46 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
  print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
  ## Process Conversion Sites
- max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversion_types)
+ max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversions, deaminase_footprinting)
 
  ## Filter BAM Files by Mapping Threshold
  records_to_analyze = filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold)
 
  ## Process BAMs in Parallel
- final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device)
+ final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
  for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
  final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
  final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
  final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
 
+ final_adata.obs_names_make_unique()
+ cols = final_adata.obs.columns
+
+ # Make obs cols categorical
+ for col in cols:
+ final_adata.obs[col] = final_adata.obs[col].astype('category')
+
  ## Save Final AnnData
- # print(f"Saving AnnData to {final_adata_path}")
- # final_adata.write_h5ad(final_adata_path, compression='gzip')
+ print(f"Saving AnnData to {final_adata_path}")
+ backup_dir=os.path.join(os.path.dirname(final_adata_path), 'adata_accessory_data')
+ readwrite.safe_write_h5ad(final_adata, final_adata_path, compression='gzip', backup=True, backup_dir=backup_dir)
+
+ ## Delete intermediate h5ad files and temp directories
+ if delete_intermediates:
+ delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)
+
  return final_adata, final_adata_path
 
 
- def process_conversion_sites(converted_FASTA, conversion_types):
+ def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC'], deaminase_footprinting=False):
  """
  Extracts conversion sites and determines the max reference length.
 
  Parameters:
  converted_FASTA (str): Path to the converted reference FASTA.
- conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+ conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+ deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
  Returns:
  max_reference_length (int): The length of the longest sequence.
@@ -101,11 +131,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
  record_FASTA_dict = {}
  chromosome_FASTA_dict = {}
  max_reference_length = 0
- unconverted = conversion_types[0]
- conversions = conversion_types[1:]
+ unconverted = conversions[0]
+ conversion_types = conversions[1:]
 
  # Process the unconverted sequence once
- modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversion_types)
+ modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversions, deaminase_footprinting)
  # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
  # Get **max sequence length** from unconverted records
@@ -114,7 +144,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
  # Add **unconverted records** to `record_FASTA_dict`
  for record, values in modification_dict[unconverted].items():
  sequence_length, top_coords, bottom_coords, sequence, complement = values
- chromosome = record.replace(f"_{unconverted}_top", "")
+
+ if not deaminase_footprinting:
+ chromosome = record.replace(f"_{unconverted}_top", "")
+ else:
+ chromosome = record
 
  # Store **original sequence**
  record_FASTA_dict[record] = [
@@ -127,13 +161,17 @@ def process_conversion_sites(converted_FASTA, conversion_types):
  chromosome_FASTA_dict[chromosome] = [sequence + "N" * (max_reference_length - sequence_length), complement + "N" * (max_reference_length - sequence_length)]
 
  # Process converted records
- for conversion in conversions:
- modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversion_types)
+ for conversion in conversion_types:
+ modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversions, deaminase_footprinting)
  # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
  for record, values in modification_dict[conversion].items():
  sequence_length, top_coords, bottom_coords, sequence, complement = values
- chromosome = record.split(f"_{unconverted}_")[0] # Extract chromosome name
+
+ if not deaminase_footprinting:
+ chromosome = record.split(f"_{unconverted}_")[0] # Extract chromosome name
+ else:
+ chromosome = record
 
  # Add **both strands** for converted records
  for strand in ["top", "bottom"]:
@@ -168,7 +206,7 @@ def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold
  return records_to_analyze
 
 
- def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tmp_dir, max_reference_length, device):
+ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting):
  """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
  adata_list = []
 
@@ -177,9 +215,11 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
  chromosome = record_FASTA_dict[record][2]
  current_length = record_FASTA_dict[record][4]
  mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
+ sequence = chromosome_FASTA_dict[chromosome][0]
 
  # Extract Base Identities
- fwd_bases, rev_bases = extract_base_identities(bam, record, range(current_length), max_reference_length)
+ fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = extract_base_identities(bam, record, range(current_length), max_reference_length, sequence)
+ mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
  # Skip processing if both forward and reverse base identities are empty
  if not fwd_bases and not rev_bases:
@@ -190,11 +230,11 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
 
  # Binarize the Base Identities if they exist
  if fwd_bases:
- fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device)
+ fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device,deaminase_footprinting, mismatch_trend_per_read)
  merged_bin.update(fwd_bin)
 
  if rev_bases:
- rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device)
+ rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
  merged_bin.update(rev_bin)
 
  # Skip if merged_bin is empty (no valid binarized data)
@@ -257,11 +297,18 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
  adata.obs_names = bin_df.index.astype(str)
  adata.var_names = bin_df.columns.astype(str)
  adata.obs["Sample"] = [sample] * len(adata)
+ try:
+ barcode = sample.split('barcode')[1]
+ except:
+ barcode = np.nan
+ adata.obs["Barcode"] = [int(barcode)] * len(adata)
+ adata.obs["Barcode"] = adata.obs["Barcode"].astype(str)
  adata.obs["Reference"] = [chromosome] * len(adata)
  adata.obs["Strand"] = [strand] * len(adata)
  adata.obs["Dataset"] = [mod_type] * len(adata)
  adata.obs["Reference_dataset_strand"] = [f"{chromosome}_{mod_type}_{strand}"] * len(adata)
  adata.obs["Reference_strand"] = [f"{chromosome}_{strand}"] * len(adata)
+ adata.obs["Read_mismatch_trend"] = adata.obs_names.map(mismatch_trend_series)
 
  # Attach One-Hot Encodings to Layers
  adata.layers["A_binary_encoding"] = df_A
@@ -279,7 +326,7 @@ def timestamp():
  return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
- def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue):
+ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
  """Worker function that processes a single BAM and writes the output to an H5AD file."""
  worker_id = current_process().pid # Get worker process ID
  sample = os.path.basename(bam).split(sep=".bam")[0]
@@ -302,7 +349,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
  return
 
  # Process BAM
- adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, tmp_dir, max_reference_length, device)
+ adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
  if adata is not None:
  adata.write_h5ad(h5ad_path)
@@ -318,7 +365,7 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
  print(f"{timestamp()} [Worker {worker_id}] ERROR while processing {sample}:\n{traceback.format_exc()}")
  progress_queue.put(sample) # Still signal completion to prevent deadlock
 
- def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device):
+ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
  """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
  os.makedirs(h5_dir, exist_ok=True) # Ensure h5_dir exists
 
@@ -337,7 +384,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
 
  with Pool(processes=num_threads) as pool:
  results = [
- pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue))
+ pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue))
  for i, bam in enumerate(bam_path_list)
  ]
 
@@ -366,4 +413,93 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
  final_adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files], join="outer")
 
  print(f"{timestamp()} Successfully generated final AnnData object.")
- return final_adata
+ return final_adata
+
+ def delete_intermediate_h5ads_and_tmpdir(
+ h5_dir: Union[str, Path, Iterable[str], None],
+ tmp_dir: Optional[Union[str, Path]] = None,
+ *,
+ dry_run: bool = False,
+ verbose: bool = True,
+ ):
+ """
+ Delete intermediate .h5ad files and a temporary directory.
+
+ Parameters
+ ----------
+ h5_dir : str | Path | iterable[str] | None
+ If a directory path is given, all files directly inside it will be considered.
+ If an iterable of file paths is given, those files will be considered.
+ Only files ending with '.h5ad' (and not ending with '.gz') are removed.
+ tmp_dir : str | Path | None
+ Path to a directory to remove recursively (e.g. a temp dir created earlier).
+ dry_run : bool
+ If True, print what *would* be removed but do not actually delete.
+ verbose : bool
+ Print progress / warnings.
+ """
+ # Helper: remove a single file path (Path-like or string)
+ def _maybe_unlink(p: Path):
+ if not p.exists():
+ if verbose:
+ print(f"[skip] not found: {p}")
+ return
+ if not p.is_file():
+ if verbose:
+ print(f"[skip] not a file: {p}")
+ return
+ if dry_run:
+ print(f"[dry-run] would remove file: {p}")
+ return
+ try:
+ p.unlink()
+ if verbose:
+ print(f"Removed file: {p}")
+ except Exception as e:
+ print(f"[error] failed to remove file {p}: {e}")
+
+ # Handle h5_dir input (directory OR iterable of file paths)
+ if h5_dir is not None:
+ # If it's a path to a directory, iterate its children
+ if isinstance(h5_dir, (str, Path)) and Path(h5_dir).is_dir():
+ dpath = Path(h5_dir)
+ for p in dpath.iterdir():
+ # only target top-level files (not recursing); require '.h5ad' suffix and exclude gz
+ name = p.name.lower()
+ if name.endswith(".h5ad") and not name.endswith(".gz"):
+ _maybe_unlink(p)
+ else:
+ if verbose:
+ # optional: comment this out if too noisy
+ print(f"[skip] not matching pattern: {p.name}")
+ else:
+ # treat as iterable of file paths
+ for f in h5_dir:
+ p = Path(f)
+ name = p.name.lower()
+ if name.endswith(".h5ad") and not name.endswith(".gz"):
+ _maybe_unlink(p)
+ else:
+ if verbose:
+ print(f"[skip] not matching pattern or not a file: {p}")
+
+ # Remove tmp_dir recursively (if provided)
+ if tmp_dir is not None:
+ td = Path(tmp_dir)
+ if not td.exists():
+ if verbose:
+ print(f"[skip] tmp_dir not found: {td}")
+ else:
+ if not td.is_dir():
+ if verbose:
+ print(f"[skip] tmp_dir is not a directory: {td}")
+ else:
+ if dry_run:
+ print(f"[dry-run] would remove directory tree: {td}")
+ else:
+ try:
+ shutil.rmtree(td)
+ if verbose:
+ print(f"Removed directory tree: {td}")
+ except Exception as e:
+ print(f"[error] failed to remove tmp dir {td}: {e}")
smftools/informatics/helpers/discover_input_files.py
@@ -0,0 +1,100 @@
+ from pathlib import Path
+ from typing import Dict, List, Any, Tuple
+
+ def discover_input_files(
+ input_data_path: str,
+ bam_suffix: str = ".bam",
+ recursive: bool = False,
+ follow_symlinks: bool = False,
+ ) -> Dict[str, Any]:
+ """
+ Discover input files under `input_data_path`.
+
+ Returns a dict with:
+ - pod5_paths, fast5_paths, fastq_paths, bam_paths (lists of str)
+ - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
+ - all_files_searched (int)
+ Behavior:
+ - If `input_data_path` is a file, returns that single file categorized.
+ - If it is a directory, scans either immediate children (recursive=False)
+ or entire tree (recursive=True). Uses Path.suffixes to detect .fastq.gz etc.
+ """
+ p = Path(input_data_path)
+ pod5_exts = {".pod5", ".p5"}
+ fast5_exts = {".fast5", ".f5"}
+ fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.xz", ".fq.xz"}
+ # normalize bam suffix with leading dot
+ if not bam_suffix.startswith("."):
+ bam_suffix = "." + bam_suffix
+ bam_suffix = bam_suffix.lower()
+
+ pod5_paths: List[str] = []
+ fast5_paths: List[str] = []
+ fastq_paths: List[str] = []
+ bam_paths: List[str] = []
+ other_paths: List[str] = []
+
+ def _file_ext_key(pp: Path) -> str:
+ # join suffixes to handle .fastq.gz
+ return "".join(pp.suffixes).lower() if pp.suffixes else pp.suffix.lower()
+
+ if p.exists() and p.is_file():
+ ext_key = _file_ext_key(p)
+ if ext_key in pod5_exts:
+ pod5_paths.append(str(p))
+ elif ext_key in fast5_exts:
+ fast5_paths.append(str(p))
+ elif ext_key in fastq_exts:
+ fastq_paths.append(str(p))
+ elif ext_key == bam_suffix:
+ bam_paths.append(str(p))
+ else:
+ other_paths.append(str(p))
+ total_searched = 1
+ elif p.exists() and p.is_dir():
+ if recursive:
+ iterator = p.rglob("*")
+ else:
+ iterator = p.iterdir()
+ total_searched = 0
+ for fp in iterator:
+ if not fp.is_file():
+ continue
+ total_searched += 1
+ ext_key = _file_ext_key(fp)
+ if ext_key in pod5_exts:
+ pod5_paths.append(str(fp))
+ elif ext_key in fast5_exts:
+ fast5_paths.append(str(fp))
+ elif ext_key in fastq_exts:
+ fastq_paths.append(str(fp))
+ elif ext_key == bam_suffix:
+ bam_paths.append(str(fp))
+ else:
+ # additional heuristic: check filename contains extension fragments (.pod5 etc)
+ name = fp.name.lower()
+ if any(e in name for e in pod5_exts):
+ pod5_paths.append(str(fp))
+ elif any(e in name for e in fast5_exts):
+ fast5_paths.append(str(fp))
+ elif any(e in name for e in [".fastq", ".fq"]):
+ fastq_paths.append(str(fp))
+ elif name.endswith(bam_suffix):
+ bam_paths.append(str(fp))
+ else:
+ other_paths.append(str(fp))
+ else:
+ raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
+
+ return {
+ "pod5_paths": sorted(pod5_paths),
+ "fast5_paths": sorted(fast5_paths),
+ "fastq_paths": sorted(fastq_paths),
+ "bam_paths": sorted(bam_paths),
+ "other_paths": sorted(other_paths),
+ "input_is_pod5": len(pod5_paths) > 0,
+ "input_is_fast5": len(fast5_paths) > 0,
+ "input_is_fastq": len(fastq_paths) > 0,
+ "input_is_bam": len(bam_paths) > 0,
+ "all_files_searched": total_searched,
+ }
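A usage sketch of the new discovery helper against a hypothetical run directory; the dictionary keys are the ones documented and returned above.

    found = discover_input_files("runs/run01", bam_suffix=".bam", recursive=True)
    if found["input_is_pod5"]:
        print(f"{len(found['pod5_paths'])} POD5 files found")
    elif found["input_is_bam"]:
        print(f"{len(found['bam_paths'])} BAM files found")
    print("files scanned:", found["all_files_searched"])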
smftools/informatics/helpers/extract_base_identities.py
@@ -1,4 +1,4 @@
- def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
+ def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
  """
  Efficiently extracts base identities from mapped reads with reference coordinates.
 
@@ -7,6 +7,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
  chromosome (str): Name of the reference chromosome.
  positions (list): Positions to extract (0-based).
  max_reference_length (int): Maximum reference length for padding.
+ sequence (str): The sequence of the record fasta
 
  Returns:
  dict: Base identities from forward mapped reads.
@@ -16,16 +17,19 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
  import numpy as np
  from collections import defaultdict
  import time
+ from collections import defaultdict, Counter
 
  timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
 
  positions = set(positions)
  fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
  rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+ mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
  #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
  with pysam.AlignmentFile(bam_file, "rb") as bam:
  total_reads = bam.mapped
+ ref_seq = sequence.upper()
  for read in bam.fetch(chromosome):
  if not read.is_mapped:
  continue # Skip unmapped reads
@@ -39,6 +43,28 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
 
  for read_position, reference_position in aligned_pairs:
  if reference_position in positions:
- base_dict[read_name][reference_position] = query_sequence[read_position]
+ read_base = query_sequence[read_position]
+ ref_base = ref_seq[reference_position]
 
- return dict(fwd_base_identities), dict(rev_base_identities)
+ base_dict[read_name][reference_position] = read_base
+
+ # Track mismatches (excluding Ns)
+ if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+ mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+
+ # Determine C→T vs G→A dominance per read
+ mismatch_trend_per_read = {}
+ for read_name, ref_dict in mismatch_counts_per_read.items():
+ c_to_t = ref_dict.get("C", {}).get("T", 0)
+ g_to_a = ref_dict.get("G", {}).get("A", 0)
+
+ if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
+ mismatch_trend_per_read[read_name] = "equal"
+ elif c_to_t > g_to_a:
+ mismatch_trend_per_read[read_name] = "C->T"
+ elif g_to_a > c_to_t:
+ mismatch_trend_per_read[read_name] = "G->A"
+ else:
+ mismatch_trend_per_read[read_name] = "none"
+
+ return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
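A small worked example of the per-read trend call added above, with illustrative counts rather than package data; the ordering of the checks mirrors the new code.

    from collections import Counter, defaultdict

    # Hypothetical mismatch tallies for one read: 12 C->T events, 2 G->A events.
    ref_dict = defaultdict(Counter, {"C": Counter({"T": 12}), "G": Counter({"A": 2})})
    c_to_t = ref_dict.get("C", {}).get("T", 0)   # 12
    g_to_a = ref_dict.get("G", {}).get("A", 0)   # 2

    if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
        trend = "equal"   # counts are integers, so this branch means exactly equal and nonzero
    elif c_to_t > g_to_a:
        trend = "C->T"
    elif g_to_a > c_to_t:
        trend = "G->A"
    else:
        trend = "none"
    print(trend)          # prints: C->T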
smftools/informatics/helpers/extract_read_features_from_bam.py
@@ -2,7 +2,7 @@
 
  def extract_read_features_from_bam(bam_file_path):
  """
- Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length.
+ Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
  Params:
  bam_file_path (str):
  Returns:
@@ -26,6 +26,8 @@ def extract_read_features_from_bam(bam_file_path):
  reference_name = read.reference_name
  reference_index = bam_file.references.index(reference_name)
  reference_length = reference_lengths[reference_index]
- read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length]
+ mapped_length = sum(end - start for start, end in read.get_blocks())
+ mapping_quality = read.mapping_quality # Phred-scaled MAPQ
+ read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
 
  return read_metrics
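To illustrate the new mapped_length metric: pysam's read.get_blocks() yields (start, end) reference-aligned blocks, so summing their spans counts only reference bases actually covered by the alignment (insertions and soft-clipped bases add nothing). A toy example with made-up coordinates:

    # Two aligned blocks separated by a gap in the alignment.
    blocks = [(100, 150), (160, 200)]
    mapped_length = sum(end - start for start, end in blocks)
    print(mapped_length)  # 50 + 40 = 90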
smftools/informatics/helpers/find_conversion_sites.py
@@ -1,11 +1,12 @@
- def find_conversion_sites(fasta_file, modification_type, conversion_types):
+ def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
  """
  Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
 
  Parameters:
  fasta_file (str): Path to the converted reference FASTA.
  modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
- conversion_types (list): List of conversion types. The first element is the unconverted record type.
+ conversions (list): List of conversion types. The first element is the unconverted record type.
+ deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
  Returns:
  dict: Dictionary where keys are **both unconverted & converted record names**.
@@ -14,7 +15,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
  """
  import numpy as np
  from Bio import SeqIO
- unconverted = conversion_types[0]
+ unconverted = conversions[0]
  record_dict = {}
 
  # Define base mapping based on modification type
@@ -26,7 +27,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
  # Read FASTA file and process records
  with open(fasta_file, "r") as f:
  for record in SeqIO.parse(f, "fasta"):
- if unconverted in record.id:
+ if unconverted in record.id or deaminase_footprinting:
  sequence = str(record.seq).upper()
  complement = str(record.seq.complement()).upper()
  sequence_length = len(sequence)
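A call sketch for the updated signature, using a hypothetical FASTA path; per the comments in converted_BAM_to_adata_II.py above, each returned value is [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]. With deaminase_footprinting=True every record is processed, otherwise only records whose ID contains the unconverted tag.

    sites = find_conversion_sites(
        "refs/converted.fasta",            # hypothetical path
        "unconverted",
        ["unconverted", "5mC"],
        deaminase_footprinting=False,
    )
    for record_id, (length, top, bottom, seq, comp) in sites.items():
        print(record_id, length, len(top), len(bottom))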
smftools/informatics/helpers/modkit_extract_to_adata.py
@@ -386,14 +386,15 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
  existing_h5s = [h5 for h5 in existing_h5s if '.h5ad.gz' in h5]
  final_hdf = f'{experiment_name}_final_experiment_hdf5.h5ad'
  final_adata_path = os.path.join(h5_dir, final_hdf)
+ final_adata = None
 
  if os.path.exists(f"{final_adata_path}.gz"):
  print(f'{final_adata_path}.gz already exists. Using existing adata')
- return f"{final_adata_path}.gz"
+ return final_adata, f"{final_adata_path}.gz"
 
  elif os.path.exists(f"{final_adata_path}"):
  print(f'{final_adata_path} already exists. Using existing adata')
- return final_adata_path
+ return final_adata, final_adata_path
 
  # Filter file names that contain the search string in their filename and keep them in a list
  tsvs = [tsv for tsv in tsv_files if 'extract.tsv' in tsv and 'unclassified' not in tsv]
@@ -444,8 +445,9 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
  for record in records_to_analyze:
  current_reference_length = reference_dict[record][0]
  positions = range(current_reference_length)
+ ref_seq = reference_dict[record][1]
  # Extract the base identities of reads aligned to the record
- fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
+ fwd_base_identities, rev_base_identities, mismatch_counts_per_read, mismatch_trend_per_read = extract_base_identities(bam, record, positions, max_reference_length, ref_seq)
  # Store read names of fwd and rev mapped reads
  fwd_mapped_reads.update(fwd_base_identities.keys())
  rev_mapped_reads.update(rev_base_identities.keys())
@@ -708,6 +710,7 @@ def modkit_extract_to_adata(fasta, bam_dir, mapping_threshold, experiment_name,
  temp_adata.var_names = temp_adata.var_names.astype(str)
  print('{0}: Adding {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[dict_index], final_sample_index))
  temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
+ temp_adata.obs['Barcode'] = [str(final_sample_index)] * len(temp_adata)
  temp_adata.obs['Reference'] = [f'{record}'] * len(temp_adata)
  temp_adata.obs['Strand'] = [strand] * len(temp_adata)
  temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)