PyPI - smftools - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (133)

smftools/__init__.py +6 -8
smftools/_settings.py +4 -6
smftools/_version.py +1 -1
smftools/cli/helpers.py +7 -1
smftools/cli/hmm_adata.py +902 -244
smftools/cli/load_adata.py +318 -198
smftools/cli/preprocess_adata.py +285 -171
smftools/cli/spatial_adata.py +137 -53
smftools/cli_entry.py +94 -178
smftools/config/__init__.py +1 -1
smftools/config/conversion.yaml +5 -1
smftools/config/deaminase.yaml +1 -1
smftools/config/default.yaml +22 -17
smftools/config/direct.yaml +8 -3
smftools/config/discover_input_files.py +19 -5
smftools/config/experiment_config.py +505 -276
smftools/constants.py +37 -0
smftools/datasets/__init__.py +2 -8
smftools/datasets/datasets.py +32 -18
smftools/hmm/HMM.py +2125 -1426
smftools/hmm/__init__.py +2 -3
smftools/hmm/archived/call_hmm_peaks.py +16 -1
smftools/hmm/call_hmm_peaks.py +173 -193
smftools/hmm/display_hmm.py +19 -6
smftools/hmm/hmm_readwrite.py +13 -4
smftools/hmm/nucleosome_hmm_refinement.py +102 -14
smftools/informatics/__init__.py +30 -7
smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
smftools/informatics/archived/print_bam_query_seq.py +7 -1
smftools/informatics/bam_functions.py +379 -156
smftools/informatics/basecalling.py +51 -9
smftools/informatics/bed_functions.py +90 -57
smftools/informatics/binarize_converted_base_identities.py +18 -7
smftools/informatics/complement_base_list.py +7 -6
smftools/informatics/converted_BAM_to_adata.py +265 -122
smftools/informatics/fasta_functions.py +161 -83
smftools/informatics/h5ad_functions.py +195 -29
smftools/informatics/modkit_extract_to_adata.py +609 -270
smftools/informatics/modkit_functions.py +85 -44
smftools/informatics/ohe.py +44 -21
smftools/informatics/pod5_functions.py +112 -73
smftools/informatics/run_multiqc.py +20 -14
smftools/logging_utils.py +51 -0
smftools/machine_learning/__init__.py +2 -7
smftools/machine_learning/data/anndata_data_module.py +143 -50
smftools/machine_learning/data/preprocessing.py +2 -1
smftools/machine_learning/evaluation/__init__.py +1 -1
smftools/machine_learning/evaluation/eval_utils.py +11 -14
smftools/machine_learning/evaluation/evaluators.py +46 -33
smftools/machine_learning/inference/__init__.py +1 -1
smftools/machine_learning/inference/inference_utils.py +7 -4
smftools/machine_learning/inference/lightning_inference.py +9 -13
smftools/machine_learning/inference/sklearn_inference.py +6 -8
smftools/machine_learning/inference/sliding_window_inference.py +35 -25
smftools/machine_learning/models/__init__.py +10 -5
smftools/machine_learning/models/base.py +28 -42
smftools/machine_learning/models/cnn.py +15 -11
smftools/machine_learning/models/lightning_base.py +71 -40
smftools/machine_learning/models/mlp.py +13 -4
smftools/machine_learning/models/positional.py +3 -2
smftools/machine_learning/models/rnn.py +3 -2
smftools/machine_learning/models/sklearn_models.py +39 -22
smftools/machine_learning/models/transformer.py +68 -53
smftools/machine_learning/models/wrappers.py +2 -1
smftools/machine_learning/training/__init__.py +2 -2
smftools/machine_learning/training/train_lightning_model.py +29 -20
smftools/machine_learning/training/train_sklearn_model.py +9 -15
smftools/machine_learning/utils/__init__.py +1 -1
smftools/machine_learning/utils/device.py +7 -4
smftools/machine_learning/utils/grl.py +3 -1
smftools/metadata.py +443 -0
smftools/plotting/__init__.py +19 -5
smftools/plotting/autocorrelation_plotting.py +145 -44
smftools/plotting/classifiers.py +162 -72
smftools/plotting/general_plotting.py +347 -168
smftools/plotting/hmm_plotting.py +42 -13
smftools/plotting/position_stats.py +145 -85
smftools/plotting/qc_plotting.py +20 -12
smftools/preprocessing/__init__.py +8 -8
smftools/preprocessing/append_base_context.py +105 -79
smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
smftools/preprocessing/binarize.py +21 -4
smftools/preprocessing/binarize_on_Youden.py +127 -31
smftools/preprocessing/binary_layers_to_ohe.py +17 -11
smftools/preprocessing/calculate_complexity_II.py +86 -59
smftools/preprocessing/calculate_consensus.py +28 -19
smftools/preprocessing/calculate_coverage.py +44 -22
smftools/preprocessing/calculate_pairwise_differences.py +2 -1
smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
smftools/preprocessing/calculate_position_Youden.py +103 -55
smftools/preprocessing/calculate_read_length_stats.py +52 -23
smftools/preprocessing/calculate_read_modification_stats.py +91 -57
smftools/preprocessing/clean_NaN.py +38 -28
smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
smftools/preprocessing/flag_duplicate_reads.py +688 -271
smftools/preprocessing/invert_adata.py +26 -11
smftools/preprocessing/load_sample_sheet.py +40 -22
smftools/preprocessing/make_dirs.py +8 -3
smftools/preprocessing/min_non_diagonal.py +2 -1
smftools/preprocessing/recipes.py +56 -23
smftools/preprocessing/reindex_references_adata.py +93 -27
smftools/preprocessing/subsample_adata.py +33 -16
smftools/readwrite.py +264 -109
smftools/schema/__init__.py +11 -0
smftools/schema/anndata_schema_v1.yaml +227 -0
smftools/tools/__init__.py +3 -4
smftools/tools/archived/classifiers.py +163 -0
smftools/tools/archived/subset_adata_v1.py +10 -1
smftools/tools/archived/subset_adata_v2.py +12 -1
smftools/tools/calculate_umap.py +54 -15
smftools/tools/cluster_adata_on_methylation.py +115 -46
smftools/tools/general_tools.py +70 -25
smftools/tools/position_stats.py +229 -98
smftools/tools/read_stats.py +50 -29
smftools/tools/spatial_autocorrelation.py +365 -192
smftools/tools/subset_adata.py +23 -21
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
smftools-0.2.5.dist-info/RECORD +181 -0
smftools-0.2.4.dist-info/RECORD +0 -176
/smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
/smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
/smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
{smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0

smftools/cli/load_adata.py CHANGED Viewed

@@ -1,13 +1,19 @@
 import shutil
 from pathlib import Path
-from typing import Union, Iterable
+from typing import Iterable, Union
+from smftools.logging_utils import get_logger
 from .helpers import AdataPaths
+logger = get_logger(__name__)
 def check_executable_exists(cmd: str) -> bool:
     """Return True if a command-line executable is available in PATH."""
     return shutil.which(cmd) is not None
 def delete_tsvs(
     tsv_dir: Union[str, Path, Iterable[str], None],
     *,
@@ -27,48 +33,50 @@ def delete_tsvs(
     verbose : bool
         Print progress / warnings.
     """
     # Helper: remove a single file path (Path-like or string)
     def _maybe_unlink(p: Path):
         if not p.exists():
             if verbose:
-                print(f"[skip] not found: {p}")
+                logger.info(f"[skip] not found: {p}")
             return
         if not p.is_file():
             if verbose:
-                print(f"[skip] not a file: {p}")
+                logger.info(f"[skip] not a file: {p}")
             return
         if dry_run:
-            print(f"[dry-run] would remove file: {p}")
+            logger.info(f"[dry-run] would remove file: {p}")
             return
         try:
             p.unlink()
             if verbose:
-                print(f"Removed file: {p}")
+                logger.info(f"Removed file: {p}")
         except Exception as e:
-            print(f"[error] failed to remove file {p}: {e}")
+            logger.warning(f"Failed to remove file {p}: {e}")
     # Remove tmp_dir recursively (if provided)
     if tsv_dir is not None:
         td = Path(tsv_dir)
         if not td.exists():
             if verbose:
-                print(f"[skip] tsv_dir not found: {td}")
+                logger.info(f"[skip] tsv_dir not found: {td}")
         else:
             if not td.is_dir():
                 if verbose:
-                    print(f"[skip] tsv_dir is not a directory: {td}")
+                    logger.info(f"[skip] tsv_dir is not a directory: {td}")
             else:
                 if dry_run:
-                    print(f"[dry-run] would remove directory tree: {td}")
+                    logger.info(f"[dry-run] would remove directory tree: {td}")
                 else:
                     try:
                         shutil.rmtree(td)
                         if verbose:
-                            print(f"Removed directory tree: {td}")
+                            logger.info(f"Removed directory tree: {td}")
                     except Exception as e:
-                        print(f"[error] failed to remove tmp dir {td}: {e}")
+                        logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
-def load_adata_core(cfg, paths: AdataPaths):
+def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     """
     Core load pipeline.
@@ -97,28 +105,34 @@ def load_adata_core(cfg, paths: AdataPaths):
     cfg : ExperimentConfig
         (Same object, possibly with some fields updated, e.g. fasta path.)
     """
-    import os
     from pathlib import Path
     import numpy as np
-    import pandas as pd
-    import anndata as ad
-    import scanpy as sc
-    from .helpers import write_gz_h5ad
-    from ..readwrite import make_dirs, add_or_update_column_in_csv
-    from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
+    from ..informatics.bam_functions import (
+        align_and_sort_BAM,
+        bam_qc,
+        concatenate_fastqs_to_bam,
+        demux_and_index_BAM,
+        extract_read_features_from_bam,
+        split_and_index_BAM,
+    )
+    from ..informatics.basecalling import canoncall, modcall
     from ..informatics.bed_functions import aligned_BAM_to_bed
-    from ..informatics.pod5_functions import fast5_to_pod5
-    from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
-    from ..informatics.basecalling import modcall, canoncall
-    from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
-    from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
     from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
+    from ..informatics.fasta_functions import (
+        generate_converted_FASTA,
+        get_chromosome_lengths,
+        subsample_fasta_from_bed,
+    )
     from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
+    from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
+    from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
+    from ..informatics.pod5_functions import fast5_to_pod5
     from ..informatics.run_multiqc import run_multiqc
+    from ..metadata import record_smftools_metadata
+    from ..readwrite import add_or_update_column_in_csv, make_dirs
+    from .helpers import write_gz_h5ad
     ################################### 1) General params and input organization ###################################
     output_directory = Path(cfg.output_directory)
@@ -169,19 +183,20 @@ def load_adata_core(cfg, paths: AdataPaths):
     if cfg.aligner == "minimap2":
         if not check_executable_exists("minimap2"):
             raise RuntimeError(
-                "Error: 'minimap2' is not installed or not in PATH. "
-                "Install minimap2"
+                "Error: 'minimap2' is not installed or not in PATH. Install minimap2"
             )
     # # Detect the input filetypes
     # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
     if cfg.input_type == "fast5":
         # take the input directory of fast5 files and write out a single pod5 file into the output directory.
-        output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
+        output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
         if output_pod5.exists():
             pass
         else:
-            print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+            logger.info(
+                f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
+            )
             fast5_to_pod5(cfg.input_data_path, output_pod5)
         # Reassign the pod5_dir variable to point to the new pod5 file.
         cfg.input_data_path = output_pod5
@@ -189,22 +204,24 @@ def load_adata_core(cfg, paths: AdataPaths):
     # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
     elif cfg.input_type == "fastq":
         # Output file for FASTQ concatenation.
-        output_bam = cfg.output_directory / 'canonical_basecalls.bam'
+        output_bam = cfg.output_directory / "canonical_basecalls.bam"
         if output_bam.exists():
-            pass
+            logger.debug("Output BAM already exists")
         else:
+            logger.info("Concatenating FASTQ files into a single BAM file")
             summary = concatenate_fastqs_to_bam(
                 cfg.input_files,
                 output_bam,
-                barcode_tag='BC',
-                gzip_suffixes=('.gz','.gzip'),
+                barcode_tag="BC",
+                gzip_suffixes=(".gz", ".gzip"),
                 barcode_map=cfg.fastq_barcode_map,
                 add_read_group=True,
                 rg_sample_field=None,
                 progress=False,
-                auto_pair=cfg.fastq_auto_pairing)
-            print(f"Found the following barcodes: {summary['barcodes']}")
+                auto_pair=cfg.fastq_auto_pairing,
+            )
+            logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
         # Set the input data path to the concatenated BAM.
         cfg.input_data_path = output_bam
@@ -213,24 +230,24 @@ def load_adata_core(cfg, paths: AdataPaths):
         pass
     else:
         pass
     add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
     # Determine if the input data needs to be basecalled
     if cfg.input_type == "pod5":
-        print(f'Detected pod5 inputs: {cfg.input_files}')
+        logger.info(f"Detected pod5 inputs: {cfg.input_files}")
         basecall = True
     elif cfg.input_type in ["bam"]:
-        print(f'Detected bam input: {cfg.input_files}')
+        logger.info(f"Detected bam input: {cfg.input_files}")
         basecall = False
     else:
-        print('Error, can not find input bam or pod5')
+        logger.info("Error, can not find input bam or pod5")
     # Generate the base name of the unaligned bam without the .bam suffix
     if basecall:
         model_basename = Path(cfg.model).name
-        model_basename = str(model_basename).replace('.', '_')
-        if cfg.smf_modality == 'direct':
+        model_basename = str(model_basename).replace(".", "_")
+        if cfg.smf_modality == "direct":
             mod_string = "_".join(cfg.mod_list)
             bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
         else:
@@ -241,7 +258,9 @@ def load_adata_core(cfg, paths: AdataPaths):
     # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
     unaligned_output = bam.with_suffix(cfg.bam_suffix)
-    aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
+    aligned_BAM = (
+        cfg.output_directory / (bam.stem + "_aligned")
+    )  # doing this allows specifying an input bam in a seperate directory as the aligned output bams
     aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
     aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
     aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
@@ -252,34 +271,40 @@ def load_adata_core(cfg, paths: AdataPaths):
     ########################################################################################################################
     ################################### 2) FASTA Handling ###################################
-    from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
     try:
         cfg.fasta = Path(cfg.fasta)
-    except:
-        print("Need to provide an input FASTA path to proceed with smftools load")
+    except Exception:
+        logger.warning("Need to provide an input FASTA path to proceed with smftools load")
     # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
-    if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
-        fasta_basename = cfg.fasta.parent / cfg.fasta.stem
-        bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
-        output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
-        subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
-        fasta = cfg.output_directory / output_FASTA
+    if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
+        fasta_stem = cfg.fasta.stem
+        bed_stem = Path(cfg.fasta_regions_of_interest).stem
+        output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
+        logger.info("Subsampling FASTA records using the provided BED file")
+        subsample_fasta_from_bed(
+            cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
+        )
+        fasta = output_FASTA
     else:
+        logger.info("Using the full FASTA file")
         fasta = cfg.fasta
     # For conversion style SMF, make a converted reference FASTA
-    if cfg.smf_modality == 'conversion':
-        fasta_basename = fasta.parent / fasta.stem
-        converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
+    if cfg.smf_modality == "conversion":
+        fasta_stem = fasta.stem
+        converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
         converted_FASTA = cfg.output_directory / converted_FASTA_basename
-        if 'converted.fa' in fasta.name:
-            print(f'{fasta} is already converted. Using existing converted FASTA.')
+        if "converted.fa" in fasta.name:
+            logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
             converted_FASTA = fasta
         elif converted_FASTA.exists():
-            print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
+            logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
         else:
+            logger.info(f"Converting FASTA base sequences")
             generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
         fasta = converted_FASTA
@@ -290,121 +315,164 @@ def load_adata_core(cfg, paths: AdataPaths):
     ########################################################################################################################
     ################################### 3) Basecalling ###################################
-    from ..informatics.basecalling import modcall, canoncall
     # 1) Basecall using dorado
-    if basecall and cfg.sequencer == 'ont':
+    if basecall and cfg.sequencer == "ont":
         try:
             cfg.model_dir = Path(cfg.model_dir)
-        except:
-            print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
+        except Exception:
+            logger.warning(
+                "Need to provide a valid path to a dorado model directory to use dorado basecalling"
+            )
         if aligned_sorted_output.exists():
-            print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
+            logger.info(
+                f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
+            )
         elif unaligned_output.exists():
-            print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
-        elif cfg.smf_modality != 'direct':
-            canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
+            logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
+        elif cfg.smf_modality != "direct":
+            logger.info("Running canonical basecalling using dorado")
+            canoncall(
+                str(cfg.model_dir),
+                cfg.model,
+                str(cfg.input_data_path),
+                cfg.barcode_kit,
+                str(bam),
+                cfg.bam_suffix,
+                cfg.barcode_both_ends,
+                cfg.trim,
+                cfg.device,
+            )
         else:
-            modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
+            logger.info("Running modified basecalling using dorado")
+            modcall(
+                str(cfg.model_dir),
+                cfg.model,
+                str(cfg.input_data_path),
+                cfg.barcode_kit,
+                cfg.mod_list,
+                str(bam),
+                cfg.bam_suffix,
+                cfg.barcode_both_ends,
+                cfg.trim,
+                cfg.device,
+            )
     elif basecall:
-        print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
+        logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
     else:
         pass
     ########################################################################################################################
     ################################### 4) Alignment and sorting #############################################
-    from ..informatics.bam_functions import align_and_sort_BAM
-    from ..informatics.bed_functions import aligned_BAM_to_bed
     # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
     if aligned_sorted_output.exists():
-        print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
+        logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
     else:
+        logger.info(f"Aligning and sorting reads")
         align_and_sort_BAM(fasta, unaligned_output, cfg)
         # Deleted the unsorted aligned output
         aligned_output.unlink()
     if cfg.make_beds:
         # Make beds and provide basic histograms
-        bed_dir = cfg.output_directory / 'beds'
+        bed_dir = cfg.output_directory / "beds"
         if bed_dir.is_dir():
-            print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
+            logger.debug(
+                f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
+            )
         else:
-            aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
+            logger.info("Making bed files from the aligned and sorted BAM file")
+            aligned_BAM_to_bed(
+                aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
+            )
     ########################################################################################################################
     ################################### 5) Demultiplexing ######################################################################
-    from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
     # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
     if cfg.input_already_demuxed:
         if cfg.split_path.is_dir():
-            print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
+            logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
             all_bam_files = sorted(
-                p for p in cfg.split_path.iterdir()
-                if p.is_file()
-                and p.suffix == cfg.bam_suffix
+                p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
             )
             unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
             bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
         else:
             make_dirs([cfg.split_path])
-            all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
-                                cfg.split_path,
-                                cfg.bam_suffix)
+            logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
+            all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
             unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
             bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
         se_bam_files = bam_files
         bam_dir = cfg.split_path
     else:
         if single_barcoded_path.is_dir():
-            print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
+            logger.debug(
+                f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
+            )
             all_se_bam_files = sorted(
-                p for p in single_barcoded_path.iterdir()
-                if p.is_file()
-                and p.suffix == cfg.bam_suffix
-            )
+                p
+                for p in single_barcoded_path.iterdir()
+                if p.is_file() and p.suffix == cfg.bam_suffix
+            )
             unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
             se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
         else:
-            make_dirs([cfg.split_path, single_barcoded_path])
-            all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
-                                            single_barcoded_path,
-                                            cfg.bam_suffix,
-                                            cfg.barcode_kit,
-                                            False,
-                                            cfg.trim,
-                                            cfg.threads)
+            make_dirs([cfg.split_path, single_barcoded_path])
+            logger.info(
+                "Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
+            )
+            all_se_bam_files = demux_and_index_BAM(
+                aligned_sorted_BAM,
+                single_barcoded_path,
+                cfg.bam_suffix,
+                cfg.barcode_kit,
+                False,
+                cfg.trim,
+                cfg.threads,
+            )
             unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
             se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
         if double_barcoded_path.is_dir():
-            print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
+            logger.debug(
+                f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
+            )
             all_de_bam_files = sorted(
-                p for p in double_barcoded_path.iterdir()
-                if p.is_file()
-                and p.suffix == cfg.bam_suffix
-            )
+                p
+                for p in double_barcoded_path.iterdir()
+                if p.is_file() and p.suffix == cfg.bam_suffix
+            )
             unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
             de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
-        else:
-            make_dirs([cfg.split_path, double_barcoded_path])
-            all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
-                                            double_barcoded_path,
-                                            cfg.bam_suffix,
-                                            cfg.barcode_kit,
-                                            True,
-                                            cfg.trim,
-                                            cfg.threads)
+        else:
+            make_dirs([cfg.split_path, double_barcoded_path])
+            logger.info(
+                "Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
+            )
+            all_de_bam_files = demux_and_index_BAM(
+                aligned_sorted_BAM,
+                double_barcoded_path,
+                cfg.bam_suffix,
+                cfg.barcode_kit,
+                True,
+                cfg.trim,
+                cfg.threads,
+            )
             unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
             de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
         bam_files = se_bam_files + de_bam_files
         unclassified_bams = unclassified_se_bams + unclassified_de_bams
         bam_dir = single_barcoded_path
@@ -413,138 +481,192 @@ def load_adata_core(cfg, paths: AdataPaths):
     if cfg.make_beds:
         # Make beds and provide basic histograms
-        bed_dir = cfg.split_path / 'beds'
+        bed_dir = cfg.split_path / "beds"
         if bed_dir.is_dir():
-            print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
+            logger.debug(
+                f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
+            )
         else:
+            logger.info("Making BED files from BAM files for each sample")
             for bam in bam_files:
                 aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
     ########################################################################################################################
     ################################### 6) SAMTools based BAM QC ######################################################################
-    from ..informatics.bam_functions import bam_qc
     # 5) Samtools QC metrics on split BAM files
     bam_qc_dir = cfg.split_path / "bam_qc"
     if bam_qc_dir.is_dir():
-        print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
+        logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
     else:
         make_dirs([bam_qc_dir])
+        logger.info("Performing BAM QC")
         bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
-    ########################################################################################################################
+    ########################################################################################################################
     ################################### 7) AnnData loading ######################################################################
-    if cfg.smf_modality != 'direct':
+    if cfg.smf_modality != "direct":
         from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
         # 6) Take the converted BAM and load it into an adata object.
-        if cfg.smf_modality == 'deaminase':
+        if cfg.smf_modality == "deaminase":
             deaminase_footprinting = True
         else:
             deaminase_footprinting = False
-        raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
-                                                                  bam_dir,
-                                                                  cfg.output_directory,
-                                                                  cfg.input_already_demuxed,
-                                                                  cfg.mapping_threshold,
-                                                                  cfg.experiment_name,
-                                                                  cfg.conversion_types,
-                                                                  cfg.bam_suffix,
-                                                                  cfg.device,
-                                                                  cfg.threads,
-                                                                  deaminase_footprinting,
-                                                                  delete_intermediates=cfg.delete_intermediate_hdfs,
-                                                                  double_barcoded_path=double_barcoded_path)
+        logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
+        raw_adata, raw_adata_path = converted_BAM_to_adata(
+            fasta,
+            bam_dir,
+            cfg.output_directory,
+            cfg.input_already_demuxed,
+            cfg.mapping_threshold,
+            cfg.experiment_name,
+            cfg.conversion_types,
+            cfg.bam_suffix,
+            cfg.device,
+            cfg.threads,
+            deaminase_footprinting,
+            delete_intermediates=cfg.delete_intermediate_hdfs,
+            double_barcoded_path=double_barcoded_path,
+        )
     else:
         if mod_bed_dir.is_dir():
-            print(f'{mod_bed_dir} already exists, skipping making modbeds')
+            logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
         else:
-            from ..informatics.modkit_functions import modQC, make_modbed
-            make_dirs([mod_bed_dir])
-            modQC(aligned_sorted_output,
-                  cfg.thresholds) # get QC metrics for mod calls
-            make_modbed(aligned_sorted_output,
-                        cfg.thresholds,
-                        mod_bed_dir) # Generate bed files of position methylation summaries for every sample
+            from ..informatics.modkit_functions import make_modbed, modQC
+            make_dirs([mod_bed_dir])
+            logger.info("Performing modQC for direct footprinting samples")
+            modQC(aligned_sorted_output, cfg.thresholds)  # get QC metrics for mod calls
+            logger.info("Making modified BED files for direct footprinting samples")
+            make_modbed(
+                aligned_sorted_output, cfg.thresholds, mod_bed_dir
+            )  # Generate bed files of position methylation summaries for every sample
         from ..informatics.modkit_functions import extract_mods
         make_dirs([mod_tsv_dir])
-        extract_mods(cfg.thresholds,
-                        mod_tsv_dir,
-                        bam_dir,
-                        cfg.bam_suffix,
-                        skip_unclassified=cfg.skip_unclassified,
-                        modkit_summary=False,
-                        threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
+        logger.info(
+            "Extracting single read modification states into TSVs for direct footprinting samples"
+        )
+        extract_mods(
+            cfg.thresholds,
+            mod_tsv_dir,
+            bam_dir,
+            cfg.bam_suffix,
+            skip_unclassified=cfg.skip_unclassified,
+            modkit_summary=False,
+            threads=cfg.threads,
+        )  # Extract methylations calls for split BAM files into split TSV files
         from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
-        #6 Load the modification data from TSVs into an adata object
-        raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
-                                                                bam_dir,
-                                                                cfg.output_directory,
-                                                                cfg.input_already_demuxed,
-                                                                cfg.mapping_threshold,
-                                                                cfg.experiment_name,
-                                                                mods,
-                                                                cfg.batch_size,
-                                                                mod_tsv_dir,
-                                                                cfg.delete_batch_hdfs,
-                                                                cfg.threads,
-                                                                double_barcoded_path)
+        logger.info("Making Anndata for direct modification detection SMF samples")
+        # 6 Load the modification data from TSVs into an adata object
+        raw_adata, raw_adata_path = modkit_extract_to_adata(
+            fasta,
+            bam_dir,
+            cfg.output_directory,
+            cfg.input_already_demuxed,
+            cfg.mapping_threshold,
+            cfg.experiment_name,
+            mods,
+            cfg.batch_size,
+            mod_tsv_dir,
+            cfg.delete_batch_hdfs,
+            cfg.threads,
+            double_barcoded_path,
+        )
         if cfg.delete_intermediate_tsvs:
             delete_tsvs(mod_tsv_dir)
-    raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
-    raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
+    raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
+    raw_adata.obs["Experiment_name_and_barcode"] = (
+        raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
+    )
     ########################################################################################################################
     ############################################### Add basic read length, read quality, mapping quality stats ###############################################
-    from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
-    from ..informatics.bam_functions import extract_read_features_from_bam
-    add_read_length_and_mapping_qc(raw_adata, se_bam_files,
-                                   extract_read_features_from_bam_callable=extract_read_features_from_bam,
-                                   bypass=cfg.bypass_add_read_length_and_mapping_qc,
-                                   force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
-    raw_adata.obs['Raw_modification_signal'] =  np.nansum(raw_adata.X, axis=1)
+    logger.info("Adding read length, mapping quality, and modification signal to Anndata")
+    add_read_length_and_mapping_qc(
+        raw_adata,
+        se_bam_files,
+        extract_read_features_from_bam_callable=extract_read_features_from_bam,
+        bypass=cfg.bypass_add_read_length_and_mapping_qc,
+        force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
+    )
+    raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
+    ########################################################################################################################
+    ############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
+    from ..informatics.h5ad_functions import annotate_pod5_origin
+    if cfg.input_type == "pod5":
+        logger.info("Adding the POD5 origin file to each read into Anndata")
+        annotate_pod5_origin(
+            raw_adata,
+            cfg.input_data_path,
+            n_jobs=cfg.threads,
+            csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
+        )
     ########################################################################################################################
     ############################################### Save final adata ###############################################
-    print(f"Saving AnnData to {raw_adata_path}")
+    logger.info(f"Saving AnnData to {raw_adata_path}")
+    record_smftools_metadata(
+        raw_adata,
+        step_name="load",
+        cfg=cfg,
+        config_path=config_path,
+        output_path=raw_adata_path,
+    )
     write_gz_h5ad(raw_adata, raw_adata_path)
     ########################################################################################################################
     ############################################### MultiQC HTML Report ###############################################
-    from ..informatics.run_multiqc import run_multiqc
     # multiqc ###
     mqc_dir = cfg.split_path / "multiqc"
     if mqc_dir.is_dir():
-        print(f'{mqc_dir} already exists, skipping multiqc')
+        logger.debug(f"{mqc_dir} already exists, skipping multiqc")
     else:
+        logger.info("Running multiqc")
         run_multiqc(cfg.split_path, mqc_dir)
     ########################################################################################################################
     ############################################### delete intermediate BAM files ###############################################
     if cfg.delete_intermediate_bams:
+        logger.info("Deleting intermediate BAM files")
         # delete aligned and sorted bam
         aligned_sorted_output.unlink()
-        bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
+        bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
         bai.unlink()
         # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
         for bam in bam_files:
-            bai = bam.parent / (bam.name + '.bai')
+            bai = bam.parent / (bam.name + ".bai")
             bam.unlink()
             bai.unlink()
         for bam in unclassified_bams:
-            bai = bam.parent / (bam.name + '.bai')
+            bai = bam.parent / (bam.name + ".bai")
             bam.unlink()
-            bai.unlink()
+            bai.unlink()
+        logger.info("Finished deleting intermediate BAM files")
     ########################################################################################################################
     return raw_adata, raw_adata_path, cfg
 def load_adata(config_path: str):
     """
     CLI-facing wrapper for the load pipeline.
@@ -565,15 +687,11 @@ def load_adata(config_path: str):
     cfg : ExperimentConfig
         Config object for downstream steps.
     """
-    from importlib import resources
     from datetime import datetime
-    from pathlib import Path
-    import pandas as pd  # used for summary file reading downstream if needed
-    from ..readwrite import make_dirs, add_or_update_column_in_csv
-    from ..config import LoadExperimentConfig, ExperimentConfig
+    from importlib import resources
+    from ..config import ExperimentConfig, LoadExperimentConfig
+    from ..readwrite import add_or_update_column_in_csv, make_dirs
     from .helpers import get_adata_paths
     date_str = datetime.today().strftime("%y%m%d")
@@ -613,25 +731,27 @@ def load_adata(config_path: str):
     # -----------------------------
     if not getattr(cfg, "force_redo_load_adata", False):
         if paths.hmm.exists():
-            print(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
+            logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
             return None, paths.hmm, cfg
         if paths.spatial.exists():
-            print(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
+            logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
             return None, paths.spatial, cfg
         if paths.pp_dedup.exists():
-            print(
+            logger.debug(
                 f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
                 f"Skipping smftools load"
             )
             return None, paths.pp_dedup, cfg
         if paths.pp.exists():
-            print(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
+            logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
             return None, paths.pp, cfg
         if paths.raw.exists():
-            print(f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load")
+            logger.debug(
+                f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
+            )
             return None, paths.raw, cfg
     # If we get here, we actually want to run the full load pipeline
-    adata, adata_path, cfg = load_adata_core(cfg, paths)
+    adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
-    return adata, adata_path, cfg
+    return adata, adata_path, cfg

smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl