smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,21 @@
-import os
+from __future__ import annotations
+
 import subprocess
-import glob
-import zipfile
-from pathlib import Path
 
-def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified=True, modkit_summary=False, threads=None):
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+def extract_mods(
+    thresholds,
+    mod_tsv_dir,
+    split_dir,
+    bam_suffix,
+    skip_unclassified=True,
+    modkit_summary=False,
+    threads=None,
+):
     """
     Takes all of the aligned, sorted, split modified BAM files and runs Nanopore Modkit Extract to load the modification data into zipped TSV files
 
@@ -23,10 +34,12 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
 
     """
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = sorted(p for p in split_dir.iterdir() if bam_suffix in p.name and '.bai' not in p.name)
+    bam_files = sorted(
+        p for p in split_dir.iterdir() if bam_suffix in p.name and ".bai" not in p.name
+    )
     if skip_unclassified:
         bam_files = [p for p in bam_files if "unclassified" not in p.name]
-    print(f"Running modkit extract for the following bam files: {bam_files}")
+    logger.info(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -34,14 +47,14 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
         pass
 
     for input_file in bam_files:
-        print(input_file)
+        logger.debug(input_file)
         # Construct the output TSV file path
         output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
-        output_tsv_gz = output_tsv.parent / (output_tsv.name + '.gz')
+        output_tsv_gz = output_tsv.parent / (output_tsv.name + ".gz")
         if output_tsv_gz.exists():
-            print(f"{output_tsv_gz} already exists, skipping modkit extract")
+            logger.debug(f"{output_tsv_gz} already exists, skipping modkit extract")
         else:
-            print(f"Extracting modification data from {input_file}")
+            logger.info(f"Extracting modification data from {input_file}")
             if modkit_summary:
                 # Run modkit summary
                 subprocess.run(["modkit", "summary", str(input_file)])
@@ -50,28 +63,43 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
             # Run modkit extract
             if threads:
                 extract_command = [
-                    "modkit", "extract",
-                    "calls", "--mapped-only",
-                    "--filter-threshold", f'{filter_threshold}',
-                    "--mod-thresholds", f"m:{m5C_threshold}",
-                    "--mod-thresholds", f"a:{m6A_threshold}",
-                    "--mod-thresholds", f"h:{hm5C_threshold}",
-                    "-t", threads,
-                    str(input_file), str(output_tsv)
-                ]
+                    "modkit",
+                    "extract",
+                    "calls",
+                    "--mapped-only",
+                    "--filter-threshold",
+                    f"{filter_threshold}",
+                    "--mod-thresholds",
+                    f"m:{m5C_threshold}",
+                    "--mod-thresholds",
+                    f"a:{m6A_threshold}",
+                    "--mod-thresholds",
+                    f"h:{hm5C_threshold}",
+                    "-t",
+                    threads,
+                    str(input_file),
+                    str(output_tsv),
+                ]
             else:
                 extract_command = [
-                    "modkit", "extract",
-                    "calls", "--mapped-only",
-                    "--filter-threshold", f'{filter_threshold}',
-                    "--mod-thresholds", f"m:{m5C_threshold}",
-                    "--mod-thresholds", f"a:{m6A_threshold}",
-                    "--mod-thresholds", f"h:{hm5C_threshold}",
-                    str(input_file), str(output_tsv)
-                ]
+                    "modkit",
+                    "extract",
+                    "calls",
+                    "--mapped-only",
+                    "--filter-threshold",
+                    f"{filter_threshold}",
+                    "--mod-thresholds",
+                    f"m:{m5C_threshold}",
+                    "--mod-thresholds",
+                    f"a:{m6A_threshold}",
+                    "--mod-thresholds",
+                    f"h:{hm5C_threshold}",
+                    str(input_file),
+                    str(output_tsv),
+                ]
             subprocess.run(extract_command)
             # Zip the output TSV
-            print(f'zipping {output_tsv}')
+            logger.info(f"zipping {output_tsv}")
             if threads:
                 zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
             else:
@@ -79,30 +107,39 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
             subprocess.run(zip_command, check=True)
     return
 
+
 def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     """
     Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
     Parameters:
         aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
-
+
     Returns:
         None
    """
-    import os
    import subprocess
-
+
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
    command = [
-        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
-        "--partition-tag", "BC",
+        "modkit",
+        "pileup",
+        str(aligned_sorted_output),
+        str(mod_bed_dir),
+        "--partition-tag",
+        "BC",
        "--only-tabs",
-        "--filter-threshold", f'{filter_threshold}',
-        "--mod-thresholds", f"m:{m5C_threshold}",
-        "--mod-thresholds", f"a:{m6A_threshold}",
-        "--mod-thresholds", f"h:{hm5C_threshold}"
+        "--filter-threshold",
+        f"{filter_threshold}",
+        "--mod-thresholds",
+        f"m:{m5C_threshold}",
+        "--mod-thresholds",
+        f"a:{m6A_threshold}",
+        "--mod-thresholds",
+        f"h:{hm5C_threshold}",
    ]
    subprocess.run(command)
 
+
 def modQC(aligned_sorted_output, thresholds):
    """
    Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
@@ -120,10 +157,16 @@ def modQC(aligned_sorted_output, thresholds):
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
    command = [
-        "modkit", "summary", str(aligned_sorted_output),
-        "--filter-threshold", f"{filter_threshold}",
-        "--mod-thresholds", f"m:{m5C_threshold}",
-        "--mod-thresholds", f"a:{m6A_threshold}",
-        "--mod-thresholds", f"h:{hm5C_threshold}"
+        "modkit",
+        "summary",
+        str(aligned_sorted_output),
+        "--filter-threshold",
+        f"{filter_threshold}",
+        "--mod-thresholds",
+        f"m:{m5C_threshold}",
+        "--mod-thresholds",
+        f"a:{m6A_threshold}",
+        "--mod-thresholds",
+        f"h:{hm5C_threshold}",
    ]
-    subprocess.run(command)
+    subprocess.run(command)
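
Note: the recurring change in this file, as throughout 0.3.0, is the swap from bare print calls to a module-level logger obtained from the new smftools/logging_utils.py (+51 lines, contents not shown in this diff). Only the get_logger(__name__) call is confirmed by the diff; a minimal sketch of what such a helper typically looks like, under that assumption:

```python
# Hypothetical sketch of a get_logger helper; the real smftools/logging_utils.py
# (+51 lines, not shown in this diff) may differ.
import logging


def get_logger(name: str) -> logging.Logger:
    """Return a logger under the 'smftools' namespace, configuring the root once."""
    root = logging.getLogger("smftools")
    if not root.handlers:  # one-time setup: attach a stream handler to the package root
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
        root.addHandler(handler)
        root.setLevel(logging.INFO)
    return logging.getLogger(name)
```

With this pattern, the logger.debug(...) calls used above for per-file progress stay silent at the default INFO level, while logger.info(...) messages replace the old always-on prints.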
@@ -1,10 +1,17 @@
-import numpy as np
-import anndata as ad
+from __future__ import annotations
 
-import os
 import concurrent.futures
+import os
+
+import anndata as ad
+import numpy as np
+
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
-def one_hot_encode(sequence, device='auto'):
+def one_hot_encode(sequence, device="auto"):
     """
     One-hot encodes a DNA sequence.
 
@@ -14,7 +21,7 @@ def one_hot_encode(sequence, device='auto'):
     Returns:
         ndarray: Flattened one-hot encoded representation of the input sequence.
     """
-    mapping = np.array(['A', 'C', 'G', 'T', 'N'])
+    mapping = np.array(["A", "C", "G", "T", "N"])
 
     # Ensure input is a list of characters
     if not isinstance(sequence, list):
@@ -22,14 +29,14 @@ def one_hot_encode(sequence, device='auto'):
 
     # Handle empty sequences
     if len(sequence) == 0:
-        print("Warning: Empty sequence encountered in one_hot_encode()")
+        logger.warning("Empty sequence encountered in one_hot_encode()")
         return np.zeros(len(mapping))  # Return empty encoding instead of failing
 
     # Convert sequence to NumPy array
-    seq_array = np.array(sequence, dtype='<U1')
+    seq_array = np.array(sequence, dtype="<U1")
 
     # Replace invalid bases with 'N'
-    seq_array = np.where(np.isin(seq_array, mapping), seq_array, 'N')
+    seq_array = np.where(np.isin(seq_array, mapping), seq_array, "N")
 
     # Create one-hot encoding matrix
     one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
@@ -37,6 +44,7 @@ def one_hot_encode(sequence, device='auto'):
     # Flatten and return
     return one_hot_matrix.flatten()
 
+
 def one_hot_decode(ohe_array):
     """
     Takes a flattened one hot encoded array and returns the sequence string from that array.
@@ -47,20 +55,21 @@ def one_hot_decode(ohe_array):
         sequence (str): Sequence string of the one hot encoded array
     """
     # Define the mapping of one-hot encoded indices to DNA bases
-    mapping = ['A', 'C', 'G', 'T', 'N']
-
+    mapping = ["A", "C", "G", "T", "N"]
+
     # Reshape the flattened array into a 2D matrix with 5 columns (one for each base)
     one_hot_matrix = ohe_array.reshape(-1, 5)
-
+
     # Get the index of the maximum value (which will be 1) in each row
     decoded_indices = np.argmax(one_hot_matrix, axis=1)
-
+
     # Map the indices back to the corresponding bases
     sequence_list = [mapping[i] for i in decoded_indices]
-    sequence = ''.join(sequence_list)
-
+    sequence = "".join(sequence_list)
+
     return sequence
 
+
 def ohe_layers_decode(adata, obs_names):
     """
     Takes an anndata object and a list of observation names. Returns a list of sequence strings for the reads of interest.
@@ -72,7 +81,7 @@ def ohe_layers_decode(adata, obs_names):
         sequences (list of str): List of strings of the one hot encoded array
     """
     # Define the mapping of one-hot encoded indices to DNA bases
-    mapping = ['A', 'C', 'G', 'T', 'N']
+    mapping = ["A", "C", "G", "T", "N"]
 
     ohe_layers = [f"{base}_binary_encoding" for base in mapping]
     sequences = []
@@ -85,9 +94,10 @@ def ohe_layers_decode(adata, obs_names):
         ohe_array = np.array(ohe_list)
         sequence = one_hot_decode(ohe_array)
         sequences.append(sequence)
-
+
     return sequences
 
+
 def _encode_sequence(args):
     """Parallel helper function for one-hot encoding."""
     read_name, seq, device = args
@@ -97,18 +107,29 @@ def _encode_sequence(args):
     except Exception:
         return None  # Skip invalid sequences
 
+
 def _encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number):
     """Encodes a batch and writes to disk immediately."""
     batch = {read_name: matrix for read_name, matrix in batch_data if matrix is not None}
 
     if batch:
-        save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad')
+        save_name = os.path.join(tmp_dir, f"tmp_{prefix}_{record}_{batch_number}.h5ad")
         tmp_ad = ad.AnnData(X=np.zeros((1, 1)), uns=batch)  # Placeholder X
         tmp_ad.write_h5ad(save_name)
         return save_name
     return None
 
-def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000, progress_bar=None, device='auto', threads=None):
+
+def ohe_batching(
+    base_identities,
+    tmp_dir,
+    record,
+    prefix="",
+    batch_size=100000,
+    progress_bar=None,
+    device="auto",
+    threads=None,
+):
     """
     Efficient version of ohe_batching: one-hot encodes sequences in parallel and writes batches immediately.
 
@@ -131,7 +152,9 @@ def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000,
     file_names = []
 
     # Step 1: Prepare Data for Parallel Encoding
-    encoding_args = [(read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None]
+    encoding_args = [
+        (read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None
+    ]
 
     # Step 2: Parallel One-Hot Encoding using threads (to avoid nested processes)
     with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
@@ -141,7 +164,9 @@ def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000,
 
         if len(batch_data) >= batch_size:
             # Step 3: Process and Write Batch Immediately
-            file_name = _encode_and_save_batch(batch_data.copy(), tmp_dir, prefix, record, batch_number)
+            file_name = _encode_and_save_batch(
+                batch_data.copy(), tmp_dir, prefix, record, batch_number
+            )
             if file_name:
                 file_names.append(file_name)
 
@@ -157,4 +182,4 @@ def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000,
         if file_name:
             file_names.append(file_name)
 
-    return file_names
+    return file_names
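
The encode/decode pair above is unchanged in behavior: each base maps to a 5-wide indicator vector over (A, C, G, T, N) and the resulting matrix is flattened. A standalone round trip of that logic, recreated here for illustration (not imported from the package):

```python
# Recreates the vectorized logic of one_hot_encode/one_hot_decode shown above;
# the package versions additionally handle empty input and AnnData batching.
import numpy as np

mapping = np.array(["A", "C", "G", "T", "N"])

seq_array = np.array(list("ACGTX"), dtype="<U1")
seq_array = np.where(np.isin(seq_array, mapping), seq_array, "N")  # invalid 'X' -> 'N'

# One-hot: compare every base against the 5-letter alphabet, then flatten
ohe = (seq_array[:, None] == mapping).astype(int).flatten()  # shape (25,)

# Decode: reshape to (n, 5), argmax per row, map indices back to bases
decoded = "".join(mapping[i] for i in np.argmax(ohe.reshape(-1, 5), axis=1))
assert decoded == "ACGTN"
```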
@@ -1,26 +1,30 @@
-from ..config import LoadExperimentConfig
-from ..readwrite import make_dirs
+from __future__ import annotations
 
 import os
 import subprocess
 from pathlib import Path
+from typing import Iterable
 
-import pod5 as p5
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
-from typing import Union, List
+from ..config import LoadExperimentConfig
+from ..informatics.basecalling import canoncall, modcall
+from ..readwrite import make_dirs
 
-def basecall_pod5s(config_path):
-    """
-    Basecall from pod5s given a config file.
+logger = get_logger(__name__)
 
-    Parameters:
-        config_path (str): File path to the basecall configuration file
+p5 = require("pod5", extra="ont", purpose="POD5 IO")
 
-    Returns:
-        None
+
+def basecall_pod5s(config_path: str | Path) -> None:
+    """Basecall POD5 inputs using a configuration file.
+
+    Args:
+        config_path: Path to the basecall configuration file.
     """
     # Default params
-    bam_suffix = '.bam' # If different, change from here.
+    bam_suffix = ".bam"  # If different, change from here.
 
     # Load experiment config parameters into global variables
     experiment_config = LoadExperimentConfig(config_path)
@@ -30,66 +34,89 @@
     default_value = None
 
     # General config variable init
-    input_data_path = Path(var_dict.get('input_data_path', default_value)) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
-    output_directory = Path(var_dict.get('output_directory', default_value)) # Path to the output directory to make for the analysis. Necessary.
-    model = var_dict.get('model', default_value) # needed for dorado basecaller
-    model_dir = Path(var_dict.get('model_dir', default_value)) # model directory
-    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
-    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
-    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
-    device = var_dict.get('device', 'auto')
+    input_data_path = Path(
+        var_dict.get("input_data_path", default_value)
+    )  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+    output_directory = Path(
+        var_dict.get("output_directory", default_value)
+    )  # Path to the output directory to make for the analysis. Necessary.
+    model = var_dict.get("model", default_value)  # needed for dorado basecaller
+    model_dir = Path(var_dict.get("model_dir", default_value))  # model directory
+    barcode_kit = var_dict.get("barcode_kit", default_value)  # needed for dorado basecaller
+    barcode_both_ends = var_dict.get("barcode_both_ends", default_value)  # dorado demultiplexing
+    trim = var_dict.get("trim", default_value)  # dorado adapter and barcode removal
+    device = var_dict.get("device", "auto")
 
     # Modified basecalling specific variable init
-    filter_threshold = var_dict.get('filter_threshold', default_value)
-    m6A_threshold = var_dict.get('m6A_threshold', default_value)
-    m5C_threshold = var_dict.get('m5C_threshold', default_value)
-    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
+    filter_threshold = var_dict.get("filter_threshold", default_value)
+    m6A_threshold = var_dict.get("m6A_threshold", default_value)
+    m5C_threshold = var_dict.get("m5C_threshold", default_value)
+    hm5C_threshold = var_dict.get("hm5C_threshold", default_value)
     thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
-    mod_list = var_dict.get('mod_list', default_value)
-
+    mod_list = var_dict.get("mod_list", default_value)
+
     # Make initial output directory
     make_dirs([output_directory])
 
     # Get the input filetype
     if input_data_path.is_file():
         input_data_filetype = input_data_path.suffixes[0]
-        input_is_pod5 = input_data_filetype in ['.pod5','.p5']
-        input_is_fast5 = input_data_filetype in ['.fast5','.f5']
+        input_is_pod5 = input_data_filetype in [".pod5", ".p5"]
+        input_is_fast5 = input_data_filetype in [".fast5", ".f5"]
 
     elif input_data_path.is_dir():
         # Get the file names in the input data dir
         input_files = input_data_path.iterdir()
-        input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
-        input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
+        input_is_pod5 = sum([True for file in input_files if ".pod5" in file or ".p5" in file])
+        input_is_fast5 = sum([True for file in input_files if ".fast5" in file or ".f5" in file])
 
     # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
     if input_is_fast5 and not input_is_pod5:
         # take the input directory of fast5 files and write out a single pod5 file into the output directory.
-        output_pod5 = output_directory / 'FAST5s_to_POD5.pod5'
-        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+        output_pod5 = output_directory / "FAST5s_to_POD5.pod5"
+        logger.info(
+            f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
+        )
         fast5_to_pod5(input_data_path, output_pod5)
         # Reassign the pod5_dir variable to point to the new pod5 file.
         input_data_path = output_pod5
 
     model_basename = model.name
-    model_basename = model_basename.replace('.', '_')
+    model_basename = model_basename.replace(".", "_")
 
     if mod_list:
         mod_string = "_".join(mod_list)
         bam = output_directory / f"{model_basename}_{mod_string}_calls"
-        modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
+        modcall(
+            model,
+            input_data_path,
+            barcode_kit,
+            mod_list,
+            bam,
+            bam_suffix,
+            barcode_both_ends,
+            trim,
+            device,
+        )
     else:
         bam = output_directory / f"{model_basename}_canonical_basecalls"
-        canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
+        canoncall(
+            model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device
+        )
 
 
 def fast5_to_pod5(
-    fast5_dir: Union[str, Path, List[Union[str, Path]]],
-    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
+    fast5_dir: str | Path | Iterable[str | Path],
+    output_pod5: str | Path = "FAST5s_to_POD5.pod5",
 ) -> None:
-    """
-    Convert Nanopore FAST5 files (single file, list of files, or directory)
-    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.
+    """Convert FAST5 inputs into a single POD5 file.
+
+    Args:
+        fast5_dir: FAST5 file path, directory, or iterable of file paths to convert.
+        output_pod5: Output POD5 file path.
+
+    Raises:
+        FileNotFoundError: If no FAST5 files are found or the input path is invalid.
     """
 
     output_pod5 = str(output_pod5)  # ensure string
@@ -122,45 +149,51 @@
 
     raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
 
-def subsample_pod5(pod5_path, read_name_path, output_directory):
-    """
-    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
-    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.
 
-    Parameters:
-        pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
-        read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
-        output_directory (str): A file path to the directory to output the file.
+def subsample_pod5(
+    pod5_path: str | Path,
+    read_name_path: str | int,
+    output_directory: str | Path,
+) -> None:
+    """Write a subsampled POD5 containing selected reads.
 
-    Returns:
-        None
+    Args:
+        pod5_path: POD5 file path or directory of POD5 files to subsample.
+        read_name_path: Path to a text file of read names (one per line) or an integer
+            specifying a random subset size.
+        output_directory: Directory to write the subsampled POD5 file.
     """
 
     if os.path.isdir(pod5_path):
         pod5_path_is_dir = True
-        input_pod5_base = 'input_pod5s.pod5'
+        input_pod5_base = "input_pod5s.pod5"
         files = os.listdir(pod5_path)
-        pod5_files = [os.path.join(pod5_path, file) for file in files if '.pod5' in file]
+        pod5_files = [os.path.join(pod5_path, file) for file in files if ".pod5" in file]
         pod5_files.sort()
-        print(f'Found input pod5s: {pod5_files}')
-
+        logger.info(f"Found input pod5s: {pod5_files}")
+
     elif os.path.exists(pod5_path):
         pod5_path_is_dir = False
         input_pod5_base = os.path.basename(pod5_path)
 
     else:
-        print('Error: pod5_path passed does not exist')
+        logger.error("pod5_path passed does not exist")
         return None
 
-    if type(read_name_path) == str:
+    if type(read_name_path) is str:
         input_read_name_base = os.path.basename(read_name_path)
-        output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
+        output_base = (
+            input_pod5_base.split(".pod5")[0]
+            + "_"
+            + input_read_name_base.split(".txt")[0]
+            + "_subsampled.pod5"
+        )
 
         # extract read names into a list of strings
-        with open(read_name_path, 'r') as file:
+        with open(read_name_path, "r") as file:
            read_names = [line.strip() for line in file]
 
-        print(f'Looking for read_ids: {read_names}')
+        logger.info(f"Looking for read_ids: {read_names}")
        read_records = []
 
        if pod5_path_is_dir:
@@ -168,22 +201,25 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
            with p5.Reader(input_pod5) as reader:
                try:
                    for read_record in reader.reads(selection=read_names, missing_ok=True):
-                        read_records.append(read_record.to_read())
-                        print(f'Found read in {input_pod5}: {read_record.read_id}')
-                except:
-                    print('Skipping pod5, could not find reads')
-        else:
+                        read_records.append(read_record.to_read())
+                        logger.info(f"Found read in {input_pod5}: {read_record.read_id}")
+                except Exception:
+                    logger.warning("Skipping pod5, could not find reads")
+        else:
            with p5.Reader(pod5_path) as reader:
                try:
                    for read_record in reader.reads(selection=read_names):
                        read_records.append(read_record.to_read())
-                        print(f'Found read in {input_pod5}: {read_record}')
-                except:
-                    print('Could not find reads')
+                        logger.info(f"Found read in {input_pod5}: {read_record}")
+                except Exception:
+                    logger.warning("Could not find reads")
 
-    elif type(read_name_path) == int:
+    elif type(read_name_path) is int:
        import random
-        output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
+
+        output_base = (
+            input_pod5_base.split(".pod5")[0] + f"_{read_name_path}_randomly_subsampled.pod5"
+        )
        all_read_records = []
 
        if pod5_path_is_dir:
@@ -191,7 +227,7 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
            random.shuffle(pod5_files)
            for input_pod5 in pod5_files:
                # iterate over the input pod5s
-                print(f'Opening pod5 file {input_pod5}')
+                logger.info(f"Opening pod5 file {input_pod5}")
                with p5.Reader(pod5_path) as reader:
                    for read_record in reader.reads():
                        all_read_records.append(read_record.to_read())
@@ -202,9 +238,11 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
            if read_name_path <= len(all_read_records):
                read_records = random.sample(all_read_records, read_name_path)
            else:
-                print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                logger.info(
+                    "Trying to sample more reads than are contained in the input pod5s, taking all reads"
+                )
                read_records = all_read_records
-
+
        else:
            with p5.Reader(pod5_path) as reader:
                for read_record in reader.reads():
@@ -214,11 +252,13 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
                # if the subsampling amount is less than the record amount in the file, randomly subsample the reads
                read_records = random.sample(all_read_records, read_name_path)
            else:
-                print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                logger.info(
+                    "Trying to sample more reads than are contained in the input pod5s, taking all reads"
+                )
                read_records = all_read_records
 
    output_pod5 = os.path.join(output_directory, output_base)
 
    # Write the subsampled POD5
    with p5.Writer(output_pod5) as writer:
-        writer.add_reads(read_records)
+        writer.add_reads(read_records)
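
Note the import change at the top of this file: pod5 is no longer imported unconditionally but resolved through the new smftools/optional_imports.py (+31 lines, contents not shown in this diff), so users without the ONT extras get an actionable error instead of a bare ImportError. The diff confirms only the call require("pod5", extra="ont", purpose="POD5 IO"); a minimal sketch of what such a guard might do:

```python
# Hypothetical sketch of smftools.optional_imports.require; only the call
# signature require("pod5", extra="ont", purpose="POD5 IO") is confirmed above.
from __future__ import annotations

import importlib
from types import ModuleType


def require(module: str, extra: str | None = None, purpose: str | None = None) -> ModuleType:
    """Import an optional dependency, raising a helpful error if it is absent."""
    try:
        return importlib.import_module(module)
    except ImportError as err:
        hint = f"pip install 'smftools[{extra}]'" if extra else f"pip install {module}"
        why = f" (needed for {purpose})" if purpose else ""
        raise ImportError(
            f"Optional dependency '{module}'{why} is missing. Try: {hint}"
        ) from err
```

One caveat of this pattern: because the module assigns p5 = require(...) at import time, the failure still occurs as soon as this file is imported; the gain is the targeted install hint rather than true lazy loading.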