smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/modkit_extract_to_adata.py
@@ -1,64 +1,81 @@
 import concurrent.futures
 import gc
+import re
+import shutil
+from pathlib import Path
+from typing import Iterable, Optional, Union
+
+import numpy as np
 import pandas as pd
 from tqdm import tqdm
+
+from smftools.logging_utils import get_logger
+
+from .bam_functions import count_aligned_reads
+
+logger = get_logger(__name__)
+
 
 def filter_bam_records(bam, mapping_threshold):
     """Processes a single BAM file, counts reads, and determines records to analyze."""
     aligned_reads_count, unaligned_reads_count, record_counts_dict = count_aligned_reads(bam)
+
     total_reads = aligned_reads_count + unaligned_reads_count
     percent_aligned = (aligned_reads_count * 100 / total_reads) if total_reads > 0 else 0
+    logger.info(f"{percent_aligned:.2f}% of reads in {bam} aligned successfully")
 
     records = []
     for record, (count, percentage) in record_counts_dict.items():
+        logger.info(
+            f"{count} reads mapped to reference {record}. This is {percentage * 100:.2f}% of all mapped reads in {bam}"
+        )
         if percentage >= mapping_threshold:
             records.append(record)
+
     return set(records)
 
+
 def parallel_filter_bams(bam_path_list, mapping_threshold):
     """Parallel processing for multiple BAM files."""
     records_to_analyze = set()
 
     with concurrent.futures.ProcessPoolExecutor() as executor:
+        results = executor.map(
+            filter_bam_records, bam_path_list, [mapping_threshold] * len(bam_path_list)
+        )
 
         # Aggregate results
         for result in results:
             records_to_analyze.update(result)
 
+    logger.info(f"Records to analyze: {records_to_analyze}")
     return records_to_analyze
 
+
 def process_tsv(tsv, records_to_analyze, reference_dict, sample_index):
     """
     Loads and filters a single TSV file based on chromosome and position criteria.
     """
+    temp_df = pd.read_csv(tsv, sep="\t", header=0)
     filtered_records = {}
 
     for record in records_to_analyze:
         if record not in reference_dict:
             continue
+
         ref_length = reference_dict[record][0]
+        filtered_df = temp_df[
+            (temp_df["chrom"] == record)
+            & (temp_df["ref_position"] >= 0)
+            & (temp_df["ref_position"] < ref_length)
+        ]
 
         if not filtered_df.empty:
             filtered_records[record] = {sample_index: filtered_df}
 
     return filtered_records
 
+
 def parallel_load_tsvs(tsv_batch, records_to_analyze, reference_dict, batch, batch_size, threads=4):
     """
     Loads and filters TSV files in parallel.
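The hunk above replaces print-based status messages with a module-level logger created by the new smftools/logging_utils.py helper (a +51-line file in this release). That helper's implementation is not shown in this diff; the following is only a minimal sketch of what a get_logger of this shape typically looks like, under that assumption, not the released code.

import logging

def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
    # Hypothetical sketch: return a namespaced logger with one stream handler,
    # attaching the handler only once so repeated imports do not duplicate output.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(level)
    return logger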
@@ -78,52 +95,60 @@ def parallel_load_tsvs(tsv_batch, records_to_analyze, reference_dict, batch, batch_size, threads=4):
 
     with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
         futures = {
+            executor.submit(
+                process_tsv, tsv, records_to_analyze, reference_dict, sample_index
+            ): sample_index
             for sample_index, tsv in enumerate(tsv_batch)
         }
 
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            desc=f"Processing batch {batch}",
+            total=batch_size,
+        ):
             result = future.result()
             for record, sample_data in result.items():
                 dict_total[record].update(sample_data)
 
     return dict_total
 
+
 def update_dict_to_skip(dict_to_skip, detected_modifications):
     """
     Updates the dict_to_skip set based on the detected modifications.
+
     Parameters:
         dict_to_skip (set): The initial set of dictionary indices to skip.
         detected_modifications (list or set): The modifications (e.g. ['6mA', '5mC']) present.
+
     Returns:
         set: The updated dict_to_skip set.
     """
     # Define which indices correspond to modification-specific or strand-specific dictionaries
-    A_stranded_dicts = {2, 3}
-    C_stranded_dicts = {5, 6}
+    A_stranded_dicts = {2, 3}  # m6A bottom and top strand dictionaries
+    C_stranded_dicts = {5, 6}  # 5mC bottom and top strand dictionaries
+    combined_dicts = {7, 8}  # Combined strand dictionaries
 
     # If '6mA' is present, remove the A_stranded indices from the skip set
+    if "6mA" in detected_modifications:
         dict_to_skip -= A_stranded_dicts
     # If '5mC' is present, remove the C_stranded indices from the skip set
+    if "5mC" in detected_modifications:
         dict_to_skip -= C_stranded_dicts
     # If both modifications are present, remove the combined indices from the skip set
+    if "6mA" in detected_modifications and "5mC" in detected_modifications:
         dict_to_skip -= combined_dicts
 
     return dict_to_skip
 
+
 def process_modifications_for_sample(args):
     """
     Processes a single (record, sample) pair to extract modification-specific data.
+
     Parameters:
         args: (record, sample_index, sample_df, mods, max_reference_length)
+
     Returns:
         (record, sample_index, result) where result is a dict with keys:
         'm6A', 'm6A_minus', 'm6A_plus', '5mC', '5mC_minus', '5mC_plus', and
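For clarity, a worked example of how update_dict_to_skip behaves with the indices listed above (toy inputs, not package data):

dict_to_skip = {2, 3, 5, 6, 7, 8}  # skip every stranded/combined dictionary by default

# Only 6mA detected: the m6A strand dictionaries (indices 2 and 3) are kept.
print(update_dict_to_skip(set(dict_to_skip), ["6mA"]))         # {5, 6, 7, 8}

# Both 6mA and 5mC detected: all six stranded/combined indices drop out of the skip set.
print(update_dict_to_skip(set(dict_to_skip), ["6mA", "5mC"]))  # set()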
@@ -131,29 +156,30 @@ def process_modifications_for_sample(args):
     """
     record, sample_index, sample_df, mods, max_reference_length = args
     result = {}
+    if "6mA" in mods:
+        m6a_df = sample_df[sample_df["modified_primary_base"] == "A"]
+        result["m6A"] = m6a_df
+        result["m6A_minus"] = m6a_df[m6a_df["ref_strand"] == "-"]
+        result["m6A_plus"] = m6a_df[m6a_df["ref_strand"] == "+"]
         m6a_df = None
         gc.collect()
+    if "5mC" in mods:
+        m5c_df = sample_df[sample_df["modified_primary_base"] == "C"]
+        result["5mC"] = m5c_df
+        result["5mC_minus"] = m5c_df[m5c_df["ref_strand"] == "-"]
+        result["5mC_plus"] = m5c_df[m5c_df["ref_strand"] == "+"]
         m5c_df = None
         gc.collect()
+    if "6mA" in mods and "5mC" in mods:
+        result["combined_minus"] = []
+        result["combined_plus"] = []
     return record, sample_index, result
 
+
 def parallel_process_modifications(dict_total, mods, max_reference_length, threads=4):
     """
     Processes each (record, sample) pair in dict_total in parallel to extract modification-specific data.
+
     Returns:
         processed_results: Dict keyed by record, with sub-dict keyed by sample index and the processed results.
     """
@@ -164,18 +190,20 @@ def parallel_process_modifications(dict_total, mods, max_reference_length, threads=4):
     processed_results = {}
     with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
         for record, sample_index, result in tqdm(
+            executor.map(process_modifications_for_sample, tasks),
+            total=len(tasks),
+            desc="Processing modifications",
+        ):
             if record not in processed_results:
                 processed_results[record] = {}
             processed_results[record][sample_index] = result
     return processed_results
 
+
 def merge_modification_results(processed_results, mods):
     """
     Merges individual sample results into global dictionaries.
+
     Returns:
         A tuple: (m6A_dict, m6A_minus, m6A_plus, c5m_dict, c5m_minus, c5m_plus, combined_minus, combined_plus)
     """
@@ -189,44 +217,52 @@ def merge_modification_results(processed_results, mods):
     combined_plus = {}
     for record, sample_results in processed_results.items():
         for sample_index, res in sample_results.items():
+            if "6mA" in mods:
                 if record not in m6A_dict:
                     m6A_dict[record], m6A_minus[record], m6A_plus[record] = {}, {}, {}
+                m6A_dict[record][sample_index] = res.get("m6A", pd.DataFrame())
+                m6A_minus[record][sample_index] = res.get("m6A_minus", pd.DataFrame())
+                m6A_plus[record][sample_index] = res.get("m6A_plus", pd.DataFrame())
+            if "5mC" in mods:
                 if record not in c5m_dict:
                     c5m_dict[record], c5m_minus[record], c5m_plus[record] = {}, {}, {}
+                c5m_dict[record][sample_index] = res.get("5mC", pd.DataFrame())
+                c5m_minus[record][sample_index] = res.get("5mC_minus", pd.DataFrame())
+                c5m_plus[record][sample_index] = res.get("5mC_plus", pd.DataFrame())
+            if "6mA" in mods and "5mC" in mods:
                 if record not in combined_minus:
                     combined_minus[record], combined_plus[record] = {}, {}
+                combined_minus[record][sample_index] = res.get("combined_minus", [])
+                combined_plus[record][sample_index] = res.get("combined_plus", [])
+    return (
+        m6A_dict,
+        m6A_minus,
+        m6A_plus,
+        c5m_dict,
+        c5m_minus,
+        c5m_plus,
+        combined_minus,
+        combined_plus,
+    )
+
 
 def process_stranded_methylation(args):
     """
     Processes a single (dict_index, record, sample) task.
+
     For combined dictionaries (indices 7 or 8), it merges the corresponding A-stranded and C-stranded data.
+    For other dictionaries, it converts the DataFrame into a nested dictionary mapping read names to a
     NumPy methylation array (of float type). Non-numeric values (e.g. '-') are coerced to NaN.
+
     Parameters:
         args: (dict_index, record, sample, dict_list, max_reference_length)
+
     Returns:
         (dict_index, record, sample, processed_data)
     """
     dict_index, record, sample, dict_list, max_reference_length = args
     processed_data = {}
+
     # For combined bottom strand (index 7)
     if dict_index == 7:
         temp_a = dict_list[2][record].get(sample, {}).copy()
@@ -235,18 +271,18 @@ def process_stranded_methylation(args):
         for read in set(temp_a.keys()) | set(temp_c.keys()):
             if read in temp_a:
                 # Convert using pd.to_numeric with errors='coerce'
+                value_a = pd.to_numeric(np.array(temp_a[read]), errors="coerce")
             else:
                 value_a = None
             if read in temp_c:
+                value_c = pd.to_numeric(np.array(temp_c[read]), errors="coerce")
             else:
                 value_c = None
             if value_a is not None and value_c is not None:
                 processed_data[read] = np.where(
                     np.isnan(value_a) & np.isnan(value_c),
                     np.nan,
-                    np.nan_to_num(value_a) + np.nan_to_num(value_c)
+                    np.nan_to_num(value_a) + np.nan_to_num(value_c),
                 )
             elif value_a is not None:
                 processed_data[read] = value_a
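The merge above keeps a position NaN only when it is NaN in both strand-specific arrays, and otherwise adds the two NaN-padded arrays with NaN treated as zero. A small self-contained illustration of that np.where / nan_to_num pattern (toy arrays, not package data):

import numpy as np

value_a = np.array([0.9, np.nan, np.nan, 0.2])
value_c = np.array([np.nan, 0.7, np.nan, 0.1])

combined = np.where(
    np.isnan(value_a) & np.isnan(value_c),            # stay NaN only where both inputs are NaN
    np.nan,
    np.nan_to_num(value_a) + np.nan_to_num(value_c),  # otherwise add, treating NaN as 0
)
print(combined)  # [0.9 0.7 nan 0.3]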
@@ -261,18 +297,18 @@ def process_stranded_methylation(args):
         processed_data = {}
         for read in set(temp_a.keys()) | set(temp_c.keys()):
             if read in temp_a:
+                value_a = pd.to_numeric(np.array(temp_a[read]), errors="coerce")
             else:
                 value_a = None
             if read in temp_c:
+                value_c = pd.to_numeric(np.array(temp_c[read]), errors="coerce")
             else:
                 value_c = None
             if value_a is not None and value_c is not None:
                 processed_data[read] = np.where(
                     np.isnan(value_a) & np.isnan(value_c),
                     np.nan,
-                    np.nan_to_num(value_a) + np.nan_to_num(value_c)
+                    np.nan_to_num(value_a) + np.nan_to_num(value_c),
                 )
             elif value_a is not None:
                 processed_data[read] = value_a
@@ -286,24 +322,28 @@ def process_stranded_methylation(args):
         temp_df = dict_list[dict_index][record][sample]
         processed_data = {}
         # Extract columns and convert probabilities to float (coercing errors)
+        read_ids = temp_df["read_id"].values
+        positions = temp_df["ref_position"].values
+        call_codes = temp_df["call_code"].values
+        probabilities = pd.to_numeric(temp_df["call_prob"].values, errors="coerce")
+
+        modified_codes = {"a", "h", "m"}
+        canonical_codes = {"-"}
+
         # Compute methylation probabilities (vectorized)
         methylation_prob = np.full(probabilities.shape, np.nan, dtype=float)
+        methylation_prob[np.isin(call_codes, list(modified_codes))] = probabilities[
+            np.isin(call_codes, list(modified_codes))
+        ]
+        methylation_prob[np.isin(call_codes, list(canonical_codes))] = (
+            1 - probabilities[np.isin(call_codes, list(canonical_codes))]
+        )
+
         # Preallocate storage for each unique read
         unique_reads = np.unique(read_ids)
         for read in unique_reads:
             processed_data[read] = np.full(max_reference_length, np.nan, dtype=float)
+
         # Assign values efficiently
         for i in range(len(read_ids)):
             read = read_ids[i]
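The vectorized block above maps modkit call codes to per-call methylation probabilities: modified calls ("a", "h", "m") keep call_prob, canonical calls ("-") use 1 - call_prob, and anything else stays NaN. A compact illustration with made-up values (not package data):

import numpy as np

call_codes = np.array(["m", "-", "a", "?"])
probabilities = np.array([0.95, 0.90, 0.80, 0.60])

modified_codes = {"a", "h", "m"}
canonical_codes = {"-"}

methylation_prob = np.full(probabilities.shape, np.nan, dtype=float)
is_mod = np.isin(call_codes, list(modified_codes))
is_canon = np.isin(call_codes, list(canonical_codes))
methylation_prob[is_mod] = probabilities[is_mod]            # keep P(modified)
methylation_prob[is_canon] = 1 - probabilities[is_canon]    # convert P(canonical) to P(modified)
print(methylation_prob)  # [0.95 0.1  0.8   nan]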
@@ -314,10 +354,11 @@ def process_stranded_methylation(args):
     gc.collect()
     return dict_index, record, sample, processed_data
 
+
 def parallel_extract_stranded_methylation(dict_list, dict_to_skip, max_reference_length, threads=4):
     """
     Processes all (dict_index, record, sample) tasks in dict_list (excluding indices in dict_to_skip) in parallel.
+
     Returns:
         Updated dict_list with processed (nested) dictionaries.
     """
@@ -327,16 +368,17 @@ def parallel_extract_stranded_methylation(dict_list, dict_to_skip, max_reference_length, threads=4):
         for record in current_dict.keys():
             for sample in current_dict[record].keys():
                 tasks.append((dict_index, record, sample, dict_list, max_reference_length))
+
     with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
         for dict_index, record, sample, processed_data in tqdm(
             executor.map(process_stranded_methylation, tasks),
             total=len(tasks),
-            desc="Extracting stranded methylation states"
+            desc="Extracting stranded methylation states",
         ):
             dict_list[dict_index][record][sample] = processed_data
     return dict_list
 
+
 def delete_intermediate_h5ads_and_tmpdir(
     h5_dir: Union[str, Path, Iterable[str], None],
     tmp_dir: Optional[Union[str, Path]] = None,
@@ -360,25 +402,27 @@ def delete_intermediate_h5ads_and_tmpdir(
     verbose : bool
         Print progress / warnings.
     """
+
     # Helper: remove a single file path (Path-like or string)
     def _maybe_unlink(p: Path):
+        """Remove a file path if it exists and is a file."""
        if not p.exists():
             if verbose:
+                logger.debug(f"[skip] not found: {p}")
             return
         if not p.is_file():
             if verbose:
+                logger.debug(f"[skip] not a file: {p}")
             return
         if dry_run:
+            logger.debug(f"[dry-run] would remove file: {p}")
             return
         try:
             p.unlink()
             if verbose:
+                logger.info(f"Removed file: {p}")
         except Exception as e:
+            logger.warning(f"[error] failed to remove file {p}: {e}")
 
     # Handle h5_dir input (directory OR iterable of file paths)
     if h5_dir is not None:
@@ -393,7 +437,7 @@ def delete_intermediate_h5ads_and_tmpdir(
                 else:
                     if verbose:
                         # optional: comment this out if too noisy
+                        logger.debug(f"[skip] not matching pattern: {p.name}")
         else:
             # treat as iterable of file paths
             for f in h5_dir:
@@ -403,30 +447,44 @@ def delete_intermediate_h5ads_and_tmpdir(
                     _maybe_unlink(p)
                 else:
                     if verbose:
+                        logger.debug(f"[skip] not matching pattern or not a file: {p}")
 
     # Remove tmp_dir recursively (if provided)
     if tmp_dir is not None:
         td = Path(tmp_dir)
         if not td.exists():
             if verbose:
+                logger.debug(f"[skip] tmp_dir not found: {td}")
         else:
             if not td.is_dir():
                 if verbose:
+                    logger.debug(f"[skip] tmp_dir is not a directory: {td}")
             else:
                 if dry_run:
+                    logger.debug(f"[dry-run] would remove directory tree: {td}")
                 else:
                     try:
                         shutil.rmtree(td)
                         if verbose:
+                            logger.info(f"Removed directory tree: {td}")
                     except Exception as e:
+                        logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
+
+
+def modkit_extract_to_adata(
+    fasta,
+    bam_dir,
+    out_dir,
+    input_already_demuxed,
+    mapping_threshold,
+    experiment_name,
+    mods,
+    batch_size,
+    mod_tsv_dir,
+    delete_batch_hdfs=False,
+    threads=None,
+    double_barcoded_path=None,
+):
     """
     Takes modkit extract outputs and organizes it into an adata object
 
@@ -448,50 +506,87 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
     """
     ###################################################
     # Package imports
-    from .. import readwrite
-    from ..readwrite import safe_write_h5ad, make_dirs
-    from .fasta_functions import get_native_references
-    from .bam_functions import extract_base_identities
-    from .ohe import ohe_batching
-    import pandas as pd
-    import anndata as ad
-    import os
     import gc
     import math
+
+    import anndata as ad
     import numpy as np
+    import pandas as pd
     from Bio.Seq import Seq
     from tqdm import tqdm
+
+    from .. import readwrite
+    from ..readwrite import make_dirs
+    from .bam_functions import extract_base_identities
+    from .fasta_functions import get_native_references
+    from .ohe import ohe_batching
     ###################################################
 
     ################## Get input tsv and bam file names into a sorted list ################
     # Make output dirs
+    h5_dir = out_dir / "h5ads"
+    tmp_dir = out_dir / "tmp"
     make_dirs([h5_dir, tmp_dir])
 
+    existing_h5s = h5_dir.iterdir()
+    existing_h5s = [h5 for h5 in existing_h5s if ".h5ad.gz" in str(h5)]
+    final_hdf = f"{experiment_name}.h5ad.gz"
     final_adata_path = h5_dir / final_hdf
     final_adata = None
+
     if final_adata_path.exists():
+        logger.debug(f"{final_adata_path} already exists. Using existing adata")
         return final_adata, final_adata_path
+
     # List all files in the directory
     tsvs = sorted(
+        p
+        for p in mod_tsv_dir.iterdir()
+        if p.is_file() and "unclassified" not in p.name and "extract.tsv" in p.name
+    )
     bams = sorted(
+        p
+        for p in bam_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+        and ".bai" not in p.name
+    )
+
+    tsv_path_list = [tsv for tsv in tsvs]
+    bam_path_list = [bam for bam in bams]
+    logger.info(f"{len(tsvs)} sample tsv files found: {tsvs}")
+    logger.info(f"{len(bams)} sample bams found: {bams}")
+
+    # Map global sample index (bami / final_sample_index) -> sample name / barcode
+    sample_name_map = {}
+    barcode_map = {}
+
+    for idx, bam_path in enumerate(bam_path_list):
+        stem = bam_path.stem
+
+        # Try to peel off a "barcode..." suffix if present.
+        # This handles things like:
+        # "mySample_barcode01" -> sample="mySample", barcode="barcode01"
+        # "run1-s1_barcode05" -> sample="run1-s1", barcode="barcode05"
+        # "barcode01" -> sample="barcode01", barcode="barcode01"
+        m = re.search(r"^(.*?)[_\-\.]?(barcode[0-9A-Za-z\-]+)$", stem)
+        if m:
+            sample_name = m.group(1) or stem
+            barcode = m.group(2)
+        else:
+            # Fallback: treat the whole stem as both sample & barcode
+            sample_name = stem
+            barcode = stem
+
+        # make sample name of the format of the bam file stem
+        sample_name = sample_name + f"_{barcode}"
+
+        # Clean the barcode name to be an integer
+        barcode = int(barcode.split("barcode")[1])
 
-    print(f'{len(tsvs)} sample tsv files found: {tsvs}')
-    print(f'{len(bams)} sample bams found: {bams}')
+        sample_name_map[idx] = sample_name
+        barcode_map[idx] = str(barcode)
     ##########################################################################################
 
     ######### Get Record names that have over a passed threshold of mapped reads #############
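The new sample/barcode mapping relies on the regex shown above to split a BAM file stem into a sample prefix and a barcodeNN suffix before numbering the barcode. A standalone check of that parsing on the example stems from the code comments (illustrative script only, not part of the package):

import re

for stem in ["mySample_barcode01", "run1-s1_barcode05", "barcode01"]:
    m = re.search(r"^(.*?)[_\-\.]?(barcode[0-9A-Za-z\-]+)$", stem)
    sample_name = (m.group(1) or stem) if m else stem
    barcode = m.group(2) if m else stem
    sample_name = f"{sample_name}_{barcode}"        # e.g. "mySample_barcode01"
    barcode_num = int(barcode.split("barcode")[1])  # e.g. 1
    print(stem, "->", sample_name, barcode_num)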
@@ -503,27 +598,29 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
     ########### Determine the maximum record length to analyze in the dataset ################
     # Get all references within the FASTA and indicate the length and identity of the record sequence
     max_reference_length = 0
+    reference_dict = get_native_references(
+        str(fasta)
+    )  # returns a dict keyed by record name. Points to a tuple of (reference length, reference sequence)
     # Get the max record length in the dataset.
     for record in records_to_analyze:
         if reference_dict[record][0] > max_reference_length:
             max_reference_length = reference_dict[record][0]
-    batches = math.ceil(len(tsvs) / batch_size)
+    logger.info(f"Max reference length in dataset: {max_reference_length}")
+    batches = math.ceil(len(tsvs) / batch_size)  # Number of batches to process
+    logger.info("Processing input tsvs in {0} batches of {1} tsvs ".format(batches, batch_size))
     ##########################################################################################
 
     ##########################################################################################
+    # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
     # Save the file paths in the bam_record_ohe_files dict.
     bam_record_ohe_files = {}
+    bam_record_save = tmp_dir / "tmp_file_dict.h5ad"
     fwd_mapped_reads = set()
     rev_mapped_reads = set()
     # If this step has already been performed, read in the tmp_dile_dict
     if bam_record_save.exists():
         bam_record_ohe_files = ad.read_h5ad(bam_record_save).uns
+        logger.debug("Found existing OHE reads, using these")
     else:
         # Iterate over split bams
         for bami, bam in enumerate(bam_path_list):
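The batch bookkeeping above is a plain ceil split of the TSV list: for example, 10 TSVs with batch_size=4 give math.ceil(10/4) = 3 batches of sizes 4, 4, and 2, with the final batch taking whatever remains. A tiny sketch of that slicing logic, mirroring the batch loop later in this file (toy file names only):

import math

tsv_path_list = [f"sample_{i}.extract.tsv" for i in range(10)]  # toy names
batch_size = 4
batches = math.ceil(len(tsv_path_list) / batch_size)  # 3

for batch in range(batches):
    if batch == batches - 1:
        tsv_batch = tsv_path_list               # final batch: whatever is left
    else:
        tsv_batch = tsv_path_list[:batch_size]  # pop the next batch off the queue
        tsv_path_list = tsv_path_list[batch_size:]
    print(batch, len(tsv_batch))  # 0 4, 1 4, 2 2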
@@ -533,18 +630,37 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
                 positions = range(current_reference_length)
                 ref_seq = reference_dict[record][1]
                 # Extract the base identities of reads aligned to the record
+                (
+                    fwd_base_identities,
+                    rev_base_identities,
+                    mismatch_counts_per_read,
+                    mismatch_trend_per_read,
+                ) = extract_base_identities(bam, record, positions, max_reference_length, ref_seq)
                 # Store read names of fwd and rev mapped reads
                 fwd_mapped_reads.update(fwd_base_identities.keys())
                 rev_mapped_reads.update(rev_base_identities.keys())
                 # One hot encode the sequence string of the reads
+                fwd_ohe_files = ohe_batching(
+                    fwd_base_identities,
+                    tmp_dir,
+                    record,
+                    f"{bami}_fwd",
+                    batch_size=100000,
+                    threads=threads,
+                )
+                rev_ohe_files = ohe_batching(
+                    rev_base_identities,
+                    tmp_dir,
+                    record,
+                    f"{bami}_rev",
+                    batch_size=100000,
+                    threads=threads,
+                )
+                bam_record_ohe_files[f"{bami}_{record}"] = fwd_ohe_files + rev_ohe_files
                 del fwd_base_identities, rev_base_identities
         # Save out the ohe file paths
         X = np.random.rand(1, 1)
+        tmp_ad = ad.AnnData(X=X, uns=bam_record_ohe_files)
         tmp_ad.write_h5ad(bam_record_save)
     ##########################################################################################
 
@@ -554,39 +670,73 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
     for record in records_to_analyze:
         current_reference_length = reference_dict[record][0]
         delta_max_length = max_reference_length - current_reference_length
+        sequence = reference_dict[record][1] + "N" * delta_max_length
+        complement = (
+            str(Seq(reference_dict[record][1]).complement()).upper() + "N" * delta_max_length
+        )
         record_seq_dict[record] = (sequence, complement)
     ##########################################################################################
 
     ###################################################
     # Begin iterating over batches
     for batch in range(batches):
+        logger.info("Processing tsvs for batch {0} ".format(batch))
         # For the final batch, just take the remaining tsv and bam files
         if batch == batches - 1:
             tsv_batch = tsv_path_list
             bam_batch = bam_path_list
+        # For all other batches, take the next batch of tsvs and bams out of the file queue.
         else:
             tsv_batch = tsv_path_list[:batch_size]
             bam_batch = bam_path_list[:batch_size]
             tsv_path_list = tsv_path_list[batch_size:]
             bam_path_list = bam_path_list[batch_size:]
+        logger.info("tsvs in batch {0} ".format(tsv_batch))
 
+        batch_already_processed = sum([1 for h5 in existing_h5s if f"_{batch}_" in h5.name])
+        ###################################################
         if batch_already_processed:
+            logger.debug(
+                f"Batch {batch} has already been processed into h5ads. Skipping batch and using existing files"
+            )
         else:
             ###################################################
             ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
             # # Initialize dictionaries and place them in a list
+            (
+                dict_total,
+                dict_a,
+                dict_a_bottom,
+                dict_a_top,
+                dict_c,
+                dict_c_bottom,
+                dict_c_top,
+                dict_combined_bottom,
+                dict_combined_top,
+            ) = {}, {}, {}, {}, {}, {}, {}, {}, {}
+            dict_list = [
+                dict_total,
+                dict_a,
+                dict_a_bottom,
+                dict_a_top,
+                dict_c,
+                dict_c_bottom,
+                dict_c_top,
+                dict_combined_bottom,
+                dict_combined_top,
+            ]
             # Give names to represent each dictionary in the list
+            sample_types = [
+                "total",
+                "m6A",
+                "m6A_bottom_strand",
+                "m6A_top_strand",
+                "5mC",
+                "5mC_bottom_strand",
+                "5mC_top_strand",
+                "combined_bottom_strand",
+                "combined_top_strand",
+            ]
             # Give indices of dictionaries to skip for analysis and final dictionary saving.
             dict_to_skip = [0, 1, 4]
             combined_dicts = [7, 8]
@@ -596,7 +746,14 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
             dict_to_skip = set(dict_to_skip)
 
             # # Step 1):Load the dict_total dictionary with all of the batch tsv files as dataframes.
+            dict_total = parallel_load_tsvs(
+                tsv_batch,
+                records_to_analyze,
+                reference_dict,
+                batch,
+                batch_size=len(tsv_batch),
+                threads=threads,
+            )
 
             # # Step 2: Extract modification-specific data (per (record,sample)) in parallel
             # processed_mod_results = parallel_process_modifications(dict_total, mods, max_reference_length, threads=threads or 4)
@@ -621,56 +778,112 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
             # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
             for record in dict_total.keys():
                 for sample_index in dict_total[record].keys():
+                    if "6mA" in mods:
                         # Remove Adenine stranded dicts from the dicts to skip set
                         dict_to_skip.difference_update(set(A_stranded_dicts))
 
+                        if (
+                            record not in dict_a.keys()
+                            and record not in dict_a_bottom.keys()
+                            and record not in dict_a_top.keys()
+                        ):
                             dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
 
                         # get a dictionary of dataframes that only contain methylated adenine positions
+                        dict_a[record][sample_index] = dict_total[record][sample_index][
+                            dict_total[record][sample_index]["modified_primary_base"] == "A"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a methyl-adenine dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
+
                         # Stratify the adenine dictionary into two strand specific dictionaries.
+                        dict_a_bottom[record][sample_index] = dict_a[record][sample_index][
+                            dict_a[record][sample_index]["ref_strand"] == "-"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a minus strand methyl-adenine dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
+                        dict_a_top[record][sample_index] = dict_a[record][sample_index][
+                            dict_a[record][sample_index]["ref_strand"] == "+"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a plus strand methyl-adenine dictionary for ".format(
+                                str(sample_index)
+                            )
+                        )
 
                         # Reassign pointer for dict_a to None and delete the original value that it pointed to in order to decrease memory usage.
                         dict_a[record][sample_index] = None
                         gc.collect()
 
+                    if "5mC" in mods:
                         # Remove Cytosine stranded dicts from the dicts to skip set
                         dict_to_skip.difference_update(set(C_stranded_dicts))
 
+                        if (
+                            record not in dict_c.keys()
+                            and record not in dict_c_bottom.keys()
+                            and record not in dict_c_top.keys()
+                        ):
                             dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
 
                         # get a dictionary of dataframes that only contain methylated cytosine positions
+                        dict_c[record][sample_index] = dict_total[record][sample_index][
+                            dict_total[record][sample_index]["modified_primary_base"] == "C"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a methyl-cytosine dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
                         # Stratify the cytosine dictionary into two strand specific dictionaries.
+                        dict_c_bottom[record][sample_index] = dict_c[record][sample_index][
+                            dict_c[record][sample_index]["ref_strand"] == "-"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a minus strand methyl-cytosine dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
+                        dict_c_top[record][sample_index] = dict_c[record][sample_index][
+                            dict_c[record][sample_index]["ref_strand"] == "+"
+                        ]
+                        logger.debug(
+                            "Successfully loaded a plus strand methyl-cytosine dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
                         # Reassign pointer for dict_c to None and delete the original value that it pointed to in order to decrease memory usage.
                         dict_c[record][sample_index] = None
                         gc.collect()
+
+                    if "6mA" in mods and "5mC" in mods:
                         # Remove combined stranded dicts from the dicts to skip set
+                        dict_to_skip.difference_update(set(combined_dicts))
                         # Initialize the sample keys for the combined dictionaries
 
+                        if (
+                            record not in dict_combined_bottom.keys()
+                            and record not in dict_combined_top.keys()
+                        ):
+                            dict_combined_bottom[record], dict_combined_top[record] = {}, {}
+
+                        logger.debug(
+                            "Successfully created a minus strand combined methylation dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
                         dict_combined_bottom[record][sample_index] = []
+                        logger.debug(
+                            "Successfully created a plus strand combined methylation dictionary for {}".format(
+                                str(sample_index)
+                            )
+                        )
                         dict_combined_top[record][sample_index] = []
 
                     # Reassign pointer for dict_total to None and delete the original value that it pointed to in order to decrease memory usage.
@@ -681,14 +894,24 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
             for dict_index, dict_type in enumerate(dict_list):
                 # Only iterate over stranded dictionaries
                 if dict_index not in dict_to_skip:
+                    logger.debug(
+                        "Extracting methylation states for {} dictionary".format(
+                            sample_types[dict_index]
+                        )
+                    )
                    for record in dict_type.keys():
                         # Get the dictionary for the modification type of interest from the reference mapping of interest
                         mod_strand_record_sample_dict = dict_type[record]
+                        logger.debug(
+                            "Extracting methylation states for {} dictionary".format(record)
+                        )
                         # For each sample in a stranded dictionary
                         n_samples = len(mod_strand_record_sample_dict.keys())
+                        for sample in tqdm(
+                            mod_strand_record_sample_dict.keys(),
+                            desc=f"Extracting {sample_types[dict_index]} dictionary from record {record} for sample",
+                            total=n_samples,
+                        ):
                             # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
                             if dict_index == 7:
                                 # Load the minus strand dictionaries for each sample into temporary variables
@@ -699,16 +922,26 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
                                 for read in set(temp_a_dict) | set(temp_c_dict):
                                     # Add the arrays element-wise if the read is present in both dictionaries
                                     if read in temp_a_dict and read in temp_c_dict:
+                                        mod_strand_record_sample_dict[sample][read] = np.where(
+                                            np.isnan(temp_a_dict[read])
+                                            & np.isnan(temp_c_dict[read]),
+                                            np.nan,
+                                            np.nan_to_num(temp_a_dict[read])
+                                            + np.nan_to_num(temp_c_dict[read]),
+                                        )
                                     # If the read is present in only one dictionary, copy its value
                                     elif read in temp_a_dict:
+                                        mod_strand_record_sample_dict[sample][read] = temp_a_dict[
+                                            read
+                                        ]
                                     elif read in temp_c_dict:
+                                        mod_strand_record_sample_dict[sample][read] = temp_c_dict[
+                                            read
+                                        ]
                                 del temp_a_dict, temp_c_dict
+                            # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
                             elif dict_index == 8:
+                                # Load the plus strand dictionaries for each sample into temporary variables
                                 temp_a_dict = dict_list[3][record][sample].copy()
                                 temp_c_dict = dict_list[6][record][sample].copy()
                                 mod_strand_record_sample_dict[sample] = {}
@@ -716,105 +949,163 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
|
|
|
716
949
|
for read in set(temp_a_dict) | set(temp_c_dict):
|
|
717
950
|
# Add the arrays element-wise if the read is present in both dictionaries
|
|
718
951
|
if read in temp_a_dict and read in temp_c_dict:
|
|
719
|
-
mod_strand_record_sample_dict[sample][read] = np.where(
|
|
952
|
+
mod_strand_record_sample_dict[sample][read] = np.where(
|
|
953
|
+
np.isnan(temp_a_dict[read])
|
|
954
|
+
& np.isnan(temp_c_dict[read]),
|
|
955
|
+
np.nan,
|
|
956
|
+
np.nan_to_num(temp_a_dict[read])
|
|
957
|
+
+ np.nan_to_num(temp_c_dict[read]),
|
|
958
|
+
)
|
|
720
959
|
# If the read is present in only one dictionary, copy its value
|
|
721
960
|
elif read in temp_a_dict:
|
|
722
|
-
mod_strand_record_sample_dict[sample][read] = temp_a_dict[
|
|
961
|
+
mod_strand_record_sample_dict[sample][read] = temp_a_dict[
|
|
962
|
+
read
|
|
963
|
+
]
|
|
723
964
|
elif read in temp_c_dict:
|
|
724
|
-
mod_strand_record_sample_dict[sample][read] = temp_c_dict[
|
|
965
|
+
mod_strand_record_sample_dict[sample][read] = temp_c_dict[
|
|
966
|
+
read
|
|
967
|
+
]
|
|
725
968
|
del temp_a_dict, temp_c_dict
|
|
726
969
|
# For all other dictionaries
|
|
727
970
|
else:
|
|
728
|
-
|
|
729
971
|
# use temp_df to point to the dataframe held in mod_strand_record_sample_dict[sample]
|
|
730
972
|
temp_df = mod_strand_record_sample_dict[sample]
|
|
731
973
|
# reassign the dictionary pointer to a nested dictionary.
|
|
732
974
|
mod_strand_record_sample_dict[sample] = {}
|
|
733
975
|
|
|
734
976
|
# Get relevant columns as NumPy arrays
|
|
735
|
-
read_ids = temp_df[
|
|
736
|
-
positions = temp_df[
|
|
737
|
-
call_codes = temp_df[
|
|
738
|
-
probabilities = temp_df[
|
|
977
|
+
read_ids = temp_df["read_id"].values
|
|
978
|
+
positions = temp_df["ref_position"].values
|
|
979
|
+
call_codes = temp_df["call_code"].values
|
|
980
|
+
probabilities = temp_df["call_prob"].values
|
|
739
981
|
|
|
740
982
|
# Define valid call code categories
|
|
741
|
-
modified_codes = {
|
|
742
|
-
canonical_codes = {
|
|
983
|
+
modified_codes = {"a", "h", "m"}
|
|
984
|
+
canonical_codes = {"-"}
|
|
743
985
|
|
|
744
986
|
         # Vectorized methylation calculation with NaN for other codes
-        methylation_prob = np.full_like(
-
-
+        methylation_prob = np.full_like(
+            probabilities, np.nan
+        )  # Default all to NaN
+        methylation_prob[np.isin(call_codes, list(modified_codes))] = (
+            probabilities[np.isin(call_codes, list(modified_codes))]
+        )
+        methylation_prob[np.isin(call_codes, list(canonical_codes))] = (
+            1 - probabilities[np.isin(call_codes, list(canonical_codes))]
+        )

         # Find unique reads
         unique_reads = np.unique(read_ids)
         # Preallocate storage for each read
         for read in unique_reads:
-            mod_strand_record_sample_dict[sample][read] = np.full(
+            mod_strand_record_sample_dict[sample][read] = np.full(
+                max_reference_length, np.nan
+            )

         # Efficient NumPy indexing to assign values
         for i in range(len(read_ids)):
             read = read_ids[i]
             pos = positions[i]
             prob = methylation_prob[i]
-
+
             # Assign methylation probability
             mod_strand_record_sample_dict[sample][read][pos] = prob

-
     # Save the sample files in the batch as gzipped hdf5 files
-
+    logger.info("Converting batch {} dictionaries to anndata objects".format(batch))
     for dict_index, dict_type in enumerate(dict_list):
         if dict_index not in dict_to_skip:
             # Initialize an hdf5 file for the current modified strand
             adata = None
-
+            logger.info(
+                "Converting {} dictionary to an anndata object".format(
+                    sample_types[dict_index]
+                )
+            )
             for record in dict_type.keys():
                 # Get the dictionary for the modification type of interest from the reference mapping of interest
                 mod_strand_record_sample_dict = dict_type[record]
                 for sample in mod_strand_record_sample_dict.keys():
-
+                    logger.info(
+                        "Converting {0} dictionary for sample {1} to an anndata object".format(
+                            sample_types[dict_index], sample
+                        )
+                    )
                     sample = int(sample)
                     final_sample_index = sample + (batch * batch_size)
-
-
-
-
+                    logger.info(
+                        "Final sample index for sample: {}".format(final_sample_index)
+                    )
+                    logger.debug(
+                        "Converting {0} dictionary for sample {1} to a dataframe".format(
+                            sample_types[dict_index],
+                            final_sample_index,
+                        )
+                    )
+                    temp_df = pd.DataFrame.from_dict(
+                        mod_strand_record_sample_dict[sample], orient="index"
+                    )
+                    mod_strand_record_sample_dict[sample] = (
+                        None  # reassign pointer to facilitate memory usage
+                    )
                     sorted_index = sorted(temp_df.index)
                     temp_df = temp_df.reindex(sorted_index)
                     X = temp_df.values
-                    dataset, strand = sample_types[dict_index].split(
-
-
+                    dataset, strand = sample_types[dict_index].split("_")[:2]
+
+                    logger.info(
+                        "Loading {0} dataframe for sample {1} into a temp anndata object".format(
+                            sample_types[dict_index],
+                            final_sample_index,
+                        )
+                    )
                     temp_adata = ad.AnnData(X)
                     if temp_adata.shape[0] > 0:
-
+                        logger.info(
+                            "Adding read names and position ids to {0} anndata for sample {1}".format(
+                                sample_types[dict_index],
+                                final_sample_index,
+                            )
+                        )
                         temp_adata.obs_names = temp_df.index
                         temp_adata.obs_names = temp_adata.obs_names.astype(str)
                         temp_adata.var_names = temp_df.columns
                         temp_adata.var_names = temp_adata.var_names.astype(str)
-
-
-
-
-
-
-                        temp_adata.obs[
-
-
+                        logger.info(
+                            "Adding {0} anndata for sample {1}".format(
+                                sample_types[dict_index],
+                                final_sample_index,
+                            )
+                        )
+                        temp_adata.obs["Sample"] = [
+                            sample_name_map[final_sample_index]
+                        ] * len(temp_adata)
+                        temp_adata.obs["Barcode"] = [barcode_map[final_sample_index]] * len(
+                            temp_adata
+                        )
+                        temp_adata.obs["Reference"] = [f"{record}"] * len(temp_adata)
+                        temp_adata.obs["Strand"] = [strand] * len(temp_adata)
+                        temp_adata.obs["Dataset"] = [dataset] * len(temp_adata)
+                        temp_adata.obs["Reference_dataset_strand"] = [
+                            f"{record}_{dataset}_{strand}"
+                        ] * len(temp_adata)
+                        temp_adata.obs["Reference_strand"] = [f"{record}_{strand}"] * len(
+                            temp_adata
+                        )
+
                         # Load in the one hot encoded reads from the current sample and record
                         one_hot_reads = {}
                         n_rows_OHE = 5
-                        ohe_files = bam_record_ohe_files[f
-
+                        ohe_files = bam_record_ohe_files[f"{final_sample_index}_{record}"]
+                        logger.info(f"Loading OHEs from {ohe_files}")
                         fwd_mapped_reads = set()
                         rev_mapped_reads = set()
                         for ohe_file in ohe_files:
                             tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
                             one_hot_reads.update(tmp_ohe_dict)
-                            if
+                            if "_fwd_" in ohe_file:
                                 fwd_mapped_reads.update(tmp_ohe_dict.keys())
-                            elif
+                            elif "_rev_" in ohe_file:
                                 rev_mapped_reads.update(tmp_ohe_dict.keys())
                             del tmp_ohe_dict

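The rewritten block at the top of this hunk vectorizes the per-call methylation probability: every call defaults to NaN, calls whose code is in `modified_codes` keep their reported probability, and calls in `canonical_codes` are flipped to `1 - p`. A minimal sketch of that masking pattern with toy arrays; the code values and variable names below are illustrative, not the package's API:

```python
import numpy as np

# Toy per-call columns as they might come out of a modkit extract table:
# a classification code and the probability of the emitted call.
call_codes = np.array(["m", "-", "m", "h", "-"])
probabilities = np.array([0.91, 0.88, 0.40, 0.75, 0.97])
modified_codes, canonical_codes = {"m", "h"}, {"-"}  # illustrative code sets

# Default everything to NaN, then fill in only the codes we recognize.
methylation_prob = np.full_like(probabilities, np.nan)
mod_mask = np.isin(call_codes, list(modified_codes))
can_mask = np.isin(call_codes, list(canonical_codes))
methylation_prob[mod_mask] = probabilities[mod_mask]      # keep P(modified)
methylation_prob[can_mask] = 1 - probabilities[can_mask]  # convert P(canonical) to P(modified)

print(methylation_prob)  # [0.91 0.12 0.4  0.75 0.03]
```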
@@ -823,18 +1114,20 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
                         read_mapping_direction = []
                         for read_id in temp_adata.obs_names:
                             if read_id in fwd_mapped_reads:
-                                read_mapping_direction.append(
+                                read_mapping_direction.append("fwd")
                             elif read_id in rev_mapped_reads:
-                                read_mapping_direction.append(
+                                read_mapping_direction.append("rev")
                             else:
-                                read_mapping_direction.append(
+                                read_mapping_direction.append("unk")

-                        temp_adata.obs[
+                        temp_adata.obs["Read_mapping_direction"] = read_mapping_direction

                         del temp_df
-
+
                         # Initialize NumPy arrays
-                        sequence_length =
+                        sequence_length = (
+                            one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                        )
                         df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
                         df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
                         df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
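In this hunk the per-position length is recovered from the stored one-hot encodings rather than tracked separately: each read's encoding appears to be kept as a flattened 5 x L array (rows A, C, G, T, N), so reshaping with `n_rows_OHE` rows and taking `.shape[1]` yields the sequence length. A small sketch under that assumption; the encoding helper and read name are hypothetical:

```python
import numpy as np

n_rows_OHE = 5  # one row per base: A, C, G, T, N
base_to_row = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}

def encode_read(seq):
    # Build a 5 x L one-hot matrix and flatten it, mimicking how a read
    # encoding could be stored (hypothetical helper for illustration only).
    ohe = np.zeros((n_rows_OHE, len(seq)), dtype=int)
    for pos, base in enumerate(seq):
        ohe[base_to_row[base], pos] = 1
    return ohe.flatten()

one_hot_reads = {"read_0": encode_read("ACGTNACGT")}

# Recover the per-position length the same way the updated code does.
sequence_length = one_hot_reads["read_0"].reshape(n_rows_OHE, -1).shape[1]
print(sequence_length)  # 9
```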
@@ -855,7 +1148,11 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
                         gc.collect()

                         # Fill the arrays
-                        for j, read_name in tqdm(
+                        for j, read_name in tqdm(
+                            enumerate(sorted_index),
+                            desc="Loading dataframes of OHE reads",
+                            total=len(sorted_index),
+                        ):
                             df_A[j, :] = dict_A[read_name]
                             df_C[j, :] = dict_C[read_name]
                             df_G[j, :] = dict_G[read_name]
@@ -867,43 +1164,78 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp

                         # Store the results in AnnData layers
                         ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-                        for j, base in enumerate([
-                            temp_adata.layers[f
-
-
-
+                        for j, base in enumerate(["A", "C", "G", "T", "N"]):
+                            temp_adata.layers[f"{base}_binary_sequence_encoding"] = (
+                                ohe_df_map[j]
+                            )
+                            ohe_df_map[j] = (
+                                None  # Reassign pointer for memory usage purposes
+                            )
+
+                        # If final adata object already has a sample loaded, concatenate the current sample into the existing adata object
                         if adata:
                             if temp_adata.shape[0] > 0:
-
-
+                                logger.info(
+                                    "Concatenating {0} anndata object for sample {1}".format(
+                                        sample_types[dict_index],
+                                        final_sample_index,
+                                    )
+                                )
+                                adata = ad.concat(
+                                    [adata, temp_adata], join="outer", index_unique=None
+                                )
                                 del temp_adata
                             else:
-
+                                logger.warning(
+                                    f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omiting from final adata"
+                                )
                         else:
                             if temp_adata.shape[0] > 0:
-
+                                logger.info(
+                                    "Initializing {0} anndata object for sample {1}".format(
+                                        sample_types[dict_index],
+                                        final_sample_index,
+                                    )
+                                )
                                 adata = temp_adata
                             else:
-
+                                logger.warning(
+                                    f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omiting from final adata"
+                                )

                         gc.collect()
                     else:
-
+                        logger.warning(
+                            f"{sample} did not have any mapped reads on {record}_{dataset}_{strand}, omiting from final adata. Skipping sample."
+                        )

             try:
-
-
-
-
-
-
-
+                logger.info(
+                    "Writing {0} anndata out as a hdf5 file".format(
+                        sample_types[dict_index]
+                    )
+                )
+                adata.write_h5ad(
+                    h5_dir
+                    / "{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz".format(
+                        readwrite.date_string(), batch, sample_types[dict_index]
+                    ),
+                    compression="gzip",
+                )
+            except Exception:
+                logger.debug("Skipping writing anndata for sample")
+
+    try:
+        # Delete the batch dictionaries from memory
+        del dict_list, adata
+    except Exception:
+        pass
    gc.collect()

# Iterate over all of the batched hdf5 files and concatenate them.
-files = h5_dir.iterdir()
+files = h5_dir.iterdir()
# Filter file names that contain the search string in their filename and keep them in a list
-hdfs = [hdf for hdf in files if
+hdfs = [hdf for hdf in files if "hdf5.h5ad" in hdf.name and hdf != final_hdf]
 combined_hdfs = [hdf for hdf in hdfs if "combined" in hdf.name]
 if len(combined_hdfs) > 0:
     hdfs = combined_hdfs
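The batch assembly above grows one AnnData per modification/strand dictionary by outer-joining each sample's temporary object, so reads mapped to references with different coordinate sets still share a single, NaN-padded variable axis, and the result is written out as a gzip-compressed .h5ad. A minimal sketch of that accumulation pattern with toy matrices; the sample builder and output file name are illustrative:

```python
import anndata as ad
import numpy as np

def toy_sample(n_reads, positions, name):
    # One row per read, one column per reference position.
    adata = ad.AnnData(np.random.rand(n_reads, len(positions)))
    adata.obs_names = [f"{name}_read{i}" for i in range(n_reads)]
    adata.var_names = [str(p) for p in positions]
    adata.obs["Sample"] = [name] * n_reads
    return adata

adata = None
for temp_adata in (toy_sample(3, range(0, 5), "s1"), toy_sample(2, range(2, 7), "s2")):
    if adata is None:
        adata = temp_adata
    else:
        # Outer join keeps the union of positions; missing entries become NaN.
        adata = ad.concat([adata, temp_adata], join="outer", index_unique=None)

print(adata.shape)  # (5, 7): 5 reads over the union of positions
adata.write_h5ad("toy_batch.h5ad.gz", compression="gzip")
```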
@@ -911,55 +1243,62 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
     pass
 # Sort file list by names and print the list of file names
 hdfs.sort()
-
-hdf_paths = [
+logger.info("{0} sample files found: {1}".format(len(hdfs), hdfs))
+hdf_paths = [hd5 for hd5 in hdfs]
 final_adata = None
 for hdf_index, hdf in enumerate(hdf_paths):
-
+    logger.info("Reading in {} hdf5 file".format(hdfs[hdf_index]))
     temp_adata = ad.read_h5ad(hdf)
     if final_adata:
-
-
+        logger.info(
+            "Concatenating final adata object with {} hdf5 file".format(hdfs[hdf_index])
+        )
+        final_adata = ad.concat([final_adata, temp_adata], join="outer", index_unique=None)
     else:
-
+        logger.info("Initializing final adata object with {} hdf5 file".format(hdfs[hdf_index]))
         final_adata = temp_adata
     del temp_adata

 # Set obs columns to type 'category'
 for col in final_adata.obs.columns:
-    final_adata.obs[col] = final_adata.obs[col].astype(
+    final_adata.obs[col] = final_adata.obs[col].astype("category")

-ohe_bases = [
-ohe_layers = [f"{ohe_base}
-final_adata.uns[
+ohe_bases = ["A", "C", "G", "T"]  # ignore N bases for consensus
+ohe_layers = [f"{ohe_base}_binary_sequence_encoding" for ohe_base in ohe_bases]
+final_adata.uns["References"] = {}
 for record in records_to_analyze:
     # Add FASTA sequence to the object
     sequence = record_seq_dict[record][0]
     complement = record_seq_dict[record][1]
-    final_adata.var[f
-    final_adata.var[f
-    final_adata.uns[f
-    final_adata.uns[
+    final_adata.var[f"{record}_top_strand_FASTA_base"] = list(sequence)
+    final_adata.var[f"{record}_bottom_strand_FASTA_base"] = list(complement)
+    final_adata.uns[f"{record}_FASTA_sequence"] = sequence
+    final_adata.uns["References"][f"{record}_FASTA_sequence"] = sequence
     # Add consensus sequence of samples mapped to the record to the object
-    record_subset = final_adata[final_adata.obs[
-    for strand in record_subset.obs[
-        strand_subset = record_subset[record_subset.obs[
-        for mapping_dir in strand_subset.obs[
-            mapping_dir_subset = strand_subset[
+    record_subset = final_adata[final_adata.obs["Reference"] == record]
+    for strand in record_subset.obs["Strand"].cat.categories:
+        strand_subset = record_subset[record_subset.obs["Strand"] == strand]
+        for mapping_dir in strand_subset.obs["Read_mapping_direction"].cat.categories:
+            mapping_dir_subset = strand_subset[
+                strand_subset.obs["Read_mapping_direction"] == mapping_dir
+            ]
             layer_map, layer_counts = {}, []
             for i, layer in enumerate(ohe_layers):
-                layer_map[i] = layer.split(
+                layer_map[i] = layer.split("_")[0]
                 layer_counts.append(np.sum(mapping_dir_subset.layers[layer], axis=0))
             count_array = np.array(layer_counts)
             nucleotide_indexes = np.argmax(count_array, axis=0)
             consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
-            final_adata.var[
+            final_adata.var[
+                f"{record}_{strand}_{mapping_dir}_consensus_sequence_from_all_samples"
+            ] = consensus_sequence_list

 if input_already_demuxed:
     final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
     final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
 else:
     from .h5ad_functions import add_demux_type_annotation
+
     double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
     add_demux_type_annotation(final_adata, double_barcoded_reads)

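The consensus-sequence step above picks, for every reference position, the base whose binary sequence-encoding layer has the most read support: a column-wise argmax over the per-base count sums. A compact sketch of that argmax logic with made-up counts:

```python
import numpy as np

# Column-wise sums of the A/C/G/T binary sequence-encoding layers
# (one row per base, one column per reference position; toy numbers).
layer_map = {0: "A", 1: "C", 2: "G", 3: "T"}
layer_counts = [
    np.array([9, 0, 1, 2]),  # reads supporting A at each position
    np.array([1, 8, 0, 1]),  # C
    np.array([0, 1, 7, 0]),  # G
    np.array([0, 1, 2, 7]),  # T
]

count_array = np.array(layer_counts)
nucleotide_indexes = np.argmax(count_array, axis=0)  # winning base index per column
consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
print("".join(consensus_sequence_list))  # ACGT
```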
@@ -967,4 +1306,4 @@ def modkit_extract_to_adata(fasta, bam_dir, out_dir, input_already_demuxed, mapp
 if delete_batch_hdfs:
     delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)

-return final_adata, final_adata_path
+return final_adata, final_adata_path