smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM + bam_suffix
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-    # Make a BAM index file for the BAMs in that directory
-    bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
@@ -1,4 +1,4 @@
-def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
+def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
     """
     Efficiently extracts base identities from mapped reads with reference coordinates.
 
@@ -7,6 +7,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
         chromosome (str): Name of the reference chromosome.
         positions (list): Positions to extract (0-based).
         max_reference_length (int): Maximum reference length for padding.
+        sequence (str): The sequence of the record fasta
 
     Returns:
         dict: Base identities from forward mapped reads.
@@ -16,16 +17,19 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     import numpy as np
     from collections import defaultdict
     import time
+    from collections import defaultdict, Counter
 
     timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
 
     positions = set(positions)
     fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
     rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
+        ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):
             if not read.is_mapped:
                 continue  # Skip unmapped reads
@@ -39,6 +43,28 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
 
             for read_position, reference_position in aligned_pairs:
                 if reference_position in positions:
-                    base_dict[read_name][reference_position] = query_sequence[read_position]
+                    read_base = query_sequence[read_position]
+                    ref_base = ref_seq[reference_position]
 
-    return dict(fwd_base_identities), dict(rev_base_identities)
+                    base_dict[read_name][reference_position] = read_base
+
+                    # Track mismatches (excluding Ns)
+                    if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+
+    # Determine C→T vs G→A dominance per read
+    mismatch_trend_per_read = {}
+    for read_name, ref_dict in mismatch_counts_per_read.items():
+        c_to_t = ref_dict.get("C", {}).get("T", 0)
+        g_to_a = ref_dict.get("G", {}).get("A", 0)
+
+        if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
+            mismatch_trend_per_read[read_name] = "equal"
+        elif c_to_t > g_to_a:
+            mismatch_trend_per_read[read_name] = "C->T"
+        elif g_to_a > c_to_t:
+            mismatch_trend_per_read[read_name] = "G->A"
+        else:
+            mismatch_trend_per_read[read_name] = "none"
+
+    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
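Note: the new per-read mismatch tallies drive the C->T vs G->A trend call; since the counts are integers, the `abs(c_to_t - g_to_a) < 0.01` test is effectively an equality check. A standalone sketch of the bookkeeping on toy data (not the package function itself):

    from collections import Counter, defaultdict

    # Toy mismatch stream for one read: (reference base, read base) pairs
    observed = [("C", "T"), ("C", "T"), ("G", "A"), ("C", "G")]

    mismatches = defaultdict(Counter)  # ref_base -> Counter of observed read bases
    for ref_base, read_base in observed:
        if ref_base != read_base and "N" not in (ref_base, read_base):
            mismatches[ref_base][read_base] += 1

    c_to_t = mismatches["C"]["T"]  # 2
    g_to_a = mismatches["G"]["A"]  # 1
    if c_to_t == g_to_a and c_to_t > 0:
        trend = "equal"
    elif c_to_t > g_to_a:
        trend = "C->T"
    elif g_to_a > c_to_t:
        trend = "G->A"
    else:
        trend = "none"
    print(trend)  # C->T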
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     import glob
     import zipfile
 
-    os.chdir(mod_tsv_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     for input_file in bam_files:
        print(input_file)
        # Extract the file basename
-        file_name = os.path.basename(input_file)
+        file_name = input_file.name
        if skip_unclassified and "unclassified" in file_name:
            print("Skipping modkit extract on unclassified reads")
        else:
            # Construct the output TSV file path
-            output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
-            output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
-            if os.path.exists(f"{output_tsv}.gz"):
-                print(f"{output_tsv}.gz already exists, skipping modkit extract")
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
            else:
                print(f"Extracting modification data from {input_file}")
                if modkit_summary:
                    # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                else:
                    pass
                # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
                        "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                else:
                    extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"m:{m5C_threshold}",
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                subprocess.run(extract_command)
                # Zip the output TSV
                print(f'zipping {output_tsv}')
                if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
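Note: two of the new expressions above look fragile as released. `glob.glob()` returns plain strings even when given a `Path`, so the later `input_file.name` / `file_name.stem` attribute lookups would fail on `str`, and `Path + str` concatenation raises `TypeError`. A hedged sketch of how the intended output-path construction can be spelled with pathlib alone (paths hypothetical):

    from pathlib import Path

    input_file = Path("demux/sample_barcode01.bam")  # hypothetical split BAM
    mod_tsv_dir = Path("mod_tsvs")

    output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
    output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")
    print(output_tsv)     # mod_tsvs/sample_barcode01_extract.tsv
    print(output_tsv_gz)  # mod_tsvs/sample_barcode01_extract.tsv.gz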
@@ -2,7 +2,7 @@
 
 def extract_read_features_from_bam(bam_file_path):
     """
-    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length.
+    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
     Params:
         bam_file_path (str):
     Returns:
@@ -26,6 +26,8 @@ def extract_read_features_from_bam(bam_file_path):
         reference_name = read.reference_name
         reference_index = bam_file.references.index(reference_name)
         reference_length = reference_lengths[reference_index]
-        read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length]
+        mapped_length = sum(end - start for start, end in read.get_blocks())
+        mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
+        read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
 
     return read_metrics
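Note: pysam's `AlignedSegment.get_blocks()` returns the gapless reference blocks an alignment covers, so summing their spans counts aligned bases only (insertions, deletions, clips, and skips do not contribute). A minimal standalone sketch of the new metric (BAM path hypothetical; fetch() needs a .bai index):

    import pysam

    with pysam.AlignmentFile("aligned_sorted.bam", "rb") as bam:
        for read in bam.fetch():
            if read.is_unmapped:
                continue
            # (start, end) per gapless aligned block, summed to a mapped length
            mapped_length = sum(end - start for start, end in read.get_blocks())
            print(read.query_name, read.query_length, mapped_length, read.mapping_quality)
            break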
@@ -1,11 +1,12 @@
-def find_conversion_sites(fasta_file, modification_type, conversion_types):
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
     """
     Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
 
     Parameters:
         fasta_file (str): Path to the converted reference FASTA.
         modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
-        conversion_types (list): List of conversion types. The first element is the unconverted record type.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
         dict: Dictionary where keys are **both unconverted & converted record names**.
@@ -14,7 +15,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     """
     import numpy as np
     from Bio import SeqIO
-    unconverted = conversion_types[0]
+    unconverted = conversions[0]
     record_dict = {}
 
     # Define base mapping based on modification type
@@ -26,7 +27,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     # Read FASTA file and process records
     with open(fasta_file, "r") as f:
         for record in SeqIO.parse(f, "fasta"):
-            if unconverted in record.id:
+            if unconverted in record.id or deaminase_footprinting:
                 sequence = str(record.seq).upper()
                 complement = str(record.seq.complement()).upper()
                 sequence_length = len(sequence)
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path = f'{fasta}.fai'
-    if os.path.exists(index_path):
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir = os.path.dirname(fasta)
-    fasta_basename = os.path.basename(fasta)
-    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
-    chrom_path = os.path.join(parent_dir, chrom_basename)
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if os.path.exists(chrom_path):
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
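Note: one pathlib subtlety in this hunk. The `/` operator joins path components, so `fasta / '.fai'` produces a child path, not the sibling index file the old f-string built. A short sketch of the distinction (path illustrative):

    from pathlib import Path

    fasta = Path("refs/genome.fa")
    print(fasta / ".fai")                            # refs/genome.fa/.fai (child path)
    print(fasta.with_suffix(fasta.suffix + ".fai"))  # refs/genome.fa.fai (sibling index)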
@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
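A usage sketch with the definition above in scope (paths illustrative). Note that `with_suffix(".chrom.sizes")` replaces the FASTA's final suffix, so `genome.fa` maps to `genome.chrom.sizes`:

    chrom_sizes = index_fasta("refs/genome.fa")
    # writes refs/genome.fa.fai and refs/genome.chrom.sizes, returns the latter

    fai = index_fasta("refs/genome.fa", write_chrom_sizes=False)
    # returns refs/genome.fa.fai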
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
@@ -0,0 +1,250 @@
+# plot_bed_histograms
+
+def plot_bed_histograms(
+    bed_file,
+    plotting_directory,
+    fasta,
+    *,
+    bins=60,
+    clip_quantiles=(0.0, 0.995),
+    cov_bin_size=1000,            # coverage bin size in bp
+    rows_per_fig=6,               # paginate if many chromosomes
+    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
+):
+    """
+    Plot per-chromosome QC grids from a BED-like file.
+
+    Expects columns:
+        chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+    For each chromosome:
+      - Column 1: Read length histogram
+      - Column 2: Coverage across the chromosome (binned)
+      - (optional) Column 3: MAPQ histogram
+      - (optional) Column 4: Avg base quality histogram
+
+    The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
+    Saves one PNG per page under `plotting_directory`.
+
+    Parameters
+    ----------
+    bed_file : str
+    plotting_directory : str
+    fasta : str
+        Reference FASTA (used to get chromosome lengths).
+    bins : int
+        Histogram bins for read length / MAPQ / quality.
+    clip_quantiles : (float, float)
+        Clip hist tails for readability (e.g., (0, 0.995)).
+    cov_bin_size : int
+        Bin size (bp) for coverage plot; bigger = faster/coarser.
+    rows_per_fig : int
+        Number of chromosomes per page.
+    include_mapq_quality : bool
+        If True, add MAPQ and avg base quality histograms as extra columns.
+    coordinate_mode : {"one_based","zero_based"}
+        One-based, inclusive (your file) vs BED-standard zero-based, half-open.
+    """
+    import os
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import pysam
+
+    os.makedirs(plotting_directory, exist_ok=True)
+
+    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+    print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+    # Load BED-like table
+    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+        'mapq': float, 'avg_q': float
+    })
+
+    # Drop unaligned records (chrom == '*') if present
+    df = df[df['chrom'] != '*'].copy()
+    if df.empty:
+        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        return
+
+    # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
+    # Input is typically one_based inclusive (from your writer).
+    if coordinate_mode not in {"one_based", "zero_based"}:
+        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+    if coordinate_mode == "one_based":
+        # convert to 0-based half-open [start0, end0)
+        start0 = df['start'].to_numpy() - 1
+        end0 = df['end'].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
+    else:
+        # already 0-based half-open (assumption)
+        start0 = df['start'].to_numpy()
+        end0 = df['end'].to_numpy()
+
+    # Clip helper for hist tails
+    def _clip_series(s, q=(0.0, 0.995)):
+        if q is None:
+            return s.to_numpy()
+        lo = s.quantile(q[0]) if q[0] is not None else s.min()
+        hi = s.quantile(q[1]) if q[1] is not None else s.max()
+        x = s.to_numpy(dtype=float)
+        return np.clip(x, lo, hi)
+
+    # Load chromosome order/lengths from FASTA
+    with pysam.FastaFile(fasta) as fa:
+        ref_names = list(fa.references)
+        ref_lengths = dict(zip(ref_names, fa.lengths))
+
+    # Keep only chroms present in FASTA and with at least one read
+    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    # Order chromosomes by FASTA order
+    chrom_order = [c for c in ref_names if c in chroms]
+
+    if not chrom_order:
+        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        return
+
+    # Pagination
+    def _sanitize(name: str) -> str:
+        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+    cols_per_fig = 4 if include_mapq_quality else 2
+
+    for start_idx in range(0, len(chrom_order), rows_per_fig):
+        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        nrows = len(chunk)
+        ncols = cols_per_fig
+
+        fig, axes = plt.subplots(
+            nrows=nrows, ncols=ncols,
+            figsize=(4.0 * ncols, 2.6 * nrows),
+            dpi=160,
+            squeeze=False
+        )
+
+        for r, chrom in enumerate(chunk):
+            chrom_len = ref_lengths[chrom]
+            mask = (df['chrom'].to_numpy() == chrom)
+
+            # Slice per-chrom arrays for speed
+            s0 = start0[mask]
+            e0 = end0[mask]
+            len_arr = df.loc[mask, 'read_len']
+            mapq_arr = df.loc[mask, 'mapq']
+            q_arr = df.loc[mask, 'avg_q']
+
+            # --- Col 1: Read length histogram (clipped) ---
+            ax = axes[r, 0]
+            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            if r == 0:
+                ax.set_title("Read length")
+            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+            ax.set_xlabel("bp")
+            ax.grid(alpha=0.25)
+
+            # --- Col 2: Coverage (binned over genome) ---
+            ax = axes[r, 1]
+            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+            # Bin edges in 0-based coords
+            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+            # Compute per-bin "read count coverage": number of reads overlapping each bin.
+            # Approximate by incrementing all bins touched by the interval.
+            # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
+            cov = np.zeros(nb, dtype=np.int32)
+            # bin indices overlapped by each read (0-based half-open)
+            b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+            b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+            # ensure valid ordering
+            b_lo = np.minimum(b0, b1)
+            b_hi = np.maximum(b0, b1)
+
+            # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
+            for lo, hi in zip(b_lo, b_hi):
+                cov[lo:hi + 1] += 1
+
+            x_mid = (edges[:-1] + edges[1:]) / 2.0
+            ax.plot(x_mid, cov)
+            if r == 0:
+                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+            ax.set_xlim(0, chrom_len)
+            ax.set_xlabel("Position (bp)")
+            ax.set_ylabel("")  # already show chrom on col 1
+            ax.grid(alpha=0.25)
+
+            if include_mapq_quality:
+                # --- Col 3: MAPQ ---
+                ax = axes[r, 2]
+                # Clip MAPQ upper tail if needed (usually 60)
+                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("MAPQ")
+                ax.set_xlabel("MAPQ")
+                ax.grid(alpha=0.25)
+
+                # --- Col 4: Avg base quality ---
+                ax = axes[r, 3]
+                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("Avg base qual")
+                ax.set_xlabel("Phred")
+                ax.grid(alpha=0.25)
+
+        fig.suptitle(
+            f"{bed_basename} — per-chromosome QC "
+            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+            y=0.995, fontsize=11
+        )
+        fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+        page = start_idx // rows_per_fig + 1
+        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+        plt.savefig(out_png, bbox_inches="tight")
+        plt.close(fig)
+
+    print("[plot_bed_histograms] Done.")
+
+
+# bed_basename = os.path.basename(bed_file).split('.bed')[0]
+# # Load the BED file into a DataFrame
+# print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
+# df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name', 'mapq', 'read_quality'])
+
+# # Group by chromosome
+# grouped = df.groupby('chromosome')
+
+# # for each chromosome, get the record length of that chromosome from the fasta. Use from 0 to this length for the positional coverage plot.
+
+# # Change below and make a plot grid instead. For each, make row for chromsome, col for read length and coverage
+# # Clip the outliers to make plots cleaner
+
+# for chrom, group in grouped:
+#     # Plot read length histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
+#     plt.title(f'Read Length Histogram of reads aligned to {chrom}')
+#     plt.xlabel('Read Length')
+#     plt.ylabel('Count')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
+
+#     # Compute coverage
+#     coverage = np.zeros(group['end'].max())
+#     for _, row in group.iterrows():
+#         coverage[row['start']:row['end']] += 1
+
+#     # Plot coverage histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.plot(coverage, color='b')
+#     plt.title(f'Coverage Histogram for {chrom}')
+#     plt.xlabel('Position')
+#     plt.ylabel('Coverage')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
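With the full definition above in scope, a call might look like the following (inputs illustrative); each page PNG holds up to `rows_per_fig` chromosomes with read-length, coverage, and optional MAPQ/quality panels:

    plot_bed_histograms(
        "qc/sample_alignments.bed",  # chrom, start, end, read_len, qname, mapq, avg_q
        "qc/plots",
        "refs/genome.fa",
        cov_bin_size=500,            # finer coverage bins than the 1000 bp default
        rows_per_fig=6,
    )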
@@ -1,6 +1,5 @@
 ## separate_bam_by_bc
 
-# General
 def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     """
     Separates an input BAM file on the BC SAM tag values.
@@ -16,24 +15,26 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base = os.path.basename(input_bam)
-    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
         for read in bam:
             try:
                 # Get the barcode tag value
-                bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
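Note: `read.get_tag("BC", with_value_type=True)` returns a `(value, type_code)` tuple, and the revision now keeps the full tag value (e.g. `barcode01`) instead of splitting on the `barcode` prefix. A minimal sketch (BAM path hypothetical; reads lacking a BC tag raise KeyError, as handled above):

    import pysam

    with pysam.AlignmentFile("demux/sample.bam", "rb") as bam:
        read = next(iter(bam))
        value, type_code = read.get_tag("BC", with_value_type=True)
        print(value, type_code)  # e.g. "barcode01" "Z"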
@@ -1,36 +1,32 @@
 ## split_and_index_BAM
 
-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory):
+def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
-        output_directory (str): A file path to the directory to output all the analyses.
 
     Returns:
         None
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import subprocess
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
-    plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
-    bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
-    make_dirs([plotting_dir, bed_dir])
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix = readwrite.date_string()
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
+        pysam.index(input_file)
 
     return bam_files
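Note: `pysam.index` wraps `samtools index` in-process, which is what replaces the `subprocess` call here. An equivalent one-liner (path illustrative):

    import pysam

    pysam.index("demux/run_barcode01.bam")  # writes demux/run_barcode01.bam.bai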