smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,7 @@
1
1
  # Direct (Nanopore modified base calling) footprinting defaults
2
2
  extends: default
3
+
4
+ ######## smftools load params #########
3
5
  filter_threshold: 0.8 # min threshold to call a canonical base
4
6
  m6A_threshold: 0.7 # min threshold to call a modified m6a base
5
7
  m5C_threshold: 0.7 # min threshold to call a modified 5mC base
@@ -12,6 +14,28 @@ thresholds:
12
14
  mod_list:
13
15
  - '5mC_5hmC'
14
16
  - '6mA' # mods to detect
17
+ mod_target_bases:
18
+ - "A"
19
+ enzyme_target_bases:
20
+ - "A"
15
21
  batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
16
22
  skip_unclassified: True # Whether to skip unclassified barcodes
17
- delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
23
+ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
24
+
25
+ ######## smftools preprocess params ########
26
+ fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position-by-position thresholds for modification binarization.
27
+ binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
28
+ positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
29
+ negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
30
+ infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
31
+ inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
32
+ fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
33
+ output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
34
+
35
+ ######## smftools spatial params #########
36
+ autocorr_site_types:
37
+ - "A"
38
+
39
+ ######## smftools hmm params #########
40
+ hmm_methbases:
41
+ - "A"
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Iterable, Union
5
+
6
+ def discover_input_files(
7
+ input_data_path: Union[str, Path],
8
+ bam_suffix: str = ".bam",
9
+ recursive: bool = False,
10
+ follow_symlinks: bool = False,
11
+ ) -> Dict[str, Any]:
12
+ """
13
+ Discover input files under `input_data_path`.
14
+
15
+ Returns a dict with:
16
+ - pod5_paths, fast5_paths, fastq_paths, bam_paths, other_paths (lists of Path)
17
+ - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
18
+ - all_files_searched (int)
19
+
20
+ Behavior:
21
+ - If `input_data_path` is a file, returns that single file categorized.
22
+ - If a directory, scans immediate children (recursive=False) or entire tree (recursive=True).
23
+ - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
24
+ """
25
+ p = Path(input_data_path)
26
+
27
+ # normalize bam suffix with a leading dot and lower
28
+ if not bam_suffix.startswith("."):
29
+ bam_suffix = "." + bam_suffix
30
+ bam_suffix = bam_suffix.lower()
31
+
32
+ # Sets of canonical extension keys we’ll compare against
33
+ pod5_exts = {".pod5", ".p5"}
34
+ fast5_exts = {".fast5", ".f5"}
35
+ fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
36
+ h5ad_exts = {".h5ad", ".h5"}
37
+ compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
38
+
39
+ def ext_key(pp: Path) -> str:
40
+ """
41
+ A robust extension key: last suffix, or last two if the final one is a compressor (.gz/.bz2/.xz/.zst).
42
+ Examples:
43
+ a.fastq.gz -> ".fastq.gz"
44
+ a.fq.xz -> ".fq.xz"
45
+ a.bam -> ".bam"
46
+ a -> ""
47
+ """
48
+ suff = [s.lower() for s in pp.suffixes]
49
+ if not suff:
50
+ return ""
51
+ if suff[-1] in compressed_exts and len(suff) >= 2:
52
+ return suff[-2] + suff[-1]
53
+ return suff[-1]
54
+
55
+ pod5_paths: List[Path] = []
56
+ fast5_paths: List[Path] = []
57
+ fastq_paths: List[Path] = []
58
+ bam_paths: List[Path] = []
59
+ h5ad_paths: List[Path] = []
60
+ other_paths: List[Path] = []
61
+
62
+ def categorize_file(fp: Path) -> None:
63
+ key = ext_key(fp)
64
+ if key in pod5_exts:
65
+ pod5_paths.append(fp)
66
+ elif key in fast5_exts:
67
+ fast5_paths.append(fp)
68
+ elif key in fastq_exts:
69
+ fastq_paths.append(fp)
70
+ elif key in h5ad_exts:
71
+ h5ad_paths.append(fp)
72
+ elif key == bam_suffix:
73
+ bam_paths.append(fp)
74
+ else:
75
+ other_paths.append(fp)
76
+
77
+ if not p.exists():
78
+ raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
79
+
80
+ total_searched = 0
81
+
82
+ if p.is_file():
83
+ total_searched = 1
84
+ categorize_file(p)
85
+ else:
86
+ # Directory scan
87
+ if recursive:
88
+ # Python 3.12+ supports follow_symlinks in glob/rglob. Fallback for older versions.
89
+ try:
90
+ iterator = p.rglob("*", follow_symlinks=follow_symlinks) # type: ignore[call-arg]
91
+ except TypeError:
92
+ iterator = p.rglob("*") # follow_symlinks not supported
93
+ else:
94
+ iterator = p.iterdir()
95
+
96
+ for fp in iterator:
97
+ if not fp.is_file():
98
+ continue
99
+ total_searched += 1
100
+ categorize_file(fp)
101
+
102
+ return {
103
+ "pod5_paths": sorted(pod5_paths),
104
+ "fast5_paths": sorted(fast5_paths),
105
+ "fastq_paths": sorted(fastq_paths),
106
+ "bam_paths": sorted(bam_paths),
107
+ "h5ad_paths": sorted(h5ad_paths),
108
+ "other_paths": sorted(other_paths),
109
+ "input_is_pod5": len(pod5_paths) > 0,
110
+ "input_is_fast5": len(fast5_paths) > 0,
111
+ "input_is_fastq": len(fastq_paths) > 0,
112
+ "input_is_bam": len(bam_paths) > 0,
113
+ "input_is_h5ad": len(h5ad_paths) > 0,
114
+ "all_files_searched": total_searched,
115
+ }
@@ -6,6 +6,7 @@ import warnings
6
6
  from dataclasses import dataclass, field, asdict
7
7
  from pathlib import Path
8
8
  from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from .discover_input_files import discover_input_files
9
10
 
10
11
  # Optional dependency for YAML handling
11
12
  try:
@@ -593,7 +594,10 @@ class ExperimentConfig:
593
594
  fasta: Optional[str] = None
594
595
  bam_suffix: str = ".bam"
595
596
  recursive_input_search: bool = True
597
+ input_type: Optional[str] = None
598
+ input_files: Optional[List[Path]] = None
596
599
  split_dir: str = "demultiplexed_BAMs"
600
+ split_path: Optional[str] = None
597
601
  strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
598
602
  conversions: List[str] = field(default_factory=lambda: ["unconverted"])
599
603
  fasta_regions_of_interest: Optional[str] = None
@@ -601,11 +605,16 @@ class ExperimentConfig:
601
605
  sample_sheet_mapping_column: Optional[str] = 'Barcode'
602
606
  experiment_name: Optional[str] = None
603
607
  input_already_demuxed: bool = False
608
+ summary_file: Optional[Path] = None
604
609
 
605
610
  # FASTQ input specific
606
611
  fastq_barcode_map: Optional[Dict[str, str]] = None
607
612
  fastq_auto_pairing: bool = True
608
613
 
614
+ # Remove intermediate file options
615
+ delete_intermediate_bams: bool = True
616
+ delete_intermediate_tsvs: bool = True
617
+
609
618
  # Conversion/Deamination file handling
610
619
  delete_intermediate_hdfs: bool = True
611
620
 
@@ -645,6 +654,7 @@ class ExperimentConfig:
645
654
  aligner: str = "minimap2"
646
655
  aligner_args: Optional[List[str]] = None
647
656
  make_bigwigs: bool = False
657
+ make_beds: bool = False
648
658
 
649
659
  # Anndata structure
650
660
  reference_column: Optional[str] = 'Reference_strand'
@@ -656,11 +666,21 @@ class ExperimentConfig:
656
666
 
657
667
  # Preprocessing - Read length and quality filter params
658
668
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
659
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [200, None])
660
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.1])
661
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [20, None])
669
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
670
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
671
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
662
672
  read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
663
673
 
674
+ # Preprocessing - Direct mod detection binarization params
675
+ fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
676
+ binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
677
+ positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
678
+ negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
679
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
680
+ inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
681
+ fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
682
+ output_binary_layer_name: Optional[str] = "binarized_methylation"
683
+
664
684
  # Preprocessing - Read modification filter params
665
685
  read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
666
686
  read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
@@ -680,7 +700,8 @@ class ExperimentConfig:
680
700
  duplicate_detection_hierarchical_linkage: str = "average"
681
701
  duplicate_detection_do_pca: bool = False
682
702
 
683
- # Preprocessing - Complexity analysis params
703
+ # Preprocessing - Position QC
704
+ position_max_nan_threshold: float = 0.1
684
705
 
685
706
  # Basic Analysis - Clustermap params
686
707
  layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
@@ -718,6 +739,9 @@ class ExperimentConfig:
718
739
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
719
740
  hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
720
741
 
742
+ # Pipeline control flow - load adata
743
+ force_redo_load_adata: bool = False
744
+
721
745
  # Pipeline control flow - preprocessing and QC
722
746
  force_redo_preprocessing: bool = False
723
747
  force_reload_sample_sheet: bool = True
@@ -860,6 +884,63 @@ class ExperimentConfig:
860
884
  if merged.get("experiment_name") is None and date_str:
861
885
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
862
886
 
887
+ # Input file types and path handling
888
+ input_data_path = Path(merged['input_data_path'])
889
+
890
+ # Detect the input filetype
891
+ if input_data_path.is_file():
892
+ suffix = input_data_path.suffix.lower()
893
+ suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
894
+
895
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
896
+ if any(s in ['.pod5', '.p5'] for s in suffixes):
897
+ input_type = "pod5"
898
+ input_files = [Path(input_data_path)]
899
+ elif any(s in ['.fast5', '.f5'] for s in suffixes):
900
+ input_type = "fast5"
901
+ input_files = [Path(input_data_path)]
902
+ elif any(s in ['.fastq', '.fq'] for s in suffixes):
903
+ input_type = "fastq"
904
+ input_files = [Path(input_data_path)]
905
+ elif any(s in ['.bam'] for s in suffixes):
906
+ input_type = "bam"
907
+ input_files = [Path(input_data_path)]
908
+ elif any(s in ['.h5ad', ".h5"] for s in suffixes):
909
+ input_type = "h5ad"
910
+ input_files = [Path(input_data_path)]
911
+ else:
912
+ print("Error detecting input file type")
913
+
914
+ elif input_data_path.is_dir():
915
+ found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
916
+
917
+ if found["input_is_pod5"]:
918
+ input_type = "pod5"
919
+ input_files = found["pod5_paths"]
920
+ elif found["input_is_fast5"]:
921
+ input_type = "fast5"
922
+ input_files = found["fast5_paths"]
923
+ elif found["input_is_fastq"]:
924
+ input_type = "fastq"
925
+ input_files = found["fastq_paths"]
926
+ elif found["input_is_bam"]:
927
+ input_type = "bam"
928
+ input_files = found["bam_paths"]
929
+ elif found["input_is_h5ad"]:
930
+ input_type = "h5ad"
931
+ input_files = found["h5ad_paths"]
932
+
933
+ print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
934
+
935
+ # summary file output path
936
+ output_dir = Path(merged['output_directory'])
937
+ summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
938
+ summary_file = output_dir / summary_file_basename
939
+
940
+ # Demultiplexing output path
941
+ split_dir = merged.get("split_dir", "demultiplexed_BAMs")
942
+ split_path = output_dir / split_dir
943
+
863
944
  # final normalization
864
945
  if "strands" in merged:
865
946
  merged["strands"] = _parse_list(merged["strands"])
@@ -936,13 +1017,15 @@ class ExperimentConfig:
936
1017
  hmm_methbases = list(hmm_methbases)
937
1018
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
938
1019
 
939
-
940
1020
  # instantiate dataclass
941
1021
  instance = cls(
942
1022
  smf_modality = merged.get("smf_modality"),
943
- input_data_path = merged.get("input_data_path"),
1023
+ input_data_path = input_data_path,
944
1024
  recursive_input_search = merged.get("recursive_input_search"),
945
- output_directory = merged.get("output_directory"),
1025
+ input_type = input_type,
1026
+ input_files = input_files,
1027
+ output_directory = output_dir,
1028
+ summary_file = summary_file,
946
1029
  fasta = merged.get("fasta"),
947
1030
  sequencer = merged.get("sequencer"),
948
1031
  model_dir = merged.get("model_dir"),
@@ -950,7 +1033,8 @@ class ExperimentConfig:
950
1033
  fastq_barcode_map = merged.get("fastq_barcode_map"),
951
1034
  fastq_auto_pairing = merged.get("fastq_auto_pairing"),
952
1035
  bam_suffix = merged.get("bam_suffix", ".bam"),
953
- split_dir = merged.get("split_dir", "demultiplexed_BAMs"),
1036
+ split_dir = split_dir,
1037
+ split_path = split_path,
954
1038
  strands = merged.get("strands", ["bottom","top"]),
955
1039
  conversions = merged.get("conversions", ["unconverted"]),
956
1040
  fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
@@ -963,14 +1047,17 @@ class ExperimentConfig:
963
1047
  threads = merged.get("threads"),
964
1048
  sample_sheet_path = merged.get("sample_sheet_path"),
965
1049
  sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1050
+ delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
1051
+ delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
966
1052
  aligner = merged.get("aligner", "minimap2"),
967
1053
  aligner_args = merged.get("aligner_args", None),
968
1054
  device = merged.get("device", "auto"),
969
1055
  make_bigwigs = merged.get("make_bigwigs", False),
1056
+ make_beds = merged.get("make_beds", False),
970
1057
  delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
971
1058
  mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
972
1059
  enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
973
- conversion_types = merged.get("conversion_types", ["5mC"]),
1060
+ conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
974
1061
  filter_threshold = merged.get("filter_threshold", 0.8),
975
1062
  m6A_threshold = merged.get("m6A_threshold", 0.7),
976
1063
  m5C_threshold = merged.get("m5C_threshold", 0.7),
@@ -983,6 +1070,14 @@ class ExperimentConfig:
983
1070
  reference_column = merged.get("reference_column", 'Reference_strand'),
984
1071
  sample_column = merged.get("sample_column", 'Barcode'),
985
1072
  sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1073
+ fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1074
+ binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1075
+ positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1076
+ negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1077
+ infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1078
+ inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1079
+ fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1080
+ output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
986
1081
  layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
987
1082
  layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
988
1083
  umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
@@ -1008,9 +1103,9 @@ class ExperimentConfig:
1008
1103
  accessible_patches = merged.get("accessible_patches", None),
1009
1104
  cpg = merged.get("cpg", None),
1010
1105
  read_coord_filter = merged.get("read_coord_filter", [None, None]),
1011
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [200, None]),
1012
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.4, 1.1]),
1013
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [20, None]),
1106
+ read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1107
+ read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1108
+ read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1014
1109
  read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1015
1110
  read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1016
1111
  read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
@@ -1026,10 +1121,12 @@ class ExperimentConfig:
1026
1121
  duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1027
1122
  duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1028
1123
  duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1124
+ position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1029
1125
  correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1030
1126
  correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1031
1127
  correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1032
1128
  hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
1129
+ force_redo_load_adata = merged.get("force_redo_load_adata", False),
1033
1130
  force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1034
1131
  force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1035
1132
  bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
@@ -1,14 +1,20 @@
from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
from .basecalling import canoncall, modcall
from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
from .converted_BAM_to_adata import converted_BAM_to_adata
from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
from .modkit_functions import extract_mods, make_modbed, modQC
from .modkit_extract_to_adata import modkit_extract_to_adata
from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
from .run_multiqc import run_multiqc

# Public API: every re-exported helper above (underscore-prefixed names stay
# private).  Previously __all__ listed only a handful of names, so
# `from smftools.informatics import *` silently dropped most of the API.
__all__ = [
    "align_and_sort_BAM",
    "bam_qc",
    "concatenate_fastqs_to_bam",
    "count_aligned_reads",
    "demux_and_index_BAM",
    "extract_base_identities",
    "extract_read_features_from_bam",
    "extract_readnames_from_bam",
    "separate_bam_by_bc",
    "split_and_index_BAM",
    "canoncall",
    "modcall",
    "aligned_BAM_to_bed",
    "extract_read_lengths_from_bed",
    "converted_BAM_to_adata",
    "find_conversion_sites",
    "generate_converted_FASTA",
    "get_chromosome_lengths",
    "get_native_references",
    "index_fasta",
    "subsample_fasta_from_bed",
    "add_demux_type_annotation",
    "add_read_length_and_mapping_qc",
    "extract_mods",
    "make_modbed",
    "modQC",
    "modkit_extract_to_adata",
    "one_hot_encode",
    "one_hot_decode",
    "ohe_layers_decode",
    "ohe_batching",
    "basecall_pod5s",
    "fast5_to_pod5",
    "subsample_pod5",
    "run_multiqc",
]
@@ -0,0 +1,43 @@
from pathlib import Path
import subprocess
from typing import Union, List

def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 files (single file, list of files, or directory)
    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.

    Parameters
    ----------
    fast5_dir:
        A FAST5 file, a list/tuple of FAST5 files, or a directory whose
        immediate ``*.fast5`` children are converted (non-recursive).
    output_pod5:
        Destination path passed to ``pod5 convert fast5 --output``.

    Raises
    ------
    FileNotFoundError
        If the input path does not exist, an empty list/tuple is given, or a
        directory contains no FAST5 files.
    subprocess.CalledProcessError
        If the ``pod5`` CLI exits with a non-zero status (check=True).
    """
    output = str(output_pod5)  # ensure string for the CLI argument

    def _convert(fast5_paths: List[str]) -> None:
        # shell=False list form; check=True surfaces CLI failures as exceptions.
        cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output]
        subprocess.run(cmd, check=True)

    # 1) User gives a list/tuple of FAST5 files.
    if isinstance(fast5_dir, (list, tuple)):
        if not fast5_dir:
            # Fail fast: previously an empty list invoked the CLI with no
            # input files, producing an opaque CLI error.
            raise FileNotFoundError("No FAST5 files provided")
        _convert([str(Path(f)) for f in fast5_dir])
        return

    # Ensure Path object
    p = Path(fast5_dir)

    # 2) User gives a single file.
    if p.is_file():
        _convert([str(p)])
        return

    # 3) User gives a directory -> collect its FAST5 children.
    if p.is_dir():
        fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
        if not fast5_paths:
            raise FileNotFoundError(f"No FAST5 files found in {p}")
        _convert(fast5_paths)
        return

    raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
@@ -0,0 +1,71 @@
1
+ # from .align_and_sort_BAM import align_and_sort_BAM
2
+ # from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ # from .bam_qc import bam_qc
4
+ # from .bed_to_bigwig import bed_to_bigwig
5
+ # from .binarize_converted_base_identities import binarize_converted_base_identities
6
+ # from .canoncall import canoncall
7
+ # from .complement_base_list import complement_base_list
8
+ # from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
9
+ # from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
10
+ # from .count_aligned_reads import count_aligned_reads
11
+ # from .demux_and_index_BAM import demux_and_index_BAM
12
+ # from .discover_input_files import *
13
+ # from .extract_base_identities import extract_base_identities
14
+ # from .extract_mods import extract_mods
15
+ # from .extract_read_features_from_bam import extract_read_features_from_bam
16
+ # from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
17
+ # from .extract_readnames_from_BAM import extract_readnames_from_BAM
18
+ # from .find_conversion_sites import find_conversion_sites
19
+ # from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
20
+ # from .get_chromosome_lengths import get_chromosome_lengths
21
+ # from .get_native_references import get_native_references
22
+ # from .index_fasta import index_fasta
23
+ # from .make_modbed import make_modbed
24
+ # from .modcall import modcall
25
+ # from .modkit_extract_to_adata import modkit_extract_to_adata
26
+ # from .modQC import modQC
27
+ # from .one_hot_encode import one_hot_encode
28
+ # from .ohe_batching import ohe_batching
29
+ # from .one_hot_decode import one_hot_decode
30
+ # from .ohe_layers_decode import ohe_layers_decode
31
+ # from .plot_bed_histograms import plot_bed_histograms
32
+ # from .run_multiqc import run_multiqc
33
+ # from .separate_bam_by_bc import separate_bam_by_bc
34
+ # from .split_and_index_BAM import split_and_index_BAM
35
+
36
+ # __all__ = [
37
+ # "align_and_sort_BAM",
38
+ # "aligned_BAM_to_bed",
39
+ # "bam_qc",
40
+ # "bed_to_bigwig",
41
+ # "binarize_converted_base_identities",
42
+ # "canoncall",
43
+ # "complement_base_list",
44
+ # "converted_BAM_to_adata_II",
45
+ # "concatenate_fastqs_to_bam",
46
+ # "count_aligned_reads",
47
+ # "demux_and_index_BAM",
48
+ # "extract_base_identities",
49
+ # "extract_mods",
50
+ # "extract_read_features_from_bam",
51
+ # "extract_read_lengths_from_bed",
52
+ # "extract_readnames_from_BAM",
53
+ # "find_conversion_sites",
54
+ # "convert_FASTA_record",
55
+ # "generate_converted_FASTA",
56
+ # "get_chromosome_lengths",
57
+ # "get_native_references",
58
+ # "index_fasta",
59
+ # "make_modbed",
60
+ # "modcall",
61
+ # "modkit_extract_to_adata",
62
+ # "modQC",
63
+ # "one_hot_encode",
64
+ # "ohe_batching",
65
+ # "one_hot_decode",
66
+ # "ohe_layers_decode",
67
+ # "plot_bed_histograms",
68
+ # "run_multiqc",
69
+ # "separate_bam_by_bc",
70
+ # "split_and_index_BAM"
71
+ # ]
@@ -0,0 +1,126 @@
1
+ from pathlib import Path
2
+ import os
3
+ import subprocess
4
+ from typing import List, Optional, Union
5
+ import pysam
6
+
7
def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
    """
    Minimal BAM->FASTQ using pysam. Writes every record in file order
    (until_eof=True, so unmapped/unaligned reads are included).

    Parameters:
        bam_path: Input BAM (SQ header optional: check_sq=False).
        fastq_path: Output FASTQ path (overwritten if it exists).
    """
    bam_path = str(bam_path)
    fastq_path = str(fastq_path)
    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w") as fq:
        for r in bam.fetch(until_eof=True):
            # Skip secondary/supplementary if you want (optional):
            # if r.is_secondary or r.is_supplementary: continue
            name = r.query_name
            seq = r.query_sequence or ""
            # r.qual is None when the BAM stores '*' qualities; an empty
            # quality line with a non-empty sequence makes the FASTQ record
            # malformed, so pad with the minimum Phred character instead.
            qual = r.qual or ("!" * len(seq))
            fq.write(f"@{name}\n{seq}\n+\n{qual}\n")
21
+
22
def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
    """Coordinate-sort a BAM via pysam's samtools-style sort wrapper."""
    sort_args = ["-@", str(threads)] if threads else []
    sort_args.extend(["-o", str(out_bam), str(in_bam)])
    pysam.sort(*sort_args)
29
+
30
def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
    """Index a BAM via pysam.index (accepts samtools-style flags)."""
    # pysam.index supports samtools-style args
    extra = ("-@", str(threads)) if threads else ()
    pysam.index(*extra, str(bam_path))
37
+
38
def align_and_sort_BAM(fasta,
                       input,
                       bam_suffix='.bam',
                       output_directory='aligned_outputs',
                       make_bigwigs=False,
                       threads=None,
                       aligner='minimap2',
                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
    """
    A wrapper for aligning a basecalled BAM to a reference, then
    coordinate-sorting and indexing the result with pysam.

    Parameters:
        fasta (Path): File path to the reference genome to align to.
        input (Path): File path to the basecalled BAM to align.
        bam_suffix (str): The suffix to use for the BAM files.
        output_directory (Path): Directory to write the aligned outputs into.
        make_bigwigs (bool): Accepted for interface compatibility; not used here.
        threads (int | None): Number of additional threads for aligner/sort/index.
        aligner (str): Aligner to use. minimap2 and dorado options.
        aligner_args (list): Optional CLI parameters passed to the aligner.

    Returns:
        None
        Writes: 1) an aligned BAM (<stem>_aligned<bam_suffix>), 2) an
        aligned+sorted BAM (<stem>_aligned_sorted<bam_suffix>), 3) an index
        for the sorted BAM.

    Raises:
        subprocess.CalledProcessError: If the aligner exits non-zero
            (previously such failures were silently ignored).
    """
    input_as_fastq = input.with_name(input.stem + '.fastq')
    output_path_minus_suffix = output_directory / input.stem

    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
    aligned_output = aligned_BAM.with_suffix(bam_suffix)
    aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)

    # CLI tools expect the thread count as a string.
    if threads:
        threads = str(threads)

    if aligner == 'minimap2':
        # minimap2 cannot read BAM directly, so round-trip through FASTQ.
        print(f"Converting BAM to FASTQ: {input}")
        _bam_to_fastq_with_pysam(input, input_as_fastq)
        print(f"Aligning FASTQ to Reference: {input_as_fastq}")
        minimap_command = ['minimap2'] + aligner_args
        if threads:
            minimap_command += ['-t', threads]
        minimap_command += [str(fasta), str(input_as_fastq)]
        # Context manager closes the output handle (previously leaked);
        # check=True surfaces aligner failures instead of ignoring them.
        with open(aligned_output, "w") as sam_out:
            subprocess.run(minimap_command, stdout=sam_out, check=True)
        os.remove(input_as_fastq)  # temporary FASTQ no longer needed

    elif aligner == 'dorado':
        # Run dorado aligner directly on the BAM.
        print(f"Aligning BAM to Reference: {input}")
        alignment_command = ["dorado", "aligner"]
        if threads:
            alignment_command += ["-t", threads]
        alignment_command += aligner_args + [str(fasta), str(input)]
        with open(aligned_output, "wb") as bam_out:
            subprocess.run(alignment_command, stdout=bam_out, check=True)

    else:
        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
        return

    # --- Sort & Index with pysam ---
    print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
    _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)

    print(f"[pysam] Indexing: {aligned_sorted_output}")
    _index_bam_with_pysam(aligned_sorted_output, threads=threads)