smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +2 -0
  7. smftools/cli/hmm_adata.py +7 -2
  8. smftools/cli/load_adata.py +130 -98
  9. smftools/cli/preprocess_adata.py +2 -0
  10. smftools/cli/spatial_adata.py +5 -1
  11. smftools/cli_entry.py +26 -1
  12. smftools/config/__init__.py +2 -0
  13. smftools/config/default.yaml +4 -1
  14. smftools/config/experiment_config.py +6 -0
  15. smftools/datasets/__init__.py +2 -0
  16. smftools/hmm/HMM.py +9 -3
  17. smftools/hmm/__init__.py +24 -13
  18. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  19. smftools/hmm/archived/calculate_distances.py +2 -0
  20. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  21. smftools/hmm/archived/train_hmm.py +2 -0
  22. smftools/hmm/call_hmm_peaks.py +5 -2
  23. smftools/hmm/display_hmm.py +4 -1
  24. smftools/hmm/hmm_readwrite.py +7 -2
  25. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  26. smftools/informatics/__init__.py +53 -34
  27. smftools/informatics/archived/bam_conversion.py +2 -0
  28. smftools/informatics/archived/bam_direct.py +2 -0
  29. smftools/informatics/archived/basecall_pod5s.py +2 -0
  30. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  31. smftools/informatics/archived/conversion_smf.py +2 -0
  32. smftools/informatics/archived/deaminase_smf.py +1 -0
  33. smftools/informatics/archived/direct_smf.py +2 -0
  34. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  35. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  36. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  37. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  38. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  39. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  40. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  41. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  42. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  43. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  44. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  45. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  48. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  49. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  50. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  51. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  52. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  53. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  54. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  55. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  56. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  57. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  58. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  59. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  60. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  61. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  62. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  63. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  64. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  65. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  66. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  67. smftools/informatics/archived/subsample_pod5.py +2 -0
  68. smftools/informatics/bam_functions.py +737 -170
  69. smftools/informatics/basecalling.py +2 -0
  70. smftools/informatics/bed_functions.py +271 -61
  71. smftools/informatics/binarize_converted_base_identities.py +3 -0
  72. smftools/informatics/complement_base_list.py +2 -0
  73. smftools/informatics/converted_BAM_to_adata.py +66 -22
  74. smftools/informatics/fasta_functions.py +94 -10
  75. smftools/informatics/h5ad_functions.py +8 -2
  76. smftools/informatics/modkit_extract_to_adata.py +16 -6
  77. smftools/informatics/modkit_functions.py +2 -0
  78. smftools/informatics/ohe.py +2 -0
  79. smftools/informatics/pod5_functions.py +3 -2
  80. smftools/machine_learning/__init__.py +22 -6
  81. smftools/machine_learning/data/__init__.py +2 -0
  82. smftools/machine_learning/data/anndata_data_module.py +18 -4
  83. smftools/machine_learning/data/preprocessing.py +2 -0
  84. smftools/machine_learning/evaluation/__init__.py +2 -0
  85. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  86. smftools/machine_learning/evaluation/evaluators.py +14 -9
  87. smftools/machine_learning/inference/__init__.py +2 -0
  88. smftools/machine_learning/inference/inference_utils.py +2 -0
  89. smftools/machine_learning/inference/lightning_inference.py +6 -1
  90. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  91. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  92. smftools/machine_learning/models/__init__.py +2 -0
  93. smftools/machine_learning/models/base.py +7 -2
  94. smftools/machine_learning/models/cnn.py +7 -2
  95. smftools/machine_learning/models/lightning_base.py +16 -11
  96. smftools/machine_learning/models/mlp.py +5 -1
  97. smftools/machine_learning/models/positional.py +7 -2
  98. smftools/machine_learning/models/rnn.py +5 -1
  99. smftools/machine_learning/models/sklearn_models.py +14 -9
  100. smftools/machine_learning/models/transformer.py +7 -2
  101. smftools/machine_learning/models/wrappers.py +6 -2
  102. smftools/machine_learning/training/__init__.py +2 -0
  103. smftools/machine_learning/training/train_lightning_model.py +13 -3
  104. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  105. smftools/machine_learning/utils/__init__.py +2 -0
  106. smftools/machine_learning/utils/device.py +5 -1
  107. smftools/machine_learning/utils/grl.py +5 -1
  108. smftools/optional_imports.py +31 -0
  109. smftools/plotting/__init__.py +32 -31
  110. smftools/plotting/autocorrelation_plotting.py +9 -5
  111. smftools/plotting/classifiers.py +16 -4
  112. smftools/plotting/general_plotting.py +6 -3
  113. smftools/plotting/hmm_plotting.py +12 -2
  114. smftools/plotting/position_stats.py +15 -7
  115. smftools/plotting/qc_plotting.py +6 -1
  116. smftools/preprocessing/__init__.py +35 -37
  117. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  118. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  119. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  120. smftools/preprocessing/archived/preprocessing.py +2 -0
  121. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  122. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  123. smftools/preprocessing/calculate_complexity_II.py +4 -1
  124. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  125. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  126. smftools/preprocessing/calculate_position_Youden.py +9 -2
  127. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  128. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  129. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  130. smftools/preprocessing/make_dirs.py +2 -1
  131. smftools/preprocessing/min_non_diagonal.py +2 -0
  132. smftools/preprocessing/recipes.py +2 -0
  133. smftools/tools/__init__.py +26 -18
  134. smftools/tools/archived/apply_hmm.py +2 -0
  135. smftools/tools/archived/classifiers.py +2 -0
  136. smftools/tools/archived/classify_methylated_features.py +2 -0
  137. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  138. smftools/tools/archived/subset_adata_v1.py +2 -0
  139. smftools/tools/archived/subset_adata_v2.py +2 -0
  140. smftools/tools/calculate_umap.py +3 -1
  141. smftools/tools/cluster_adata_on_methylation.py +7 -1
  142. smftools/tools/position_stats.py +17 -27
  143. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
  144. smftools-0.3.0.dist-info/RECORD +182 -0
  145. smftools-0.2.5.dist-info/RECORD +0 -181
  146. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  147. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  148. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py CHANGED
@@ -1,20 +1,52 @@
1
1
  """smftools"""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
4
6
  import warnings
7
+ from importlib import import_module
5
8
  from importlib.metadata import version
9
+ from typing import TYPE_CHECKING
6
10
 
7
- from . import cli, config, datasets, hmm
8
- from . import informatics as inform
9
- from . import machine_learning as ml
10
- from . import plotting as pl
11
- from . import preprocessing as pp
12
- from . import tools as tl
13
- from .readwrite import adata_to_df, merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
11
+ from .readwrite import adata_to_df, safe_read_h5ad, safe_write_h5ad
14
12
 
15
13
  package_name = "smftools"
16
14
  __version__ = version(package_name)
17
15
 
16
+ if TYPE_CHECKING:
17
+ from smftools import (
18
+ cli,
19
+ config,
20
+ datasets,
21
+ hmm,
22
+ informatics,
23
+ machine_learning,
24
+ plotting,
25
+ preprocessing,
26
+ tools,
27
+ )
28
+
29
+ _LAZY_MODULES = {
30
+ "cli": "smftools.cli",
31
+ "config": "smftools.config",
32
+ "datasets": "smftools.datasets",
33
+ "hmm": "smftools.hmm",
34
+ "inform": "smftools.informatics",
35
+ "ml": "smftools.machine_learning",
36
+ "pl": "smftools.plotting",
37
+ "pp": "smftools.preprocessing",
38
+ "tl": "smftools.tools",
39
+ }
40
+
41
+
42
def __getattr__(name: str):
    """Resolve a lazily exposed submodule alias on first attribute access.

    Looks ``name`` up in ``_LAZY_MODULES``; when present, imports the mapped
    module, stores it in this module's globals under ``name`` and returns it.

    Raises:
        AttributeError: if ``name`` is not a key of ``_LAZY_MODULES``.
    """
    target = _LAZY_MODULES.get(name)
    if target is None:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    loaded = import_module(target)
    # Store the module in globals so later lookups find it directly.
    globals()[name] = loaded
    return loaded
48
+
49
+
18
50
  __all__ = [
19
51
  "adata_to_df",
20
52
  "inform",
smftools/_settings.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Union
3
5
 
smftools/_version.py CHANGED
@@ -1 +1,3 @@
1
- __version__ = "0.2.5"
1
+ from __future__ import annotations
2
+
3
+ __version__ = "0.3.0"
smftools/cli/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def flow_I(config_path):
2
4
  """
3
5
  High-level function to call for converting raw sequencing data to an adata object.
smftools/cli/helpers.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from dataclasses import dataclass
2
4
  from pathlib import Path
3
5
 
smftools/cli/hmm_adata.py CHANGED
@@ -3,18 +3,23 @@ from __future__ import annotations
3
3
  import copy
4
4
  from dataclasses import dataclass
5
5
  from pathlib import Path
6
- from typing import Any, List, Optional, Sequence, Tuple, Union
6
+ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
7
7
 
8
8
  import numpy as np
9
- import torch
10
9
 
11
10
  from smftools.logging_utils import get_logger
11
+ from smftools.optional_imports import require
12
12
 
13
13
  # FIX: import _to_dense_np to avoid NameError
14
14
  from ..hmm.HMM import _safe_int_coords, _to_dense_np, create_hmm, normalize_hmm_feature_sets
15
15
 
16
16
  logger = get_logger(__name__)
17
17
 
18
+ if TYPE_CHECKING:
19
+ import torch as torch_types
20
+
21
+ torch = require("torch", extra="torch", purpose="HMM CLI")
22
+
18
23
  # =============================================================================
19
24
  # Helpers: extracting training arrays
20
25
  # =============================================================================
@@ -1,7 +1,11 @@
1
+ from __future__ import annotations
2
+
1
3
  import shutil
2
4
  from pathlib import Path
3
5
  from typing import Iterable, Union
4
6
 
7
+ import numpy as np
8
+
5
9
  from smftools.logging_utils import get_logger
6
10
 
7
11
  from .helpers import AdataPaths
@@ -76,6 +80,96 @@ def delete_tsvs(
76
80
  logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
77
81
 
78
82
 
83
def load_adata(config_path: str):
    """
    CLI-facing entry point for the load stage.

    Steps performed:

    - Reads the config CSV into an ExperimentConfig
    - Computes the canonical paths for every downstream AnnData stage
    - Registers experiment metadata and those paths in the summary CSV
    - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
    - Otherwise calls the core pipeline to build the raw AnnData

    Returns
    -------
    adata : anndata.AnnData | None
        Newly created AnnData object, or None when loading was skipped because
        an AnnData for this or a later stage already exists.
    adata_path : pathlib.Path
        Path to the "current" AnnData that should be used downstream.
    cfg : ExperimentConfig
        Config object for downstream steps.
    """
    from datetime import datetime
    from importlib import resources

    from ..config import ExperimentConfig, LoadExperimentConfig
    from ..readwrite import add_or_update_column_in_csv, make_dirs
    from .helpers import get_adata_paths

    today = datetime.today().strftime("%y%m%d")

    # --- 1) Build cfg from the user config plus the packaged defaults ---
    config_loader = LoadExperimentConfig(config_path)
    packaged_defaults = resources.files("smftools").joinpath("config")
    cfg, _report = ExperimentConfig.from_var_dict(
        config_loader.var_dict, date_str=today, defaults_dir=packaged_defaults
    )

    # The base output directory must exist before anything is written.
    make_dirs([cfg.output_directory])

    # --- 2) Compute stage paths and record everything in the summary CSV ---
    paths = get_adata_paths(cfg)

    summary_columns = (
        # experiment-level metadata
        ("experiment_name", cfg.experiment_name),
        ("config_path", config_path),
        ("input_data_path", cfg.input_data_path),
        ("input_files", [cfg.input_files]),
        # AnnData stage paths
        ("load_adata", paths.raw),
        ("pp_adata", paths.pp),
        ("pp_dedup_adata", paths.pp_dedup),
        ("spatial_adata", paths.spatial),
        ("hmm_adata", paths.hmm),
    )
    for column, value in summary_columns:
        add_or_update_column_in_csv(cfg.summary_file, column, value)

    # --- 3) Stage skipping: the latest existing stage wins ---
    if not getattr(cfg, "force_redo_load_adata", False):
        stage_checks = (
            (
                paths.hmm,
                f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load",
            ),
            (
                paths.spatial,
                f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load",
            ),
            (
                paths.pp_dedup,
                f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
                f"Skipping smftools load",
            ),
            (
                paths.pp,
                f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load",
            ),
            (
                paths.raw,
                f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load",
            ),
        )
        for stage_path, skip_message in stage_checks:
            if stage_path.exists():
                logger.debug(skip_message)
                return None, stage_path, cfg

    # No stage output exists (or a redo was forced): run the full load pipeline.
    built_adata, built_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
    return built_adata, built_path, cfg
171
+
172
+
79
173
  def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
80
174
  """
81
175
  Core load pipeline.
@@ -105,9 +199,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
105
199
  cfg : ExperimentConfig
106
200
  (Same object, possibly with some fields updated, e.g. fasta path.)
107
201
  """
108
- from pathlib import Path
109
-
110
- import numpy as np
111
202
 
112
203
  from ..informatics.bam_functions import (
113
204
  align_and_sort_BAM,
@@ -219,6 +310,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
219
310
  rg_sample_field=None,
220
311
  progress=False,
221
312
  auto_pair=cfg.fastq_auto_pairing,
313
+ samtools_backend=cfg.samtools_backend,
222
314
  )
223
315
 
224
316
  logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
@@ -384,7 +476,14 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
384
476
  else:
385
477
  logger.info("Making bed files from the aligned and sorted BAM file")
386
478
  aligned_BAM_to_bed(
387
- aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
479
+ aligned_sorted_output,
480
+ cfg.output_directory,
481
+ fasta,
482
+ cfg.make_bigwigs,
483
+ cfg.threads,
484
+ samtools_backend=cfg.samtools_backend,
485
+ bedtools_backend=cfg.bedtools_backend,
486
+ bigwig_backend=cfg.bigwig_backend,
388
487
  )
389
488
  ########################################################################################################################
390
489
 
@@ -404,7 +503,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
404
503
  else:
405
504
  make_dirs([cfg.split_path])
406
505
  logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
407
- all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
506
+ all_bam_files = split_and_index_BAM(
507
+ aligned_sorted_BAM,
508
+ cfg.split_path,
509
+ cfg.bam_suffix,
510
+ samtools_backend=cfg.samtools_backend,
511
+ )
408
512
 
409
513
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
410
514
  bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
@@ -489,7 +593,16 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
489
593
  else:
490
594
  logger.info("Making BED files from BAM files for each sample")
491
595
  for bam in bam_files:
492
- aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
596
+ aligned_BAM_to_bed(
597
+ bam,
598
+ cfg.split_path,
599
+ fasta,
600
+ cfg.make_bigwigs,
601
+ cfg.threads,
602
+ samtools_backend=cfg.samtools_backend,
603
+ bedtools_backend=cfg.bedtools_backend,
604
+ bigwig_backend=cfg.bigwig_backend,
605
+ )
493
606
  ########################################################################################################################
494
607
 
495
608
  ################################### 6) SAMTools based BAM QC ######################################################################
@@ -501,7 +614,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
501
614
  else:
502
615
  make_dirs([bam_qc_dir])
503
616
  logger.info("Performing BAM QC")
504
- bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
617
+ bam_qc(
618
+ bam_files,
619
+ bam_qc_dir,
620
+ cfg.threads,
621
+ modality=cfg.smf_modality,
622
+ samtools_backend=cfg.samtools_backend,
623
+ )
505
624
  ########################################################################################################################
506
625
 
507
626
  ################################### 7) AnnData loading ######################################################################
@@ -529,6 +648,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
529
648
  deaminase_footprinting,
530
649
  delete_intermediates=cfg.delete_intermediate_hdfs,
531
650
  double_barcoded_path=double_barcoded_path,
651
+ samtools_backend=cfg.samtools_backend,
532
652
  )
533
653
  else:
534
654
  if mod_bed_dir.is_dir():
@@ -584,6 +704,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
584
704
  cfg.delete_batch_hdfs,
585
705
  cfg.threads,
586
706
  double_barcoded_path,
707
+ cfg.samtools_backend,
587
708
  )
588
709
  if cfg.delete_intermediate_tsvs:
589
710
  delete_tsvs(mod_tsv_dir)
@@ -604,6 +725,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
604
725
  extract_read_features_from_bam_callable=extract_read_features_from_bam,
605
726
  bypass=cfg.bypass_add_read_length_and_mapping_qc,
606
727
  force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
728
+ samtools_backend=cfg.samtools_backend,
607
729
  )
608
730
 
609
731
  raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
@@ -639,7 +761,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
639
761
  # multiqc ###
640
762
  mqc_dir = cfg.split_path / "multiqc"
641
763
  if mqc_dir.is_dir():
642
- logger.debug(f"{mqc_dir} already exists, skipping multiqc")
764
+ logger.info(f"{mqc_dir} already exists, skipping multiqc")
643
765
  else:
644
766
  logger.info("Running multiqc")
645
767
  run_multiqc(cfg.split_path, mqc_dir)
@@ -665,93 +787,3 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
665
787
  ########################################################################################################################
666
788
 
667
789
  return raw_adata, raw_adata_path, cfg
668
-
669
-
670
- def load_adata(config_path: str):
671
- """
672
- CLI-facing wrapper for the load pipeline.
673
-
674
- - Reads config CSV into ExperimentConfig
675
- - Computes canonical paths for all downstream AnnData stages
676
- - Registers those in the summary CSV
677
- - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
678
- - If needed, calls the core pipeline to actually build the raw AnnData
679
-
680
- Returns
681
- -------
682
- adata : anndata.AnnData | None
683
- Newly created AnnData object, or None if we skipped because a later-stage
684
- AnnData already exists.
685
- adata_path : pathlib.Path
686
- Path to the "current" AnnData that should be used downstream.
687
- cfg : ExperimentConfig
688
- Config object for downstream steps.
689
- """
690
- from datetime import datetime
691
- from importlib import resources
692
-
693
- from ..config import ExperimentConfig, LoadExperimentConfig
694
- from ..readwrite import add_or_update_column_in_csv, make_dirs
695
- from .helpers import get_adata_paths
696
-
697
- date_str = datetime.today().strftime("%y%m%d")
698
-
699
- # -----------------------------
700
- # 1) Load config into cfg
701
- # -----------------------------
702
- loader = LoadExperimentConfig(config_path)
703
- defaults_dir = resources.files("smftools").joinpath("config")
704
- cfg, report = ExperimentConfig.from_var_dict(
705
- loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
706
- )
707
-
708
- # Ensure base output dir
709
- make_dirs([cfg.output_directory])
710
-
711
- # -----------------------------
712
- # 2) Compute and register paths
713
- # -----------------------------
714
- paths = get_adata_paths(cfg)
715
-
716
- # experiment-level metadata in summary CSV
717
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
718
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
719
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
720
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
721
-
722
- # AnnData stage paths
723
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
724
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
725
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
726
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
727
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
728
-
729
- # -----------------------------
730
- # 3) Stage skipping logic
731
- # -----------------------------
732
- if not getattr(cfg, "force_redo_load_adata", False):
733
- if paths.hmm.exists():
734
- logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
735
- return None, paths.hmm, cfg
736
- if paths.spatial.exists():
737
- logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
738
- return None, paths.spatial, cfg
739
- if paths.pp_dedup.exists():
740
- logger.debug(
741
- f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
742
- f"Skipping smftools load"
743
- )
744
- return None, paths.pp_dedup, cfg
745
- if paths.pp.exists():
746
- logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
747
- return None, paths.pp, cfg
748
- if paths.raw.exists():
749
- logger.debug(
750
- f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
751
- )
752
- return None, paths.raw, cfg
753
-
754
- # If we get here, we actually want to run the full load pipeline
755
- adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
756
-
757
- return adata, adata_path, cfg
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Optional, Tuple
3
5
 
@@ -1,9 +1,12 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Optional, Tuple
3
5
 
4
6
  import anndata as ad
5
7
 
6
8
  from smftools.logging_utils import get_logger
9
+ from smftools.optional_imports import require
7
10
 
8
11
  logger = get_logger(__name__)
9
12
 
@@ -153,7 +156,8 @@ def spatial_adata_core(
153
156
 
154
157
  import numpy as np
155
158
  import pandas as pd
156
- import scanpy as sc
159
+
160
+ sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
157
161
 
158
162
  from ..metadata import record_smftools_metadata
159
163
  from ..plotting import (
smftools/cli_entry.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  from pathlib import Path
3
5
  from typing import Sequence
@@ -10,10 +12,32 @@ from .cli.load_adata import load_adata
10
12
  from .cli.preprocess_adata import preprocess_adata
11
13
  from .cli.spatial_adata import spatial_adata
12
14
  from .informatics.pod5_functions import subsample_pod5
13
- from .logging_utils import setup_logging
15
+ from .logging_utils import get_logger, setup_logging
14
16
  from .readwrite import concatenate_h5ads
15
17
 
16
18
 
19
def _configure_multiprocessing() -> None:
    """Select the multiprocessing start method for the CLI process.

    On ``win32`` the method is set to ``spawn``. On other platforms
    ``forkserver`` is tried first, and ``spawn`` is used if that attempt raises
    ``ValueError``. A ``RuntimeError`` from ``set_start_method`` is not
    propagated; it is reported as a warning instead.
    """
    import multiprocessing as mp
    import sys

    log = get_logger(__name__)
    prefer_forkserver = sys.platform != "win32"

    try:
        if prefer_forkserver:
            try:
                mp.set_start_method("forkserver")
            except ValueError:
                # forkserver was rejected; fall through to spawn below
                pass
            else:
                log.debug("Setting multiprocessing start method to forkserver")
                return

        mp.set_start_method("spawn")
        log.debug("Setting multiprocessing start method to spawn")
    except RuntimeError:
        log.warning("Could not set multiprocessing start method")
39
+
40
+
17
41
  @click.group()
18
42
  @click.option(
19
43
  "--log-file",
@@ -32,6 +56,7 @@ def cli(log_file: Path | None, log_level: str):
32
56
  """Command-line interface for smftools."""
33
57
  level = getattr(logging, log_level.upper(), logging.INFO)
34
58
  setup_logging(level=level, log_file=log_file)
59
+ _configure_multiprocessing()
35
60
 
36
61
 
37
62
  ####### Load anndata from raw data ###########
@@ -1 +1,3 @@
1
+ from __future__ import annotations
2
+
1
3
  from .experiment_config import ExperimentConfig, LoadExperimentConfig
@@ -77,6 +77,9 @@ aligner_args:
77
77
  # Sorted BAM and BED specific handling
78
78
  make_bigwigs: False # Whether to make coverage bigwigs
79
79
  make_beds: False # Whether to make beds from the aligned bams
80
+ samtools_backend: auto # auto|python|cli for samtools-compatible operations
81
+ bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
82
+ bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
80
83
 
81
84
  # Nanopore specific demultiplexing
82
85
  barcode_both_ends: False # dorado demultiplexing
@@ -370,4 +373,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
370
373
  bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
371
374
  force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
372
375
  bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
373
- force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
376
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
@@ -736,6 +736,9 @@ class ExperimentConfig:
736
736
  aligner_args: Optional[List[str]] = None
737
737
  make_bigwigs: bool = False
738
738
  make_beds: bool = False
739
+ samtools_backend: str = "auto"
740
+ bedtools_backend: str = "auto"
741
+ bigwig_backend: str = "auto"
739
742
 
740
743
  # Anndata structure
741
744
  reference_column: Optional[str] = REF_COL
@@ -1264,6 +1267,9 @@ class ExperimentConfig:
1264
1267
  device=merged.get("device", "auto"),
1265
1268
  make_bigwigs=merged.get("make_bigwigs", False),
1266
1269
  make_beds=merged.get("make_beds", False),
1270
+ samtools_backend=merged.get("samtools_backend", "auto"),
1271
+ bedtools_backend=merged.get("bedtools_backend", "auto"),
1272
+ bigwig_backend=merged.get("bigwig_backend", "auto"),
1267
1273
  delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
1268
1274
  mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
1269
1275
  enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from .datasets import Kissiov_and_McKenna_2025, dCas9_kinetics
2
4
 
3
5
  __all__ = ["dCas9_kinetics", "Kissiov_and_McKenna_2025"]
smftools/hmm/HMM.py CHANGED
@@ -3,14 +3,20 @@ from __future__ import annotations
3
3
  import ast
4
4
  import json
5
5
  from pathlib import Path
6
- from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
7
7
 
8
8
  import numpy as np
9
- import torch
10
- import torch.nn as nn
11
9
  from scipy.sparse import issparse
12
10
 
13
11
  from smftools.logging_utils import get_logger
12
+ from smftools.optional_imports import require
13
+
14
+ if TYPE_CHECKING:
15
+ import torch as torch_types
16
+ import torch.nn as nn_types
17
+
18
+ torch = require("torch", extra="torch", purpose="HMM modeling")
19
+ nn = torch.nn
14
20
 
15
21
  logger = get_logger(__name__)
16
22
  # =============================================================================
smftools/hmm/__init__.py CHANGED
@@ -1,13 +1,24 @@
1
- from .call_hmm_peaks import call_hmm_peaks
2
- from .display_hmm import display_hmm
3
- from .hmm_readwrite import load_hmm, save_hmm
4
- from .nucleosome_hmm_refinement import infer_nucleosomes_in_large_bound, refine_nucleosome_calls
5
-
6
- __all__ = [
7
- "call_hmm_peaks",
8
- "display_hmm",
9
- "load_hmm",
10
- "refine_nucleosome_calls",
11
- "infer_nucleosomes_in_large_bound",
12
- "save_hmm",
13
- ]
1
+ from __future__ import annotations
2
+
3
+ from importlib import import_module
4
+
5
+ _LAZY_ATTRS = {
6
+ "call_hmm_peaks": "smftools.hmm.call_hmm_peaks",
7
+ "display_hmm": "smftools.hmm.display_hmm",
8
+ "load_hmm": "smftools.hmm.hmm_readwrite",
9
+ "save_hmm": "smftools.hmm.hmm_readwrite",
10
+ "infer_nucleosomes_in_large_bound": "smftools.hmm.nucleosome_hmm_refinement",
11
+ "refine_nucleosome_calls": "smftools.hmm.nucleosome_hmm_refinement",
12
+ }
13
+
14
+
15
def __getattr__(name: str):
    """Resolve a lazily exposed attribute of this package on first access.

    Looks ``name`` up in ``_LAZY_ATTRS``; when present, imports the mapped
    module, reads the attribute called ``name`` from it, stores that object in
    this module's globals and returns it.

    Raises:
        AttributeError: if ``name`` is not a key of ``_LAZY_ATTRS``.
    """
    module_path = _LAZY_ATTRS.get(name)
    if module_path is None:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    resolved = getattr(import_module(module_path), name)
    # Store the object in globals so later lookups find it directly.
    globals()[name] = resolved
    return resolved
22
+
23
+
24
+ __all__ = list(_LAZY_ATTRS.keys())
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import numpy as np
2
4
  import pandas as pd
3
5
  import torch
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  # calculate_distances
2
4
 
3
5
  def calculate_distances(intervals, threshold=0.9):
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def call_hmm_peaks(
2
4
  adata,
3
5
  feature_configs,
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def train_hmm(
2
4
  data,
3
5
  emission_probs=[[0.8, 0.2], [0.2, 0.8]],
@@ -1,9 +1,11 @@
1
- # FILE: smftools/hmm/call_hmm_peaks.py
1
+ from __future__ import annotations
2
2
 
3
+ # FILE: smftools/hmm/call_hmm_peaks.py
3
4
  from pathlib import Path
4
5
  from typing import Any, Dict, Optional, Sequence, Union
5
6
 
6
7
  from smftools.logging_utils import get_logger
8
+ from smftools.optional_imports import require
7
9
 
8
10
  logger = get_logger(__name__)
9
11
 
@@ -35,12 +37,13 @@ def call_hmm_peaks(
35
37
  - adata.var["is_in_any_{layer}_peak_{ref}"]
36
38
  - adata.var["is_in_any_peak"] (global)
37
39
  """
38
- import matplotlib.pyplot as plt
39
40
  import numpy as np
40
41
  import pandas as pd
41
42
  from scipy.signal import find_peaks
42
43
  from scipy.sparse import issparse
43
44
 
45
+ plt = require("matplotlib.pyplot", extra="plotting", purpose="HMM peak plots")
46
+
44
47
  if not inplace:
45
48
  adata = adata.copy()
46
49