smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +2 -0
- smftools/cli/hmm_adata.py +7 -2
- smftools/cli/load_adata.py +130 -98
- smftools/cli/preprocess_adata.py +2 -0
- smftools/cli/spatial_adata.py +5 -1
- smftools/cli_entry.py +26 -1
- smftools/config/__init__.py +2 -0
- smftools/config/default.yaml +4 -1
- smftools/config/experiment_config.py +6 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +9 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +53 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +737 -170
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +66 -22
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +8 -2
- smftools/informatics/modkit_extract_to_adata.py +16 -6
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +6 -3
- smftools/plotting/hmm_plotting.py +12 -2
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/tools/__init__.py +26 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_umap.py +3 -1
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py
CHANGED
@@ -1,20 +1,52 @@
 """smftools"""
 
+from __future__ import annotations
+
 import logging
 import warnings
+from importlib import import_module
 from importlib.metadata import version
+from typing import TYPE_CHECKING
 
-from . import
-from . import informatics as inform
-from . import machine_learning as ml
-from . import plotting as pl
-from . import preprocessing as pp
-from . import tools as tl
-from .readwrite import adata_to_df, merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
+from .readwrite import adata_to_df, safe_read_h5ad, safe_write_h5ad
 
 package_name = "smftools"
 __version__ = version(package_name)
 
+if TYPE_CHECKING:
+    from smftools import (
+        cli,
+        config,
+        datasets,
+        hmm,
+        informatics,
+        machine_learning,
+        plotting,
+        preprocessing,
+        tools,
+    )
+
+_LAZY_MODULES = {
+    "cli": "smftools.cli",
+    "config": "smftools.config",
+    "datasets": "smftools.datasets",
+    "hmm": "smftools.hmm",
+    "inform": "smftools.informatics",
+    "ml": "smftools.machine_learning",
+    "pl": "smftools.plotting",
+    "pp": "smftools.preprocessing",
+    "tl": "smftools.tools",
+}
+
+
+def __getattr__(name: str):
+    if name in _LAZY_MODULES:
+        module = import_module(_LAZY_MODULES[name])
+        globals()[name] = module
+        return module
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
 __all__ = [
     "adata_to_df",
     "inform",
smftools/_settings.py
CHANGED
smftools/_version.py
CHANGED
smftools/cli/__init__.py
CHANGED
@@ -0,0 +1 @@
+from __future__ import annotations
smftools/cli/helpers.py
CHANGED
smftools/cli/hmm_adata.py
CHANGED
@@ -3,18 +3,23 @@ from __future__ import annotations
 import copy
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
-import torch
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 # FIX: import _to_dense_np to avoid NameError
 from ..hmm.HMM import _safe_int_coords, _to_dense_np, create_hmm, normalize_hmm_feature_sets
 
 logger = get_logger(__name__)
 
+if TYPE_CHECKING:
+    import torch as torch_types
+
+torch = require("torch", extra="torch", purpose="HMM CLI")
+
 # =============================================================================
 # Helpers: extracting training arrays
 # =============================================================================
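Several modules now route optional dependencies through smftools.optional_imports.require (a new module in this release, +31 lines; its body is not shown in this diff). A minimal sketch of a helper compatible with the call sites above, assuming require() returns the imported module or raises an informative ImportError:

    from importlib import import_module


    def require(module_name: str, *, extra: str, purpose: str):
        """Hypothetical stand-in for smftools.optional_imports.require."""
        try:
            return import_module(module_name)
        except ImportError as exc:
            raise ImportError(
                f"{module_name} is required for {purpose}; "
                f"install it via the '{extra}' extra, e.g. pip install 'smftools[{extra}]'"
            ) from exc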
smftools/cli/load_adata.py
CHANGED
@@ -1,7 +1,11 @@
+from __future__ import annotations
+
 import shutil
 from pathlib import Path
 from typing import Iterable, Union
 
+import numpy as np
+
 from smftools.logging_utils import get_logger
 
 from .helpers import AdataPaths
@@ -76,6 +80,96 @@ def delete_tsvs(
             logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
 
 
+def load_adata(config_path: str):
+    """
+    CLI-facing wrapper for the load pipeline.
+
+    - Reads config CSV into ExperimentConfig
+    - Computes canonical paths for all downstream AnnData stages
+    - Registers those in the summary CSV
+    - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
+    - If needed, calls the core pipeline to actually build the raw AnnData
+
+    Returns
+    -------
+    adata : anndata.AnnData | None
+        Newly created AnnData object, or None if we skipped because a later-stage
+        AnnData already exists.
+    adata_path : pathlib.Path
+        Path to the "current" AnnData that should be used downstream.
+    cfg : ExperimentConfig
+        Config object for downstream steps.
+    """
+    from datetime import datetime
+    from importlib import resources
+
+    from ..config import ExperimentConfig, LoadExperimentConfig
+    from ..readwrite import add_or_update_column_in_csv, make_dirs
+    from .helpers import get_adata_paths
+
+    date_str = datetime.today().strftime("%y%m%d")
+
+    # -----------------------------
+    # 1) Load config into cfg
+    # -----------------------------
+    loader = LoadExperimentConfig(config_path)
+    defaults_dir = resources.files("smftools").joinpath("config")
+    cfg, report = ExperimentConfig.from_var_dict(
+        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
+    )
+
+    # Ensure base output dir
+    make_dirs([cfg.output_directory])
+
+    # -----------------------------
+    # 2) Compute and register paths
+    # -----------------------------
+    paths = get_adata_paths(cfg)
+
+    # experiment-level metadata in summary CSV
+    add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
+    add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
+    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
+    add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
+
+    # AnnData stage paths
+    add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
+    add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
+    add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
+    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
+    add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
+
+    # -----------------------------
+    # 3) Stage skipping logic
+    # -----------------------------
+    if not getattr(cfg, "force_redo_load_adata", False):
+        if paths.hmm.exists():
+            logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
+            return None, paths.hmm, cfg
+        if paths.spatial.exists():
+            logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
+            return None, paths.spatial, cfg
+        if paths.pp_dedup.exists():
+            logger.debug(
+                f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
+                f"Skipping smftools load"
+            )
+            return None, paths.pp_dedup, cfg
+        if paths.pp.exists():
+            logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
+            return None, paths.pp, cfg
+        if paths.raw.exists():
+            logger.debug(
+                f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
+            )
+            return None, paths.raw, cfg
+
+    # If we get here, we actually want to run the full load pipeline
+    adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
+
+    return adata, adata_path, cfg
+
+
 def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     """
     Core load pipeline.
@@ -105,9 +199,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     cfg : ExperimentConfig
         (Same object, possibly with some fields updated, e.g. fasta path.)
     """
-    from pathlib import Path
-
-    import numpy as np
 
     from ..informatics.bam_functions import (
         align_and_sort_BAM,
@@ -219,6 +310,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         rg_sample_field=None,
         progress=False,
         auto_pair=cfg.fastq_auto_pairing,
+        samtools_backend=cfg.samtools_backend,
     )
 
     logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
@@ -384,7 +476,14 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     else:
         logger.info("Making bed files from the aligned and sorted BAM file")
         aligned_BAM_to_bed(
-            aligned_sorted_output,
+            aligned_sorted_output,
+            cfg.output_directory,
+            fasta,
+            cfg.make_bigwigs,
+            cfg.threads,
+            samtools_backend=cfg.samtools_backend,
+            bedtools_backend=cfg.bedtools_backend,
+            bigwig_backend=cfg.bigwig_backend,
         )
     ########################################################################################################################
 
@@ -404,7 +503,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     else:
         make_dirs([cfg.split_path])
         logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
-        all_bam_files = split_and_index_BAM(
+        all_bam_files = split_and_index_BAM(
+            aligned_sorted_BAM,
+            cfg.split_path,
+            cfg.bam_suffix,
+            samtools_backend=cfg.samtools_backend,
+        )
 
         unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
         bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
@@ -489,7 +593,16 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     else:
         logger.info("Making BED files from BAM files for each sample")
         for bam in bam_files:
-            aligned_BAM_to_bed(
+            aligned_BAM_to_bed(
+                bam,
+                cfg.split_path,
+                fasta,
+                cfg.make_bigwigs,
+                cfg.threads,
+                samtools_backend=cfg.samtools_backend,
+                bedtools_backend=cfg.bedtools_backend,
+                bigwig_backend=cfg.bigwig_backend,
+            )
     ########################################################################################################################
 
     ################################### 6) SAMTools based BAM QC ######################################################################
@@ -501,7 +614,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     else:
         make_dirs([bam_qc_dir])
         logger.info("Performing BAM QC")
-        bam_qc(
+        bam_qc(
+            bam_files,
+            bam_qc_dir,
+            cfg.threads,
+            modality=cfg.smf_modality,
+            samtools_backend=cfg.samtools_backend,
+        )
     ########################################################################################################################
 
     ################################### 7) AnnData loading ######################################################################
@@ -529,6 +648,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
             deaminase_footprinting,
             delete_intermediates=cfg.delete_intermediate_hdfs,
             double_barcoded_path=double_barcoded_path,
+            samtools_backend=cfg.samtools_backend,
         )
     else:
         if mod_bed_dir.is_dir():
@@ -584,6 +704,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
             cfg.delete_batch_hdfs,
             cfg.threads,
             double_barcoded_path,
+            cfg.samtools_backend,
         )
         if cfg.delete_intermediate_tsvs:
             delete_tsvs(mod_tsv_dir)
@@ -604,6 +725,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         extract_read_features_from_bam_callable=extract_read_features_from_bam,
         bypass=cfg.bypass_add_read_length_and_mapping_qc,
         force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
+        samtools_backend=cfg.samtools_backend,
     )
 
     raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
@@ -639,7 +761,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     # multiqc ###
     mqc_dir = cfg.split_path / "multiqc"
     if mqc_dir.is_dir():
-        logger.
+        logger.info(f"{mqc_dir} already exists, skipping multiqc")
     else:
         logger.info("Running multiqc")
         run_multiqc(cfg.split_path, mqc_dir)
@@ -665,93 +787,3 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     ########################################################################################################################
 
     return raw_adata, raw_adata_path, cfg
-
-
-def load_adata(config_path: str):
-    """
-    CLI-facing wrapper for the load pipeline.
-
-    - Reads config CSV into ExperimentConfig
-    - Computes canonical paths for all downstream AnnData stages
-    - Registers those in the summary CSV
-    - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
-    - If needed, calls the core pipeline to actually build the raw AnnData
-
-    Returns
-    -------
-    adata : anndata.AnnData | None
-        Newly created AnnData object, or None if we skipped because a later-stage
-        AnnData already exists.
-    adata_path : pathlib.Path
-        Path to the "current" AnnData that should be used downstream.
-    cfg : ExperimentConfig
-        Config object for downstream steps.
-    """
-    from datetime import datetime
-    from importlib import resources
-
-    from ..config import ExperimentConfig, LoadExperimentConfig
-    from ..readwrite import add_or_update_column_in_csv, make_dirs
-    from .helpers import get_adata_paths
-
-    date_str = datetime.today().strftime("%y%m%d")
-
-    # -----------------------------
-    # 1) Load config into cfg
-    # -----------------------------
-    loader = LoadExperimentConfig(config_path)
-    defaults_dir = resources.files("smftools").joinpath("config")
-    cfg, report = ExperimentConfig.from_var_dict(
-        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
-    )
-
-    # Ensure base output dir
-    make_dirs([cfg.output_directory])
-
-    # -----------------------------
-    # 2) Compute and register paths
-    # -----------------------------
-    paths = get_adata_paths(cfg)
-
-    # experiment-level metadata in summary CSV
-    add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
-    add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
-    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
-    add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
-
-    # AnnData stage paths
-    add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
-    add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
-    add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
-    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
-    add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
-
-    # -----------------------------
-    # 3) Stage skipping logic
-    # -----------------------------
-    if not getattr(cfg, "force_redo_load_adata", False):
-        if paths.hmm.exists():
-            logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
-            return None, paths.hmm, cfg
-        if paths.spatial.exists():
-            logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
-            return None, paths.spatial, cfg
-        if paths.pp_dedup.exists():
-            logger.debug(
-                f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
-                f"Skipping smftools load"
-            )
-            return None, paths.pp_dedup, cfg
-        if paths.pp.exists():
-            logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
-            return None, paths.pp, cfg
-        if paths.raw.exists():
-            logger.debug(
-                f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
-            )
-            return None, paths.raw, cfg
-
-    # If we get here, we actually want to run the full load pipeline
-    adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
-
-    return adata, adata_path, cfg
smftools/cli/preprocess_adata.py
CHANGED
smftools/cli/spatial_adata.py
CHANGED
@@ -1,9 +1,12 @@
+from __future__ import annotations
+
 from pathlib import Path
 from typing import Optional, Tuple
 
 import anndata as ad
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 logger = get_logger(__name__)
 
@@ -153,7 +156,8 @@ def spatial_adata_core(
 
     import numpy as np
     import pandas as pd
-
+
+    sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
 
     from ..metadata import record_smftools_metadata
     from ..plotting import (
smftools/cli_entry.py
CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 from pathlib import Path
 from typing import Sequence
@@ -10,10 +12,32 @@ from .cli.load_adata import load_adata
 from .cli.preprocess_adata import preprocess_adata
 from .cli.spatial_adata import spatial_adata
 from .informatics.pod5_functions import subsample_pod5
-from .logging_utils import setup_logging
+from .logging_utils import get_logger, setup_logging
 from .readwrite import concatenate_h5ads
 
 
+def _configure_multiprocessing() -> None:
+    import multiprocessing as mp
+    import sys
+
+    logger = get_logger(__name__)
+
+    try:
+        if sys.platform == "win32":
+            mp.set_start_method("spawn")
+            logger.debug("Setting multiprocessing start method to spawn")
+        else:
+            # try forkserver first, fallback to spawn
+            try:
+                mp.set_start_method("forkserver")
+                logger.debug("Setting multiprocessing start method to forkserver")
+            except ValueError:
+                mp.set_start_method("spawn")
+                logger.debug("Setting multiprocessing start method to spawn")
+    except RuntimeError:
+        logger.warning("Could not set multiprocessing start method")
+
+
 @click.group()
 @click.option(
     "--log-file",
@@ -32,6 +56,7 @@ def cli(log_file: Path | None, log_level: str):
     """Command-line interface for smftools."""
     level = getattr(logging, log_level.upper(), logging.INFO)
    setup_logging(level=level, log_file=log_file)
+    _configure_multiprocessing()
 
 
 ####### Load anndata from raw data ###########
smftools/config/__init__.py
CHANGED
smftools/config/default.yaml
CHANGED
@@ -77,6 +77,9 @@ aligner_args:
 # Sorted BAM and BED specific handling
 make_bigwigs: False # Whether to make coverage bigwigs
 make_beds: False # Whether to make beds from the aligned bams
+samtools_backend: auto # auto|python|cli for samtools-compatible operations
+bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
+bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
 
 # Nanopore specific demultiplexing
 barcode_both_ends: False # dorado demultiplexing
@@ -370,4 +373,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
 bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
 force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
 bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
-force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
+force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
smftools/config/experiment_config.py
CHANGED
@@ -736,6 +736,9 @@ class ExperimentConfig:
     aligner_args: Optional[List[str]] = None
     make_bigwigs: bool = False
     make_beds: bool = False
+    samtools_backend: str = "auto"
+    bedtools_backend: str = "auto"
+    bigwig_backend: str = "auto"
 
     # Anndata structure
     reference_column: Optional[str] = REF_COL
@@ -1264,6 +1267,9 @@
             device=merged.get("device", "auto"),
             make_bigwigs=merged.get("make_bigwigs", False),
             make_beds=merged.get("make_beds", False),
+            samtools_backend=merged.get("samtools_backend", "auto"),
+            bedtools_backend=merged.get("bedtools_backend", "auto"),
+            bigwig_backend=merged.get("bigwig_backend", "auto"),
             delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
             mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
             enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
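The three new *_backend settings accept auto|python|cli. How "auto" is resolved lives in the informatics modules and is not shown in this diff; a hypothetical sketch of the kind of dispatch such a switch implies, assuming pysam as the Python-side samtools backend:

    import shutil


    def resolve_samtools_backend(requested: str = "auto") -> str:
        """Hypothetical resolver; smftools' actual logic may differ."""
        if requested in ("python", "cli"):
            return requested
        # "auto": prefer the Python bindings if importable, else the samtools CLI.
        try:
            import pysam  # noqa: F401
            return "python"
        except ImportError:
            pass
        if shutil.which("samtools"):
            return "cli"
        raise RuntimeError("no samtools backend available: install pysam or samtools")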
smftools/datasets/__init__.py
CHANGED
smftools/hmm/HMM.py
CHANGED
@@ -3,14 +3,20 @@ from __future__ import annotations
 import ast
 import json
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
-import torch
-import torch.nn as nn
 from scipy.sparse import issparse
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import torch as torch_types
+    import torch.nn as nn_types
+
+torch = require("torch", extra="torch", purpose="HMM modeling")
+nn = torch.nn
 
 logger = get_logger(__name__)
 # =============================================================================
smftools/hmm/__init__.py
CHANGED
@@ -1,13 +1,24 @@
-from
-
-from
-
-
-
-"
-"
-"
-"
-"
-
-
+from __future__ import annotations
+
+from importlib import import_module
+
+_LAZY_ATTRS = {
+    "call_hmm_peaks": "smftools.hmm.call_hmm_peaks",
+    "display_hmm": "smftools.hmm.display_hmm",
+    "load_hmm": "smftools.hmm.hmm_readwrite",
+    "save_hmm": "smftools.hmm.hmm_readwrite",
+    "infer_nucleosomes_in_large_bound": "smftools.hmm.nucleosome_hmm_refinement",
+    "refine_nucleosome_calls": "smftools.hmm.nucleosome_hmm_refinement",
+}
+
+
+def __getattr__(name: str):
+    if name in _LAZY_ATTRS:
+        module = import_module(_LAZY_ATTRS[name])
+        attr = getattr(module, name)
+        globals()[name] = attr
+        return attr
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
+
+__all__ = list(_LAZY_ATTRS.keys())
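Usage sketch for the lazy attribute table above; attribute access triggers the underlying module import and caches the resolved function on the package:

    from smftools import hmm

    # Importing smftools.hmm is now cheap; this line imports
    # smftools.hmm.call_hmm_peaks and caches the function on the package.
    call_peaks = hmm.call_hmm_peaks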
smftools/hmm/call_hmm_peaks.py
CHANGED
@@ -1,9 +1,11 @@
-
 
+# FILE: smftools/hmm/call_hmm_peaks.py
 from pathlib import Path
 from typing import Any, Dict, Optional, Sequence, Union
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 logger = get_logger(__name__)
 
@@ -35,12 +37,13 @@ def call_hmm_peaks(
     - adata.var["is_in_any_{layer}_peak_{ref}"]
     - adata.var["is_in_any_peak"] (global)
     """
-    import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
     from scipy.signal import find_peaks
     from scipy.sparse import issparse
 
+    plt = require("matplotlib.pyplot", extra="plotting", purpose="HMM peak plots")
+
     if not inplace:
         adata = adata.copy()
 