smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py
CHANGED
|
@@ -1,19 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
1
7
|
import click
|
|
2
8
|
import pandas as pd
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Dict, Optional, Sequence
|
|
5
9
|
|
|
10
|
+
from .cli.hmm_adata import hmm_adata
|
|
6
11
|
from .cli.load_adata import load_adata
|
|
7
12
|
from .cli.preprocess_adata import preprocess_adata
|
|
8
13
|
from .cli.spatial_adata import spatial_adata
|
|
9
|
-
from .
|
|
14
|
+
from .informatics.pod5_functions import subsample_pod5
|
|
15
|
+
from .logging_utils import get_logger, setup_logging
|
|
16
|
+
from .readwrite import concatenate_h5ads
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _configure_multiprocessing() -> None:
|
|
20
|
+
import multiprocessing as mp
|
|
21
|
+
import sys
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
if sys.platform == "win32":
|
|
27
|
+
mp.set_start_method("spawn")
|
|
28
|
+
logger.debug("Setting multiprocessing start method to spawn")
|
|
29
|
+
else:
|
|
30
|
+
# try forkserver first, fallback to spawn
|
|
31
|
+
try:
|
|
32
|
+
mp.set_start_method("forkserver")
|
|
33
|
+
logger.debug("Setting multiprocessing start method to forkserver")
|
|
34
|
+
except ValueError:
|
|
35
|
+
mp.set_start_method("spawn")
|
|
36
|
+
logger.debug("Setting multiprocessing start method to spawn")
|
|
37
|
+
except RuntimeError:
|
|
38
|
+
logger.warning("Could not set multiprocessing start method")
|
|
10
39
|
|
|
11
|
-
from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
|
|
12
40
|
|
|
13
41
|
@click.group()
|
|
14
|
-
|
|
42
|
+
@click.option(
|
|
43
|
+
"--log-file",
|
|
44
|
+
type=click.Path(dir_okay=False, writable=True, path_type=Path),
|
|
45
|
+
default=None,
|
|
46
|
+
help="Optional file path to write smftools logs.",
|
|
47
|
+
)
|
|
48
|
+
@click.option(
|
|
49
|
+
"--log-level",
|
|
50
|
+
type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
|
|
51
|
+
default="INFO",
|
|
52
|
+
show_default=True,
|
|
53
|
+
help="Logging level for smftools output.",
|
|
54
|
+
)
|
|
55
|
+
def cli(log_file: Path | None, log_level: str):
|
|
15
56
|
"""Command-line interface for smftools."""
|
|
16
|
-
|
|
57
|
+
level = getattr(logging, log_level.upper(), logging.INFO)
|
|
58
|
+
setup_logging(level=level, log_file=log_file)
|
|
59
|
+
_configure_multiprocessing()
|
|
60
|
+
|
|
17
61
|
|
|
18
62
|
####### Load anndata from raw data ###########
|
|
19
63
|
@cli.command()
|
|
@@ -21,32 +65,44 @@ def cli():
|
|
|
21
65
|
def load(config_path):
|
|
22
66
|
"""Load and process data from CONFIG_PATH."""
|
|
23
67
|
load_adata(config_path)
|
|
68
|
+
|
|
69
|
+
|
|
24
70
|
##########################################
|
|
25
71
|
|
|
72
|
+
|
|
26
73
|
####### Preprocessing ###########
|
|
27
74
|
@cli.command()
|
|
28
75
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
29
76
|
def preprocess(config_path):
|
|
30
77
|
"""Preprocess data from CONFIG_PATH."""
|
|
31
78
|
preprocess_adata(config_path)
|
|
79
|
+
|
|
80
|
+
|
|
32
81
|
##########################################
|
|
33
82
|
|
|
83
|
+
|
|
34
84
|
####### Spatial ###########
|
|
35
85
|
@cli.command()
|
|
36
86
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
37
87
|
def spatial(config_path):
|
|
38
88
|
"""Process data from CONFIG_PATH."""
|
|
39
89
|
spatial_adata(config_path)
|
|
90
|
+
|
|
91
|
+
|
|
40
92
|
##########################################
|
|
41
93
|
|
|
94
|
+
|
|
42
95
|
####### HMM ###########
|
|
43
96
|
@cli.command()
|
|
44
97
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
45
98
|
def hmm(config_path):
|
|
46
99
|
"""Process data from CONFIG_PATH."""
|
|
47
100
|
hmm_adata(config_path)
|
|
101
|
+
|
|
102
|
+
|
|
48
103
|
##########################################
|
|
49
104
|
|
|
105
|
+
|
|
50
106
|
####### batch command ###########
|
|
51
107
|
@cli.command()
|
|
52
108
|
@click.argument(
|
|
@@ -125,7 +181,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
125
181
|
dtype=str,
|
|
126
182
|
)
|
|
127
183
|
except Exception as e:
|
|
128
|
-
raise click.ClickException(
|
|
184
|
+
raise click.ClickException(
|
|
185
|
+
f"Failed to read {config_table} as headerless list: {e}"
|
|
186
|
+
) from e
|
|
129
187
|
|
|
130
188
|
config_series = df[column]
|
|
131
189
|
else:
|
|
@@ -136,12 +194,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
136
194
|
)
|
|
137
195
|
config_series = df[column]
|
|
138
196
|
|
|
139
|
-
config_paths = (
|
|
140
|
-
config_series.dropna()
|
|
141
|
-
.map(str)
|
|
142
|
-
.map(lambda p: Path(p).expanduser())
|
|
143
|
-
.tolist()
|
|
144
|
-
)
|
|
197
|
+
config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
|
|
145
198
|
|
|
146
199
|
# ----------------------------
|
|
147
200
|
# Validate config paths
|
|
@@ -162,9 +215,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
162
215
|
|
|
163
216
|
func = task_funcs[task]
|
|
164
217
|
|
|
165
|
-
click.echo(
|
|
166
|
-
f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
|
|
167
|
-
)
|
|
218
|
+
click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
|
|
168
219
|
|
|
169
220
|
# ----------------------------
|
|
170
221
|
# Loop over paths
|
|
@@ -177,13 +228,16 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
177
228
|
click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
|
|
178
229
|
|
|
179
230
|
try:
|
|
180
|
-
func(str(cfg))
|
|
231
|
+
func(str(cfg)) # underlying functions take a string path
|
|
181
232
|
except Exception as e:
|
|
182
233
|
click.echo(f" ERROR on {cfg}: {e}")
|
|
183
234
|
|
|
184
235
|
click.echo("Batch processing complete.")
|
|
236
|
+
|
|
237
|
+
|
|
185
238
|
##########################################
|
|
186
239
|
|
|
240
|
+
|
|
187
241
|
####### concatenate command ###########
|
|
188
242
|
@cli.command("concatenate")
|
|
189
243
|
@click.argument(
|
|
@@ -269,166 +323,53 @@ def concatenate_cmd(
|
|
|
269
323
|
|
|
270
324
|
except Exception as e:
|
|
271
325
|
raise click.ClickException(str(e)) from e
|
|
326
|
+
|
|
327
|
+
|
|
272
328
|
##########################################
|
|
273
329
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
#
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
# raise click.ClickException("Config CSV contains only a header row.")
|
|
321
|
-
|
|
322
|
-
# # Build dict; last occurrence of a key wins
|
|
323
|
-
# cfg = {}
|
|
324
|
-
# for k, v in zip(df["key"], df["value"]):
|
|
325
|
-
# cfg[k] = v
|
|
326
|
-
|
|
327
|
-
# # Validate required keys
|
|
328
|
-
# missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
|
|
329
|
-
# if missing:
|
|
330
|
-
# raise click.ClickException(
|
|
331
|
-
# "Missing required keys in CSV: "
|
|
332
|
-
# + ", ".join(missing)
|
|
333
|
-
# + "\nExpected keys:\n - "
|
|
334
|
-
# + "\n - ".join(REQUIRED_KEYS)
|
|
335
|
-
# + "\nOptional keys:\n - "
|
|
336
|
-
# + "\n - ".join(OPTIONAL_KEYS)
|
|
337
|
-
# )
|
|
338
|
-
|
|
339
|
-
# return cfg
|
|
340
|
-
|
|
341
|
-
# def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
|
|
342
|
-
# """Decide on the output .h5ad path based on CSV; create directories if needed."""
|
|
343
|
-
# merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
|
|
344
|
-
# if not merged_filename.endswith(".h5ad"):
|
|
345
|
-
# merged_filename += ".h5ad"
|
|
346
|
-
|
|
347
|
-
# output_path_raw = cfg.get("output_path", "").strip()
|
|
348
|
-
|
|
349
|
-
# if not output_path_raw:
|
|
350
|
-
# out_dir = Path.cwd() / "merged_output"
|
|
351
|
-
# out_dir.mkdir(parents=True, exist_ok=True)
|
|
352
|
-
# return out_dir / merged_filename
|
|
353
|
-
|
|
354
|
-
# output_path = Path(output_path_raw)
|
|
355
|
-
|
|
356
|
-
# if output_path.suffix.lower() == ".h5ad":
|
|
357
|
-
# output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
358
|
-
# return output_path
|
|
359
|
-
|
|
360
|
-
# # Treat as directory
|
|
361
|
-
# output_path.mkdir(parents=True, exist_ok=True)
|
|
362
|
-
# return output_path / merged_filename
|
|
363
|
-
|
|
364
|
-
# def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
|
|
365
|
-
|
|
366
|
-
# if backups:
|
|
367
|
-
# click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
|
|
368
|
-
# return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
|
|
369
|
-
# else:
|
|
370
|
-
# click.echo(f"Loading {label} from {primary} with backups disabled ...")
|
|
371
|
-
# return safe_read_h5ad(primary, restore_backups=False)
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
# @cli.command()
|
|
375
|
-
# @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
|
|
376
|
-
# def merge_barcoded_anndatas(config_path: Path):
|
|
377
|
-
# """
|
|
378
|
-
# Merge two AnnData objects from the same experiment that were demultiplexed
|
|
379
|
-
# under different end-barcoding requirements, using a 1-row CSV for config.
|
|
380
|
-
|
|
381
|
-
# CSV must include:
|
|
382
|
-
# - adata_single_path
|
|
383
|
-
# - adata_double_path
|
|
384
|
-
|
|
385
|
-
# Optional columns:
|
|
386
|
-
# - adata_single_backups_path
|
|
387
|
-
# - adata_double_backups_path
|
|
388
|
-
# - output_path (file or directory; default: ./merged_output/)
|
|
389
|
-
# - merged_filename (default: merged_<single>__<double>.h5ad)
|
|
390
|
-
|
|
391
|
-
# Example CSV:
|
|
392
|
-
|
|
393
|
-
# adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
|
|
394
|
-
# /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
|
|
395
|
-
# """
|
|
396
|
-
# try:
|
|
397
|
-
# cfg = _read_config_csv(config_path)
|
|
398
|
-
|
|
399
|
-
# single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
|
|
400
|
-
# double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
|
|
401
|
-
|
|
402
|
-
# for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
|
|
403
|
-
# if not p.exists():
|
|
404
|
-
# raise click.ClickException(f"{label} does not exist: {p}")
|
|
405
|
-
|
|
406
|
-
# single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
|
|
407
|
-
# double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
|
|
408
|
-
|
|
409
|
-
# if single_backups and not single_backups.exists():
|
|
410
|
-
# raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
|
|
411
|
-
# if double_backups and not double_backups.exists():
|
|
412
|
-
# raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
|
|
413
|
-
|
|
414
|
-
# output_path = _resolve_output_path(cfg, single_path, double_path)
|
|
415
|
-
|
|
416
|
-
# # Load
|
|
417
|
-
# adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
|
|
418
|
-
# adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
|
|
419
|
-
|
|
420
|
-
# click.echo("Merging AnnDatas ...")
|
|
421
|
-
# merged = merge_barcoded_anndatas_core(adata_single, adata_double)
|
|
422
|
-
|
|
423
|
-
# click.echo(f"Writing merged AnnData to: {output_path}")
|
|
424
|
-
# backup_dir = output_path.cwd() / "merged_backups"
|
|
425
|
-
# safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
|
|
426
|
-
|
|
427
|
-
# click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
|
|
428
|
-
|
|
429
|
-
# except click.ClickException:
|
|
430
|
-
# raise
|
|
431
|
-
# except Exception as e:
|
|
432
|
-
# # Surface unexpected errors cleanly
|
|
433
|
-
# raise click.ClickException(f"Unexpected error: {e}") from e
|
|
434
|
-
################################################################################################################
|
|
330
|
+
|
|
331
|
+
####### subsample pod5 command ###########
|
|
332
|
+
@cli.command("subsample-pod5")
|
|
333
|
+
@click.argument(
|
|
334
|
+
"pod5_path",
|
|
335
|
+
type=click.Path(exists=True, path_type=Path),
|
|
336
|
+
)
|
|
337
|
+
@click.option(
|
|
338
|
+
"--read-names",
|
|
339
|
+
"-r",
|
|
340
|
+
type=click.Path(exists=True, path_type=Path),
|
|
341
|
+
default=None,
|
|
342
|
+
help="Text file with one read_id per line.",
|
|
343
|
+
)
|
|
344
|
+
@click.option(
|
|
345
|
+
"--n-reads",
|
|
346
|
+
"-n",
|
|
347
|
+
type=int,
|
|
348
|
+
default=None,
|
|
349
|
+
help="Randomly subsample N reads.",
|
|
350
|
+
)
|
|
351
|
+
@click.option(
|
|
352
|
+
"--outdir",
|
|
353
|
+
"-o",
|
|
354
|
+
type=click.Path(path_type=Path, file_okay=False),
|
|
355
|
+
required=True,
|
|
356
|
+
help="Output directory for subsampled POD5.",
|
|
357
|
+
)
|
|
358
|
+
def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
|
|
359
|
+
"""
|
|
360
|
+
Subsample POD5 file(s) by read ID list or random sampling.
|
|
361
|
+
"""
|
|
362
|
+
|
|
363
|
+
# --- Validate mutually exclusive options ---
|
|
364
|
+
if (read_names is None and n_reads is None) or (read_names and n_reads):
|
|
365
|
+
raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
|
|
366
|
+
|
|
367
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
|
368
|
+
|
|
369
|
+
subsample_arg = str(read_names) if read_names else n_reads
|
|
370
|
+
|
|
371
|
+
subsample_pod5(
|
|
372
|
+
pod5_path=str(pod5_path),
|
|
373
|
+
read_name_path=subsample_arg,
|
|
374
|
+
output_directory=str(outdir),
|
|
375
|
+
)
|
smftools/config/__init__.py
CHANGED
smftools/config/conversion.yaml
CHANGED
|
@@ -9,6 +9,10 @@ conversion_types:
|
|
|
9
9
|
# Read QC Params
|
|
10
10
|
read_mod_filtering_use_other_c_as_background: True
|
|
11
11
|
|
|
12
|
+
# Spatial Analysis - Autocorr params
|
|
13
|
+
autocorr_site_types:
|
|
14
|
+
- "GpC"
|
|
15
|
+
|
|
12
16
|
# Spatial Analysis - Clustermap params
|
|
13
17
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
14
18
|
clustermap_cmap_c: "coolwarm"
|
|
@@ -42,4 +46,4 @@ hmm_feature_sets:
|
|
|
42
46
|
cpg_patch: [0, inf]
|
|
43
47
|
|
|
44
48
|
hmm_merge_layer_features:
|
|
45
|
-
- ["
|
|
49
|
+
- ["all_accessible_features", 60]
|
smftools/config/deaminase.yaml
CHANGED
smftools/config/default.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# General
|
|
2
2
|
sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
|
|
3
|
-
sample_sheet_mapping_column: '
|
|
4
|
-
sample_name_col_for_plotting: '
|
|
3
|
+
sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
|
|
4
|
+
sample_name_col_for_plotting: 'Experiment_name_and_barcode'
|
|
5
5
|
|
|
6
6
|
# Compute params
|
|
7
7
|
threads: 4
|
|
@@ -9,9 +9,7 @@ device: "auto"
|
|
|
9
9
|
|
|
10
10
|
######## smftools load params #########
|
|
11
11
|
# Generic i/o
|
|
12
|
-
bam_suffix: ".bam"
|
|
13
12
|
recursive_input_search: True
|
|
14
|
-
split_dir: "demultiplexed_BAMs"
|
|
15
13
|
strands:
|
|
16
14
|
- bottom
|
|
17
15
|
- top
|
|
@@ -21,7 +19,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
|
|
|
21
19
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
22
20
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
23
21
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
24
|
-
delete_intermediate_bams:
|
|
22
|
+
delete_intermediate_bams: True # Whether to delete intermediate BAM files.
|
|
25
23
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
26
24
|
|
|
27
25
|
# Sequencing modality and general experiment params
|
|
@@ -53,7 +51,6 @@ aligner_args:
|
|
|
53
51
|
- '-y'
|
|
54
52
|
- '-N'
|
|
55
53
|
- '5'
|
|
56
|
-
- '--secondary=no'
|
|
57
54
|
pacbio:
|
|
58
55
|
- '-a'
|
|
59
56
|
- '-x'
|
|
@@ -63,7 +60,6 @@ aligner_args:
|
|
|
63
60
|
- '-y'
|
|
64
61
|
- '-N'
|
|
65
62
|
- '5'
|
|
66
|
-
- '--secondary=no'
|
|
67
63
|
illumina:
|
|
68
64
|
- '-a'
|
|
69
65
|
- '-x'
|
|
@@ -73,7 +69,6 @@ aligner_args:
|
|
|
73
69
|
- '-y'
|
|
74
70
|
- '-N'
|
|
75
71
|
- '5'
|
|
76
|
-
- '--secondary=no'
|
|
77
72
|
dorado:
|
|
78
73
|
ont:
|
|
79
74
|
- "--mm2-opts"
|
|
@@ -82,15 +77,18 @@ aligner_args:
|
|
|
82
77
|
# Sorted BAM and BED specific handling
|
|
83
78
|
make_bigwigs: False # Whether to make coverage bigwigs
|
|
84
79
|
make_beds: False # Whether to make beds from the aligned bams
|
|
80
|
+
samtools_backend: auto # auto|python|cli for samtools-compatible operations
|
|
81
|
+
bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
|
|
82
|
+
bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
|
|
85
83
|
|
|
86
84
|
# Nanopore specific demultiplexing
|
|
87
85
|
barcode_both_ends: False # dorado demultiplexing
|
|
88
86
|
trim: False # dorado adapter and barcode removal during demultiplexing
|
|
89
87
|
|
|
90
88
|
# Anndata structure
|
|
91
|
-
mapping_threshold: 0.
|
|
89
|
+
mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
|
|
92
90
|
reference_column: 'Reference_strand'
|
|
93
|
-
sample_column: '
|
|
91
|
+
sample_column: 'Experiment_name_and_barcode'
|
|
94
92
|
|
|
95
93
|
######## smftools preprocess params #########
|
|
96
94
|
# Read length, quality, and mapping filtering params
|
|
@@ -101,7 +99,7 @@ read_len_filter_thresholds:
|
|
|
101
99
|
- 100
|
|
102
100
|
- null
|
|
103
101
|
read_len_to_ref_ratio_filter_thresholds:
|
|
104
|
-
-
|
|
102
|
+
- null
|
|
105
103
|
- null
|
|
106
104
|
read_quality_filter_thresholds:
|
|
107
105
|
- 15
|
|
@@ -179,13 +177,12 @@ umap_layers_to_plot:
|
|
|
179
177
|
- "Raw_modification_signal"
|
|
180
178
|
|
|
181
179
|
# Spatial Analysis - Spatial Autocorrelation params
|
|
180
|
+
autocorr_normalization_method: "pearson" # options are pearson or sum
|
|
182
181
|
rows_per_qc_autocorr_grid: 6
|
|
183
182
|
autocorr_rolling_window_size: 25
|
|
184
183
|
autocorr_max_lag: 800
|
|
185
184
|
autocorr_site_types:
|
|
186
185
|
- "GpC"
|
|
187
|
-
- "CpG"
|
|
188
|
-
- "C"
|
|
189
186
|
|
|
190
187
|
# Spatial Analysis - Correlation Matrix params
|
|
191
188
|
correlation_matrix_types:
|
|
@@ -210,10 +207,19 @@ hmm_init_start_probs:
|
|
|
210
207
|
- 0.5
|
|
211
208
|
- 0.5
|
|
212
209
|
hmm_eps: 1e-8
|
|
210
|
+
# Fitting strategy
|
|
211
|
+
hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
|
|
212
|
+
hmm_shared_scope: ["reference", "methbase"]
|
|
213
|
+
hmm_groupby: ["sample", "reference", "methbase"]
|
|
214
|
+
# If hmm_fit_strategy == shared_transitions
|
|
215
|
+
hmm_adapt_emissions: true
|
|
216
|
+
hmm_adapt_startprobs: true
|
|
217
|
+
hmm_emission_adapt_iters: 5
|
|
218
|
+
hmm_emission_adapt_tol: 1.0e-4
|
|
213
219
|
hmm_dtype: "float64"
|
|
214
|
-
hmm_annotation_threshold: 0.5
|
|
215
|
-
hmm_batch_size: 1024
|
|
216
|
-
hmm_use_viterbi: False
|
|
220
|
+
hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
|
|
221
|
+
hmm_batch_size: 1024 # hmm batch size
|
|
222
|
+
hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
|
|
217
223
|
footprints: True # whether to use the default HMM footprint params
|
|
218
224
|
accessible_patches: True # whether to use the default HMM accessible patch params
|
|
219
225
|
cpg: False # whether to use the default HMM endogenous CpG patch params
|
|
@@ -238,7 +244,7 @@ hmm_feature_sets:
|
|
|
238
244
|
large_accessible_patch: [40, 110]
|
|
239
245
|
nucleosome_depleted_region: [110, inf]
|
|
240
246
|
hmm_merge_layer_features:
|
|
241
|
-
- [
|
|
247
|
+
- ["all_accessible_features", 60]
|
|
242
248
|
clustermap_cmap_hmm: "coolwarm"
|
|
243
249
|
hmm_clustermap_feature_layers:
|
|
244
250
|
- all_accessible_features
|
|
@@ -246,7 +252,9 @@ hmm_clustermap_feature_layers:
|
|
|
246
252
|
- small_accessible_patch
|
|
247
253
|
- mid_accessible_patch
|
|
248
254
|
- large_accessible_patch
|
|
255
|
+
- large_accessible_patch_merged
|
|
249
256
|
- nucleosome_depleted_region
|
|
257
|
+
- nucleosome_depleted_region_merged
|
|
250
258
|
- small_bound_stretch
|
|
251
259
|
- medium_bound_stretch
|
|
252
260
|
- putative_nucleosome
|
|
@@ -365,4 +373,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
|
|
|
365
373
|
bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
|
|
366
374
|
force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
|
|
367
375
|
bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
|
|
368
|
-
force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
|
|
376
|
+
force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
|
smftools/config/direct.yaml
CHANGED
|
@@ -27,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
|
|
|
27
27
|
|
|
28
28
|
######## smftools preprocess params ########
|
|
29
29
|
fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
|
|
30
|
-
binarize_on_fixed_methlyation_threshold: 0.
|
|
30
|
+
binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
31
31
|
positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
|
|
32
32
|
negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
|
|
33
|
-
infer_on_percentile_sample_methylation_fitting:
|
|
33
|
+
infer_on_percentile_sample_methylation_fitting: 5 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
34
34
|
inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
35
35
|
fit_j_threshold: 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
|
|
36
36
|
output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
|
|
@@ -39,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
|
|
|
39
39
|
autocorr_site_types:
|
|
40
40
|
- "A"
|
|
41
41
|
|
|
42
|
+
spatial_clustermap_sortby: "a"
|
|
43
|
+
|
|
42
44
|
######## smftools hmm params #########
|
|
43
45
|
hmm_methbases:
|
|
44
|
-
- "A"
|
|
46
|
+
- "A"
|
|
47
|
+
|
|
48
|
+
hmm_merge_layer_features:
|
|
49
|
+
- ["A_all_accessible_features", 60]
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Dict, List,
|
|
4
|
+
from typing import Any, Dict, List, Union
|
|
5
|
+
|
|
6
|
+
from smftools.constants import BAM_SUFFIX
|
|
7
|
+
|
|
5
8
|
|
|
6
9
|
def discover_input_files(
|
|
7
10
|
input_data_path: Union[str, Path],
|
|
8
|
-
bam_suffix: str =
|
|
11
|
+
bam_suffix: str = BAM_SUFFIX,
|
|
9
12
|
recursive: bool = False,
|
|
10
13
|
follow_symlinks: bool = False,
|
|
11
14
|
) -> Dict[str, Any]:
|
|
@@ -30,10 +33,21 @@ def discover_input_files(
|
|
|
30
33
|
bam_suffix = bam_suffix.lower()
|
|
31
34
|
|
|
32
35
|
# Sets of canonical extension keys we’ll compare against
|
|
33
|
-
pod5_exts
|
|
36
|
+
pod5_exts = {".pod5", ".p5"}
|
|
34
37
|
fast5_exts = {".fast5", ".f5"}
|
|
35
|
-
fastq_exts = {
|
|
36
|
-
|
|
38
|
+
fastq_exts = {
|
|
39
|
+
".fastq",
|
|
40
|
+
".fq",
|
|
41
|
+
".fastq.gz",
|
|
42
|
+
".fq.gz",
|
|
43
|
+
".fastq.bz2",
|
|
44
|
+
".fq.bz2",
|
|
45
|
+
".fastq.xz",
|
|
46
|
+
".fq.xz",
|
|
47
|
+
".fastq.zst",
|
|
48
|
+
".fq.zst",
|
|
49
|
+
}
|
|
50
|
+
h5ad_exts = {".h5ad", ".h5"}
|
|
37
51
|
compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
|
|
38
52
|
|
|
39
53
|
def ext_key(pp: Path) -> str:
|