smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py
CHANGED
|
@@ -1,19 +1,38 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Sequence
|
|
4
|
+
|
|
1
5
|
import click
|
|
2
6
|
import pandas as pd
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Dict, Optional, Sequence
|
|
5
7
|
|
|
8
|
+
from .cli.hmm_adata import hmm_adata
|
|
6
9
|
from .cli.load_adata import load_adata
|
|
7
10
|
from .cli.preprocess_adata import preprocess_adata
|
|
8
11
|
from .cli.spatial_adata import spatial_adata
|
|
9
|
-
from .
|
|
12
|
+
from .informatics.pod5_functions import subsample_pod5
|
|
13
|
+
from .logging_utils import setup_logging
|
|
14
|
+
from .readwrite import concatenate_h5ads
|
|
10
15
|
|
|
11
|
-
from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
|
|
12
16
|
|
|
13
17
|
@click.group()
|
|
14
|
-
|
|
18
|
+
@click.option(
|
|
19
|
+
"--log-file",
|
|
20
|
+
type=click.Path(dir_okay=False, writable=True, path_type=Path),
|
|
21
|
+
default=None,
|
|
22
|
+
help="Optional file path to write smftools logs.",
|
|
23
|
+
)
|
|
24
|
+
@click.option(
|
|
25
|
+
"--log-level",
|
|
26
|
+
type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
|
|
27
|
+
default="INFO",
|
|
28
|
+
show_default=True,
|
|
29
|
+
help="Logging level for smftools output.",
|
|
30
|
+
)
|
|
31
|
+
def cli(log_file: Path | None, log_level: str):
|
|
15
32
|
"""Command-line interface for smftools."""
|
|
16
|
-
|
|
33
|
+
level = getattr(logging, log_level.upper(), logging.INFO)
|
|
34
|
+
setup_logging(level=level, log_file=log_file)
|
|
35
|
+
|
|
17
36
|
|
|
18
37
|
####### Load anndata from raw data ###########
|
|
19
38
|
@cli.command()
|
|
@@ -21,32 +40,44 @@ def cli():
|
|
|
21
40
|
def load(config_path):
|
|
22
41
|
"""Load and process data from CONFIG_PATH."""
|
|
23
42
|
load_adata(config_path)
|
|
43
|
+
|
|
44
|
+
|
|
24
45
|
##########################################
|
|
25
46
|
|
|
47
|
+
|
|
26
48
|
####### Preprocessing ###########
|
|
27
49
|
@cli.command()
|
|
28
50
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
29
51
|
def preprocess(config_path):
|
|
30
52
|
"""Preprocess data from CONFIG_PATH."""
|
|
31
53
|
preprocess_adata(config_path)
|
|
54
|
+
|
|
55
|
+
|
|
32
56
|
##########################################
|
|
33
57
|
|
|
58
|
+
|
|
34
59
|
####### Spatial ###########
|
|
35
60
|
@cli.command()
|
|
36
61
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
37
62
|
def spatial(config_path):
|
|
38
63
|
"""Process data from CONFIG_PATH."""
|
|
39
64
|
spatial_adata(config_path)
|
|
65
|
+
|
|
66
|
+
|
|
40
67
|
##########################################
|
|
41
68
|
|
|
69
|
+
|
|
42
70
|
####### HMM ###########
|
|
43
71
|
@cli.command()
|
|
44
72
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
45
73
|
def hmm(config_path):
|
|
46
74
|
"""Process data from CONFIG_PATH."""
|
|
47
75
|
hmm_adata(config_path)
|
|
76
|
+
|
|
77
|
+
|
|
48
78
|
##########################################
|
|
49
79
|
|
|
80
|
+
|
|
50
81
|
####### batch command ###########
|
|
51
82
|
@cli.command()
|
|
52
83
|
@click.argument(
|
|
@@ -125,7 +156,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
125
156
|
dtype=str,
|
|
126
157
|
)
|
|
127
158
|
except Exception as e:
|
|
128
|
-
raise click.ClickException(
|
|
159
|
+
raise click.ClickException(
|
|
160
|
+
f"Failed to read {config_table} as headerless list: {e}"
|
|
161
|
+
) from e
|
|
129
162
|
|
|
130
163
|
config_series = df[column]
|
|
131
164
|
else:
|
|
@@ -136,12 +169,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
136
169
|
)
|
|
137
170
|
config_series = df[column]
|
|
138
171
|
|
|
139
|
-
config_paths = (
|
|
140
|
-
config_series.dropna()
|
|
141
|
-
.map(str)
|
|
142
|
-
.map(lambda p: Path(p).expanduser())
|
|
143
|
-
.tolist()
|
|
144
|
-
)
|
|
172
|
+
config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
|
|
145
173
|
|
|
146
174
|
# ----------------------------
|
|
147
175
|
# Validate config paths
|
|
@@ -162,9 +190,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
162
190
|
|
|
163
191
|
func = task_funcs[task]
|
|
164
192
|
|
|
165
|
-
click.echo(
|
|
166
|
-
f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
|
|
167
|
-
)
|
|
193
|
+
click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
|
|
168
194
|
|
|
169
195
|
# ----------------------------
|
|
170
196
|
# Loop over paths
|
|
@@ -177,13 +203,16 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
177
203
|
click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
|
|
178
204
|
|
|
179
205
|
try:
|
|
180
|
-
func(str(cfg))
|
|
206
|
+
func(str(cfg)) # underlying functions take a string path
|
|
181
207
|
except Exception as e:
|
|
182
208
|
click.echo(f" ERROR on {cfg}: {e}")
|
|
183
209
|
|
|
184
210
|
click.echo("Batch processing complete.")
|
|
211
|
+
|
|
212
|
+
|
|
185
213
|
##########################################
|
|
186
214
|
|
|
215
|
+
|
|
187
216
|
####### concatenate command ###########
|
|
188
217
|
@cli.command("concatenate")
|
|
189
218
|
@click.argument(
|
|
@@ -269,166 +298,53 @@ def concatenate_cmd(
|
|
|
269
298
|
|
|
270
299
|
except Exception as e:
|
|
271
300
|
raise click.ClickException(str(e)) from e
|
|
301
|
+
|
|
302
|
+
|
|
272
303
|
##########################################
|
|
273
304
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
#
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
# raise click.ClickException("Config CSV contains only a header row.")
|
|
321
|
-
|
|
322
|
-
# # Build dict; last occurrence of a key wins
|
|
323
|
-
# cfg = {}
|
|
324
|
-
# for k, v in zip(df["key"], df["value"]):
|
|
325
|
-
# cfg[k] = v
|
|
326
|
-
|
|
327
|
-
# # Validate required keys
|
|
328
|
-
# missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
|
|
329
|
-
# if missing:
|
|
330
|
-
# raise click.ClickException(
|
|
331
|
-
# "Missing required keys in CSV: "
|
|
332
|
-
# + ", ".join(missing)
|
|
333
|
-
# + "\nExpected keys:\n - "
|
|
334
|
-
# + "\n - ".join(REQUIRED_KEYS)
|
|
335
|
-
# + "\nOptional keys:\n - "
|
|
336
|
-
# + "\n - ".join(OPTIONAL_KEYS)
|
|
337
|
-
# )
|
|
338
|
-
|
|
339
|
-
# return cfg
|
|
340
|
-
|
|
341
|
-
# def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
|
|
342
|
-
# """Decide on the output .h5ad path based on CSV; create directories if needed."""
|
|
343
|
-
# merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
|
|
344
|
-
# if not merged_filename.endswith(".h5ad"):
|
|
345
|
-
# merged_filename += ".h5ad"
|
|
346
|
-
|
|
347
|
-
# output_path_raw = cfg.get("output_path", "").strip()
|
|
348
|
-
|
|
349
|
-
# if not output_path_raw:
|
|
350
|
-
# out_dir = Path.cwd() / "merged_output"
|
|
351
|
-
# out_dir.mkdir(parents=True, exist_ok=True)
|
|
352
|
-
# return out_dir / merged_filename
|
|
353
|
-
|
|
354
|
-
# output_path = Path(output_path_raw)
|
|
355
|
-
|
|
356
|
-
# if output_path.suffix.lower() == ".h5ad":
|
|
357
|
-
# output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
358
|
-
# return output_path
|
|
359
|
-
|
|
360
|
-
# # Treat as directory
|
|
361
|
-
# output_path.mkdir(parents=True, exist_ok=True)
|
|
362
|
-
# return output_path / merged_filename
|
|
363
|
-
|
|
364
|
-
# def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
|
|
365
|
-
|
|
366
|
-
# if backups:
|
|
367
|
-
# click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
|
|
368
|
-
# return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
|
|
369
|
-
# else:
|
|
370
|
-
# click.echo(f"Loading {label} from {primary} with backups disabled ...")
|
|
371
|
-
# return safe_read_h5ad(primary, restore_backups=False)
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
# @cli.command()
|
|
375
|
-
# @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
|
|
376
|
-
# def merge_barcoded_anndatas(config_path: Path):
|
|
377
|
-
# """
|
|
378
|
-
# Merge two AnnData objects from the same experiment that were demultiplexed
|
|
379
|
-
# under different end-barcoding requirements, using a 1-row CSV for config.
|
|
380
|
-
|
|
381
|
-
# CSV must include:
|
|
382
|
-
# - adata_single_path
|
|
383
|
-
# - adata_double_path
|
|
384
|
-
|
|
385
|
-
# Optional columns:
|
|
386
|
-
# - adata_single_backups_path
|
|
387
|
-
# - adata_double_backups_path
|
|
388
|
-
# - output_path (file or directory; default: ./merged_output/)
|
|
389
|
-
# - merged_filename (default: merged_<single>__<double>.h5ad)
|
|
390
|
-
|
|
391
|
-
# Example CSV:
|
|
392
|
-
|
|
393
|
-
# adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
|
|
394
|
-
# /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
|
|
395
|
-
# """
|
|
396
|
-
# try:
|
|
397
|
-
# cfg = _read_config_csv(config_path)
|
|
398
|
-
|
|
399
|
-
# single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
|
|
400
|
-
# double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
|
|
401
|
-
|
|
402
|
-
# for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
|
|
403
|
-
# if not p.exists():
|
|
404
|
-
# raise click.ClickException(f"{label} does not exist: {p}")
|
|
405
|
-
|
|
406
|
-
# single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
|
|
407
|
-
# double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
|
|
408
|
-
|
|
409
|
-
# if single_backups and not single_backups.exists():
|
|
410
|
-
# raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
|
|
411
|
-
# if double_backups and not double_backups.exists():
|
|
412
|
-
# raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
|
|
413
|
-
|
|
414
|
-
# output_path = _resolve_output_path(cfg, single_path, double_path)
|
|
415
|
-
|
|
416
|
-
# # Load
|
|
417
|
-
# adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
|
|
418
|
-
# adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
|
|
419
|
-
|
|
420
|
-
# click.echo("Merging AnnDatas ...")
|
|
421
|
-
# merged = merge_barcoded_anndatas_core(adata_single, adata_double)
|
|
422
|
-
|
|
423
|
-
# click.echo(f"Writing merged AnnData to: {output_path}")
|
|
424
|
-
# backup_dir = output_path.cwd() / "merged_backups"
|
|
425
|
-
# safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
|
|
426
|
-
|
|
427
|
-
# click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
|
|
428
|
-
|
|
429
|
-
# except click.ClickException:
|
|
430
|
-
# raise
|
|
431
|
-
# except Exception as e:
|
|
432
|
-
# # Surface unexpected errors cleanly
|
|
433
|
-
# raise click.ClickException(f"Unexpected error: {e}") from e
|
|
434
|
-
################################################################################################################
|
|
305
|
+
|
|
306
|
+
####### subsample pod5 command ###########
|
|
307
|
+
@cli.command("subsample-pod5")
|
|
308
|
+
@click.argument(
|
|
309
|
+
"pod5_path",
|
|
310
|
+
type=click.Path(exists=True, path_type=Path),
|
|
311
|
+
)
|
|
312
|
+
@click.option(
|
|
313
|
+
"--read-names",
|
|
314
|
+
"-r",
|
|
315
|
+
type=click.Path(exists=True, path_type=Path),
|
|
316
|
+
default=None,
|
|
317
|
+
help="Text file with one read_id per line.",
|
|
318
|
+
)
|
|
319
|
+
@click.option(
|
|
320
|
+
"--n-reads",
|
|
321
|
+
"-n",
|
|
322
|
+
type=int,
|
|
323
|
+
default=None,
|
|
324
|
+
help="Randomly subsample N reads.",
|
|
325
|
+
)
|
|
326
|
+
@click.option(
|
|
327
|
+
"--outdir",
|
|
328
|
+
"-o",
|
|
329
|
+
type=click.Path(path_type=Path, file_okay=False),
|
|
330
|
+
required=True,
|
|
331
|
+
help="Output directory for subsampled POD5.",
|
|
332
|
+
)
|
|
333
|
+
def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
|
|
334
|
+
"""
|
|
335
|
+
Subsample POD5 file(s) by read ID list or random sampling.
|
|
336
|
+
"""
|
|
337
|
+
|
|
338
|
+
# --- Validate mutually exclusive options ---
|
|
339
|
+
if (read_names is None and n_reads is None) or (read_names and n_reads):
|
|
340
|
+
raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
|
|
341
|
+
|
|
342
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
|
343
|
+
|
|
344
|
+
subsample_arg = str(read_names) if read_names else n_reads
|
|
345
|
+
|
|
346
|
+
subsample_pod5(
|
|
347
|
+
pod5_path=str(pod5_path),
|
|
348
|
+
read_name_path=subsample_arg,
|
|
349
|
+
output_directory=str(outdir),
|
|
350
|
+
)
|
smftools/config/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .experiment_config import
|
|
1
|
+
from .experiment_config import ExperimentConfig, LoadExperimentConfig
|
smftools/config/conversion.yaml
CHANGED
|
@@ -9,6 +9,10 @@ conversion_types:
|
|
|
9
9
|
# Read QC Params
|
|
10
10
|
read_mod_filtering_use_other_c_as_background: True
|
|
11
11
|
|
|
12
|
+
# Spatial Analysis - Autocorr params
|
|
13
|
+
autocorr_site_types:
|
|
14
|
+
- "GpC"
|
|
15
|
+
|
|
12
16
|
# Spatial Analysis - Clustermap params
|
|
13
17
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
14
18
|
clustermap_cmap_c: "coolwarm"
|
|
@@ -42,4 +46,4 @@ hmm_feature_sets:
|
|
|
42
46
|
cpg_patch: [0, inf]
|
|
43
47
|
|
|
44
48
|
hmm_merge_layer_features:
|
|
45
|
-
- ["
|
|
49
|
+
- ["all_accessible_features", 60]
|
smftools/config/deaminase.yaml
CHANGED
smftools/config/default.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# General
|
|
2
2
|
sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
|
|
3
|
-
sample_sheet_mapping_column: '
|
|
4
|
-
sample_name_col_for_plotting: '
|
|
3
|
+
sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
|
|
4
|
+
sample_name_col_for_plotting: 'Experiment_name_and_barcode'
|
|
5
5
|
|
|
6
6
|
# Compute params
|
|
7
7
|
threads: 4
|
|
@@ -9,9 +9,7 @@ device: "auto"
|
|
|
9
9
|
|
|
10
10
|
######## smftools load params #########
|
|
11
11
|
# Generic i/o
|
|
12
|
-
bam_suffix: ".bam"
|
|
13
12
|
recursive_input_search: True
|
|
14
|
-
split_dir: "demultiplexed_BAMs"
|
|
15
13
|
strands:
|
|
16
14
|
- bottom
|
|
17
15
|
- top
|
|
@@ -21,7 +19,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
|
|
|
21
19
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
22
20
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
23
21
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
24
|
-
delete_intermediate_bams:
|
|
22
|
+
delete_intermediate_bams: True # Whether to delete intermediate BAM files.
|
|
25
23
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
26
24
|
|
|
27
25
|
# Sequencing modality and general experiment params
|
|
@@ -53,7 +51,6 @@ aligner_args:
|
|
|
53
51
|
- '-y'
|
|
54
52
|
- '-N'
|
|
55
53
|
- '5'
|
|
56
|
-
- '--secondary=no'
|
|
57
54
|
pacbio:
|
|
58
55
|
- '-a'
|
|
59
56
|
- '-x'
|
|
@@ -63,7 +60,6 @@ aligner_args:
|
|
|
63
60
|
- '-y'
|
|
64
61
|
- '-N'
|
|
65
62
|
- '5'
|
|
66
|
-
- '--secondary=no'
|
|
67
63
|
illumina:
|
|
68
64
|
- '-a'
|
|
69
65
|
- '-x'
|
|
@@ -73,7 +69,6 @@ aligner_args:
|
|
|
73
69
|
- '-y'
|
|
74
70
|
- '-N'
|
|
75
71
|
- '5'
|
|
76
|
-
- '--secondary=no'
|
|
77
72
|
dorado:
|
|
78
73
|
ont:
|
|
79
74
|
- "--mm2-opts"
|
|
@@ -88,9 +83,9 @@ barcode_both_ends: False # dorado demultiplexing
|
|
|
88
83
|
trim: False # dorado adapter and barcode removal during demultiplexing
|
|
89
84
|
|
|
90
85
|
# Anndata structure
|
|
91
|
-
mapping_threshold: 0.
|
|
86
|
+
mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
|
|
92
87
|
reference_column: 'Reference_strand'
|
|
93
|
-
sample_column: '
|
|
88
|
+
sample_column: 'Experiment_name_and_barcode'
|
|
94
89
|
|
|
95
90
|
######## smftools preprocess params #########
|
|
96
91
|
# Read length, quality, and mapping filtering params
|
|
@@ -101,7 +96,7 @@ read_len_filter_thresholds:
|
|
|
101
96
|
- 100
|
|
102
97
|
- null
|
|
103
98
|
read_len_to_ref_ratio_filter_thresholds:
|
|
104
|
-
-
|
|
99
|
+
- null
|
|
105
100
|
- null
|
|
106
101
|
read_quality_filter_thresholds:
|
|
107
102
|
- 15
|
|
@@ -179,13 +174,12 @@ umap_layers_to_plot:
|
|
|
179
174
|
- "Raw_modification_signal"
|
|
180
175
|
|
|
181
176
|
# Spatial Analysis - Spatial Autocorrelation params
|
|
177
|
+
autocorr_normalization_method: "pearson" # options are pearson or sum
|
|
182
178
|
rows_per_qc_autocorr_grid: 6
|
|
183
179
|
autocorr_rolling_window_size: 25
|
|
184
180
|
autocorr_max_lag: 800
|
|
185
181
|
autocorr_site_types:
|
|
186
182
|
- "GpC"
|
|
187
|
-
- "CpG"
|
|
188
|
-
- "C"
|
|
189
183
|
|
|
190
184
|
# Spatial Analysis - Correlation Matrix params
|
|
191
185
|
correlation_matrix_types:
|
|
@@ -210,10 +204,19 @@ hmm_init_start_probs:
|
|
|
210
204
|
- 0.5
|
|
211
205
|
- 0.5
|
|
212
206
|
hmm_eps: 1e-8
|
|
207
|
+
# Fitting strategy
|
|
208
|
+
hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
|
|
209
|
+
hmm_shared_scope: ["reference", "methbase"]
|
|
210
|
+
hmm_groupby: ["sample", "reference", "methbase"]
|
|
211
|
+
# If hmm_fit_strategy == shared_transitions
|
|
212
|
+
hmm_adapt_emissions: true
|
|
213
|
+
hmm_adapt_startprobs: true
|
|
214
|
+
hmm_emission_adapt_iters: 5
|
|
215
|
+
hmm_emission_adapt_tol: 1.0e-4
|
|
213
216
|
hmm_dtype: "float64"
|
|
214
|
-
hmm_annotation_threshold: 0.5
|
|
215
|
-
hmm_batch_size: 1024
|
|
216
|
-
hmm_use_viterbi: False
|
|
217
|
+
hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
|
|
218
|
+
hmm_batch_size: 1024 # hmm batch size
|
|
219
|
+
hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
|
|
217
220
|
footprints: True # whether to use the default HMM footprint params
|
|
218
221
|
accessible_patches: True # whether to use the default HMM accessible patch params
|
|
219
222
|
cpg: False # whether to use the default HMM endogenous CpG patch params
|
|
@@ -238,7 +241,7 @@ hmm_feature_sets:
|
|
|
238
241
|
large_accessible_patch: [40, 110]
|
|
239
242
|
nucleosome_depleted_region: [110, inf]
|
|
240
243
|
hmm_merge_layer_features:
|
|
241
|
-
- [
|
|
244
|
+
- ["all_accessible_features", 60]
|
|
242
245
|
clustermap_cmap_hmm: "coolwarm"
|
|
243
246
|
hmm_clustermap_feature_layers:
|
|
244
247
|
- all_accessible_features
|
|
@@ -246,7 +249,9 @@ hmm_clustermap_feature_layers:
|
|
|
246
249
|
- small_accessible_patch
|
|
247
250
|
- mid_accessible_patch
|
|
248
251
|
- large_accessible_patch
|
|
252
|
+
- large_accessible_patch_merged
|
|
249
253
|
- nucleosome_depleted_region
|
|
254
|
+
- nucleosome_depleted_region_merged
|
|
250
255
|
- small_bound_stretch
|
|
251
256
|
- medium_bound_stretch
|
|
252
257
|
- putative_nucleosome
|
smftools/config/direct.yaml
CHANGED
|
@@ -27,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
|
|
|
27
27
|
|
|
28
28
|
######## smftools preprocess params ########
|
|
29
29
|
fit_position_methylation_thresholds: False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
|
|
30
|
-
binarize_on_fixed_methlyation_threshold: 0.
|
|
30
|
+
binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
31
31
|
positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
|
|
32
32
|
negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
|
|
33
|
-
infer_on_percentile_sample_methylation_fitting:
|
|
33
|
+
infer_on_percentile_sample_methylation_fitting: 5 # If positive/negative controls are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
34
34
|
inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
35
35
|
fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass QC for mod detection thresholding
|
|
36
36
|
output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
|
|
@@ -39,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
|
|
|
39
39
|
autocorr_site_types:
|
|
40
40
|
- "A"
|
|
41
41
|
|
|
42
|
+
spatial_clustermap_sortby: "a"
|
|
43
|
+
|
|
42
44
|
######## smftools hmm params #########
|
|
43
45
|
hmm_methbases:
|
|
44
|
-
- "A"
|
|
46
|
+
- "A"
|
|
47
|
+
|
|
48
|
+
hmm_merge_layer_features:
|
|
49
|
+
- ["A_all_accessible_features", 60]
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Dict, List,
|
|
4
|
+
from typing import Any, Dict, List, Union
|
|
5
|
+
|
|
6
|
+
from smftools.constants import BAM_SUFFIX
|
|
7
|
+
|
|
5
8
|
|
|
6
9
|
def discover_input_files(
|
|
7
10
|
input_data_path: Union[str, Path],
|
|
8
|
-
bam_suffix: str =
|
|
11
|
+
bam_suffix: str = BAM_SUFFIX,
|
|
9
12
|
recursive: bool = False,
|
|
10
13
|
follow_symlinks: bool = False,
|
|
11
14
|
) -> Dict[str, Any]:
|
|
@@ -30,10 +33,21 @@ def discover_input_files(
|
|
|
30
33
|
bam_suffix = bam_suffix.lower()
|
|
31
34
|
|
|
32
35
|
# Sets of canonical extension keys we’ll compare against
|
|
33
|
-
pod5_exts
|
|
36
|
+
pod5_exts = {".pod5", ".p5"}
|
|
34
37
|
fast5_exts = {".fast5", ".f5"}
|
|
35
|
-
fastq_exts = {
|
|
36
|
-
|
|
38
|
+
fastq_exts = {
|
|
39
|
+
".fastq",
|
|
40
|
+
".fq",
|
|
41
|
+
".fastq.gz",
|
|
42
|
+
".fq.gz",
|
|
43
|
+
".fastq.bz2",
|
|
44
|
+
".fq.bz2",
|
|
45
|
+
".fastq.xz",
|
|
46
|
+
".fq.xz",
|
|
47
|
+
".fastq.zst",
|
|
48
|
+
".fq.zst",
|
|
49
|
+
}
|
|
50
|
+
h5ad_exts = {".h5ad", ".h5"}
|
|
37
51
|
compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
|
|
38
52
|
|
|
39
53
|
def ext_key(pp: Path) -> str:
|