smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py
ADDED
@@ -0,0 +1,435 @@
+import click
+import pandas as pd
+from pathlib import Path
+from typing import Dict, Optional, Sequence
+
+from .cli.load_adata import load_adata
+from .cli.cli_flows import flow_I
+from .cli.preprocess_adata import preprocess_adata
+from .cli.spatial_adata import spatial_adata
+from .cli.hmm_adata import hmm_adata
+
+from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
+
+@click.group()
+def cli():
+    """Command-line interface for smftools."""
+    pass
+
+####### Load anndata from raw data ###########
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True))
+def load(config_path):
+    """Load and process data from CONFIG_PATH."""
+    load_adata(config_path)
+##########################################
+
+####### Preprocessing ###########
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True))
+def preprocess(config_path):
+    """Preprocess data from CONFIG_PATH."""
+    preprocess_adata(config_path)
+##########################################
+
+####### Spatial ###########
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True))
+def spatial(config_path):
+    """Run spatial analysis on data from CONFIG_PATH."""
+    spatial_adata(config_path)
+##########################################
+
+####### HMM ###########
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True))
+def hmm(config_path):
+    """Run HMM analysis on data from CONFIG_PATH."""
+    hmm_adata(config_path)
+##########################################
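The four single-config subcommands above share one shape: a CONFIG_PATH argument declared with click.Path(exists=True), so Click validates the path before the command body runs. A minimal sketch (not shipped with the package; the file name is made up) of that behavior under Click's test runner:

# Minimal sketch, not part of smftools: because CONFIG_PATH uses
# click.Path(exists=True), Click rejects a missing file with a usage error
# (exit code 2) before load_adata() is ever called.
from click.testing import CliRunner

from smftools.cli_entry import cli

runner = CliRunner()
result = runner.invoke(cli, ["load", "missing_config.yaml"])  # hypothetical path
assert result.exit_code == 2  # usage error from the exists=True check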
+
+####### batch command ###########
+@cli.command()
+@click.argument(
+    "task",
+    type=click.Choice(["load", "preprocess", "spatial", "hmm"], case_sensitive=False),
+)
+@click.argument(
+    "config_table",
+    type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
+)
+@click.option(
+    "--column",
+    "-c",
+    default="config_path",
+    show_default=True,
+    help="Column name containing config paths (ignored for plain TXT).",
+)
+@click.option(
+    "--sep",
+    default=None,
+    help="Field separator: default auto-detect (.tsv -> '\\t', .csv -> ',', others treated as TXT).",
+)
+def batch(task, config_table: Path, column: str, sep: str | None):
+    """
+    Run a TASK (load, preprocess, spatial, hmm) on multiple CONFIG_PATHs
+    listed in a CSV/TSV or plain TXT file.
+
+    Plain text format: one config path per line, no header.
+    """
+
+    # ----------------------------
+    # Decide file type
+    # ----------------------------
+    suffix = config_table.suffix.lower()
+
+    # TXT mode → each line is a config path
+    if suffix in {".txt", ".list"}:
+        paths = []
+        with config_table.open() as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    paths.append(Path(line).expanduser())
+
+        if not paths:
+            raise click.ClickException(f"No config paths found in text file: {config_table}")
+
+        config_paths = paths
+
+    else:
+        # CSV / TSV mode
+        # auto-detect separator if not provided
+        if sep is None:
+            if suffix in {".tsv", ".tab"}:
+                sep = "\t"
+            else:
+                sep = ","
+
+        try:
+            df = pd.read_csv(config_table, sep=sep, dtype=str)
+        except Exception as e:
+            raise click.ClickException(f"Failed to read table {config_table}: {e}") from e
+
+        if df.empty:
+            raise click.ClickException(f"Config table is empty: {config_table}")
+
+        # If table has no header or only one column, treat it as raw paths
+        if df.shape[1] == 1 and column not in df.columns:
+            # re-read as headerless single-column list, so we don't drop the first path
+            try:
+                df = pd.read_csv(
+                    config_table,
+                    sep=sep,
+                    header=None,
+                    names=[column],
+                    dtype=str,
+                )
+            except Exception as e:
+                raise click.ClickException(f"Failed to read {config_table} as headerless list: {e}") from e
+
+            config_series = df[column]
+        else:
+            if column not in df.columns:
+                raise click.ClickException(
+                    f"Column '{column}' not found in {config_table}. "
+                    f"Available columns: {', '.join(df.columns)}"
+                )
+            config_series = df[column]
+
+        config_paths = (
+            config_series.dropna()
+            .map(str)
+            .map(lambda p: Path(p).expanduser())
+            .tolist()
+        )
+
+    # ----------------------------
+    # Validate config paths
+    # ----------------------------
+    if not config_paths:
+        raise click.ClickException("No config paths found.")
+
+    # ----------------------------
+    # Map task to function
+    # ----------------------------
+    task = task.lower()
+    task_funcs = {
+        "load": load_adata,
+        "preprocess": preprocess_adata,
+        "spatial": spatial_adata,
+        "hmm": hmm_adata,
+    }
+
+    func = task_funcs[task]
+
+    click.echo(
+        f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
+    )
+
+    # ----------------------------
+    # Loop over paths
+    # ----------------------------
+    for i, cfg in enumerate(config_paths, start=1):
+        if not cfg.exists():
+            click.echo(f"[{i}/{len(config_paths)}] SKIP (missing): {cfg}")
+            continue
+
+        click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
+
+        try:
+            func(str(cfg))  # underlying functions take a string path
+        except Exception as e:
+            click.echo(f"  ERROR on {cfg}: {e}")
+
+    click.echo("Batch processing complete.")
+##########################################
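The headerless re-read in batch guards against a real pandas pitfall: with default header inference, the first path in a one-column list would be consumed as the column name. A minimal sketch (not from the package; the paths are made up) of the difference:

# Minimal sketch, not part of smftools: why batch re-reads one-column tables
# with header=None. Under pandas' default header inference, the first path in
# a headerless list is swallowed as the column header.
import io

import pandas as pd

text = "/runs/a/config.yaml\n/runs/b/config.yaml\n"  # hypothetical paths

with_header = pd.read_csv(io.StringIO(text), sep=",", dtype=str)
assert len(with_header) == 1  # the first path became the header row

headerless = pd.read_csv(io.StringIO(text), sep=",", header=None,
                         names=["config_path"], dtype=str)
assert headerless["config_path"].tolist() == [
    "/runs/a/config.yaml",
    "/runs/b/config.yaml",
]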
+
+####### concatenate command ###########
+@cli.command("concatenate")
+@click.argument(
+    "output_path",
+    type=click.Path(path_type=Path, dir_okay=False),
+)
+@click.option(
+    "--input-dir",
+    "-d",
+    type=click.Path(path_type=Path, file_okay=False),
+    default=None,
+    help="Directory containing .h5ad/.h5ad.gz files to concatenate.",
+)
+@click.option(
+    "--csv-path",
+    "-c",
+    type=click.Path(path_type=Path, dir_okay=False),
+    default=None,
+    help="CSV/TSV/TXT containing file paths of h5ad files.",
+)
+@click.option(
+    "--csv-column",
+    "-C",
+    default="h5ad_path",
+    help="Column in the CSV containing file paths (ignored for TXT).",
+    show_default=True,
+)
+@click.option(
+    "--suffix",
+    "-s",
+    multiple=True,
+    default=[".h5ad", ".h5ad.gz"],
+    help="Allowed file suffixes (repeatable).",
+    show_default=True,
+)
+@click.option(
+    "--delete",
+    is_flag=True,
+    help="Delete input .h5ad files after concatenation.",
+)
+@click.option(
+    "--restore",
+    is_flag=True,
+    help="Restore .h5ad backups during reading.",
+)
+def concatenate_cmd(
+    output_path: Path,
+    input_dir: Path | None,
+    csv_path: Path | None,
+    csv_column: str,
+    suffix: Sequence[str],
+    delete: bool,
+    restore: bool,
+):
+    """
+    Concatenate multiple .h5ad files into a single output file.
+
+    Two modes:
+
+        smftools concatenate out.h5ad --input-dir ./dir
+
+        smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
+
+    TXT input also works (one file path per line).
+
+    Uses safe_read_h5ad() and safe_write_h5ad().
+    """
+
+    if input_dir and csv_path:
+        raise click.ClickException("Provide only ONE of --input-dir or --csv-path.")
+
+    try:
+        out = concatenate_h5ads(
+            output_path=output_path,
+            input_dir=input_dir,
+            csv_path=csv_path,
+            csv_column=csv_column,
+            file_suffixes=tuple(suffix),
+            delete_inputs=delete,
+            restore_backups=restore,
+        )
+        click.echo(f"✓ Concatenated file written to: {out}")
+
+    except Exception as e:
+        raise click.ClickException(str(e)) from e
+##########################################
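Note that --input-dir and --csv-path are mutually exclusive, but the check lives in the command body rather than in Click's option declarations. A minimal sketch (not part of the package; the file names are made up) of how that surfaces:

# Minimal sketch, not part of smftools: passing both source options trips the
# explicit guard inside concatenate_cmd, which raises a ClickException
# (exit code 1) rather than a Click usage error (exit code 2).
from click.testing import CliRunner

from smftools.cli_entry import cli

runner = CliRunner()
result = runner.invoke(cli, [
    "concatenate", "merged.h5ad",
    "--input-dir", "h5ads",     # hypothetical directory
    "--csv-path", "paths.csv",  # hypothetical CSV
])
assert result.exit_code == 1  # "Provide only ONE of --input-dir or --csv-path."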
+
+####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
+# REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
+# OPTIONAL_KEYS = (
+#     "adata_single_backups_path",
+#     "adata_double_backups_path",
+#     "output_path",
+#     "merged_filename",
+# )
+
+# def _read_config_csv(csv_path: Path) -> Dict[str, str]:
+#     """
+#     Read a multi-row, two-column CSV of key,value pairs into a dict.
+
+#     Supported features:
+#     - Optional header ("key,value") or none.
+#     - Comments starting with '#' and blank lines are ignored.
+#     - If duplicate keys occur, the last one wins.
+#     - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
+#     """
+#     try:
+#         # Read as two columns regardless of header; comments ignored.
+#         df = pd.read_csv(
+#             csv_path,
+#             dtype=str,
+#             comment="#",
+#             header=None,  # treat everything as rows; we'll normalize below
+#             usecols=[0, 1],
+#             names=["key", "value"]
+#         )
+#     except Exception as e:
+#         raise click.ClickException(f"Failed to read CSV: {e}") from e
+
+#     # Drop completely empty rows
+#     df = df.fillna("").astype(str)
+#     df["key"] = df["key"].str.strip()
+#     df["value"] = df["value"].str.strip()
+#     df = df[(df["key"] != "") & (df["key"].notna())]
+
+#     if df.empty:
+#         raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
+
+#     # Remove an optional header row if present
+#     if df.iloc[0]["key"].lower() in {"key", "keys"}:
+#         df = df.iloc[1:]
+#         df = df[(df["key"] != "") & (df["key"].notna())]
+#         if df.empty:
+#             raise click.ClickException("Config CSV contains only a header row.")
+
+#     # Build dict; last occurrence of a key wins
+#     cfg = {}
+#     for k, v in zip(df["key"], df["value"]):
+#         cfg[k] = v
+
+#     # Validate required keys
+#     missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
+#     if missing:
+#         raise click.ClickException(
+#             "Missing required keys in CSV: "
+#             + ", ".join(missing)
+#             + "\nExpected keys:\n - "
+#             + "\n - ".join(REQUIRED_KEYS)
+#             + "\nOptional keys:\n - "
+#             + "\n - ".join(OPTIONAL_KEYS)
+#         )
+
+#     return cfg
+
+# def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
+#     """Decide on the output .h5ad path based on CSV; create directories if needed."""
+#     merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
+#     if not merged_filename.endswith(".h5ad"):
+#         merged_filename += ".h5ad"
+
+#     output_path_raw = cfg.get("output_path", "").strip()
+
+#     if not output_path_raw:
+#         out_dir = Path.cwd() / "merged_output"
+#         out_dir.mkdir(parents=True, exist_ok=True)
+#         return out_dir / merged_filename
+
+#     output_path = Path(output_path_raw)
+
+#     if output_path.suffix.lower() == ".h5ad":
+#         output_path.parent.mkdir(parents=True, exist_ok=True)
+#         return output_path
+
+#     # Treat as directory
+#     output_path.mkdir(parents=True, exist_ok=True)
+#     return output_path / merged_filename
+
+# def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
+
+#     if backups:
+#         click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
+#         return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
+#     else:
+#         click.echo(f"Loading {label} from {primary} with backups disabled ...")
+#         return safe_read_h5ad(primary, restore_backups=False)
+
+
+# @cli.command()
+# @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
+# def merge_barcoded_anndatas(config_path: Path):
+#     """
+#     Merge two AnnData objects from the same experiment that were demultiplexed
+#     under different end-barcoding requirements, using a 1-row CSV for config.
+
+#     CSV must include:
+#     - adata_single_path
+#     - adata_double_path
+
+#     Optional columns:
+#     - adata_single_backups_path
+#     - adata_double_backups_path
+#     - output_path (file or directory; default: ./merged_output/)
+#     - merged_filename (default: merged_<single>__<double>.h5ad)
+
+#     Example CSV:
+
+#     adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
+#     /path/single.h5ad,/path/double.h5ad,,,merged_output,merged_run.h5ad
+#     """
+#     try:
+#         cfg = _read_config_csv(config_path)
+
+#         single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
+#         double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
+
+#         for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
+#             if not p.exists():
+#                 raise click.ClickException(f"{label} does not exist: {p}")
+
+#         single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
+#         double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
+
+#         if single_backups and not single_backups.exists():
+#             raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
+#         if double_backups and not double_backups.exists():
+#             raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
+
+#         output_path = _resolve_output_path(cfg, single_path, double_path)
+
+#         # Load
+#         adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
+#         adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
+
+#         click.echo("Merging AnnDatas ...")
+#         merged = merge_barcoded_anndatas_core(adata_single, adata_double)
+
+#         click.echo(f"Writing merged AnnData to: {output_path}")
+#         backup_dir = output_path.cwd() / "merged_backups"
+#         safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
+
+#         click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
+
+#     except click.ClickException:
+#         raise
+#     except Exception as e:
+#         # Surface unexpected errors cleanly
+#         raise click.ClickException(f"Unexpected error: {e}") from e
+################################################################################################################
smftools/config/conversion.yaml
CHANGED
@@ -1,11 +1,15 @@
 # Conversion (Bisulfite/APOBEC) footprinting defaults
 extends: default
+
+######## smftools load params #########
 conversion_types:
 - '5mC' # 5mC

+######## smftools preprocess params #########
 # Read QC Params
 read_mod_filtering_use_other_c_as_background: True

+######## smftools hmm params #########
 # HMM
 cpg: True # whether to use the default HMM endogenous CpG patch params
 hmm_methbases:
@@ -14,16 +18,17 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [
-      medium_bound_stretch: [
-      putative_nucleosome: [
+      small_bound_stretch: [10, 30]
+      medium_bound_stretch: [30, 110]
+      putative_nucleosome: [110, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
-      small_accessible_patch: [
-      mid_accessible_patch: [20,
-
+      small_accessible_patch: [3, 20]
+      mid_accessible_patch: [20, 40]
+      mid_large_accessible_patch: [40, 130]
+      large_accessible_patch: [130, inf]
   cpg:
     state: "Modified"
     features:
smftools/config/deaminase.yaml
CHANGED
@@ -1,11 +1,14 @@
 # Deaminase footprinting defaults
 extends: default
+
+######## smftools load params #########
 conversion_types:
 - '5mC' # 5mC

 mod_target_bases:
 - "C"

+######## smftools preprocess params #########
 read_mod_filtering_gpc_thresholds:
 - null
 - null
@@ -25,6 +28,7 @@ read_mod_filtering_use_other_c_as_background: False
 duplicate_detection_site_types:
 - "any_C"

+######## smftools analyze params #########
 # Autocorrelation params
 autocorr_site_types:
 - "any_C"
@@ -33,7 +37,7 @@ autocorr_site_types:
 correlation_matrix_site_types:
 - "any_C_site"

-#
+# ######## smftools hmm params #########
 cpg: False # whether to use the default HMM endogenous CpG patch params
 hmm_methbases:
 - "C"
@@ -41,16 +45,17 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [
-      medium_bound_stretch: [
-      putative_nucleosome: [
+      small_bound_stretch: [10, 30]
+      medium_bound_stretch: [30, 110]
+      putative_nucleosome: [110, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
-      small_accessible_patch: [
-      mid_accessible_patch: [20,
-
+      small_accessible_patch: [3, 20]
+      mid_accessible_patch: [20, 40]
+      mid_large_accessible_patch: [40, 130]
+      large_accessible_patch: [130, inf]

 hmm_merge_layer_features:
 - ["C_all_accessible_features", 80]
smftools/config/default.yaml
CHANGED
@@ -1,3 +1,13 @@
+# General
+sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
+sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
+sample_name_col_for_plotting: 'Barcode'
+
+# Compute params
+threads: 4
+device: "auto"
+
+######## smftools load params #########
 # Generic i/o
 bam_suffix: ".bam"
 recursive_input_search: True
@@ -7,16 +17,12 @@ strands:
 - top
 conversions:
 - unconverted
-sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
-sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
 fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
 fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
 input_already_demuxed: False # If the input files are already demultiplexed.
 delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
-
-#
-threads: 4
-device: "auto"
+delete_intermediate_bams: True # Whether to delete intermediate BAM files.
+delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.

 # Sequencing modality and general experiment params
 smf_modality: 'conversion' # conversion, deaminase, direct
@@ -70,11 +76,11 @@ aligner_args:
   dorado:
     ont:
     - "--mm2-opts"
-    - "-N"
-    - "5"
+    - "-N 5"

 # Sorted BAM and BED specific handling
 make_bigwigs: False # Whether to make coverage bigwigs
+make_beds: False # Whether to make beds from the aligned bams

 # Nanopore specific demultiplexing
 barcode_both_ends: False # dorado demultiplexing
@@ -85,24 +91,25 @@ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall w
 reference_column: 'Reference_strand'
 sample_column: 'Barcode'

-
+######## smftools preprocess params #########
+# Read length, quality, and mapping filtering params
 read_coord_filter:
 - null
 - null
 read_len_filter_thresholds:
--
+- 100
 - null
 read_len_to_ref_ratio_filter_thresholds:
-- 0.
+- 0.5
 - null
 read_quality_filter_thresholds:
--
+- 15
 - null
 read_mapping_quality_filter_thresholds:
 - null
 - null

-#
+# Read modification filtering params
 read_mod_filtering_gpc_thresholds:
 - 0.025
 - 0.975
@@ -116,9 +123,9 @@ read_mod_filtering_a_thresholds:
 - 0.025
 - 0.975
 read_mod_filtering_use_other_c_as_background: False
-min_valid_fraction_positions_in_read_vs_ref: 0.
+min_valid_fraction_positions_in_read_vs_ref: 0.5

-#
+# Duplicate detection params
 duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
 - "GpC"
 - "CpG"
@@ -133,11 +140,10 @@ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicog
 duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
 duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.

-#
-
-# General Plotting params
-sample_name_col_for_plotting: 'Barcode'
+# Position QC params
+position_max_nan_threshold: 0.1 # The maximum fraction of NaNs to tolerate in a column

+######## smftools analyze params #########
 # Basic Analysis - QC Plotting params
 rows_per_qc_histogram_grid: 12

@@ -169,6 +175,7 @@ correlation_matrix_cmaps:
 correlation_matrix_site_types:
 - "GpC_site"

+######## smftools hmm params #########
 # HMM params
 hmm_n_states: 2 # Number of HMM states
 hmm_init_emission_probs:
@@ -197,19 +204,23 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [
-      medium_bound_stretch: [
-      putative_nucleosome: [
+      small_bound_stretch: [10, 40]
+      medium_bound_stretch: [40, 110]
+      putative_nucleosome: [110, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
-      small_accessible_patch: [
-      mid_accessible_patch: [20,
-
+      small_accessible_patch: [3, 20]
+      mid_accessible_patch: [20, 40]
+      mid_large_accessible_patch: [40, 110]
+      large_accessible_patch: [110, inf]
 hmm_merge_layer_features:
 - [null, 80]

+# Pipeline control flow - load adata
+force_redo_load_adata: False # Whether to rerun the load adata command from the start
+
 # Pipeline control flow - Preprocessing and QC
 force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
 force_reload_sample_sheet: True # Whether to force redo sample sheet loading