smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py
CHANGED
|
@@ -1,20 +1,38 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Sequence
|
|
4
|
+
|
|
1
5
|
import click
|
|
2
6
|
import pandas as pd
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Dict, Optional, Sequence
|
|
5
7
|
|
|
8
|
+
from .cli.hmm_adata import hmm_adata
|
|
6
9
|
from .cli.load_adata import load_adata
|
|
7
|
-
from .cli.cli_flows import flow_I
|
|
8
10
|
from .cli.preprocess_adata import preprocess_adata
|
|
9
11
|
from .cli.spatial_adata import spatial_adata
|
|
10
|
-
from .
|
|
12
|
+
from .informatics.pod5_functions import subsample_pod5
|
|
13
|
+
from .logging_utils import setup_logging
|
|
14
|
+
from .readwrite import concatenate_h5ads
|
|
11
15
|
|
|
12
|
-
from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
|
|
13
16
|
|
|
14
17
|
@click.group()
|
|
15
|
-
|
|
18
|
+
@click.option(
|
|
19
|
+
"--log-file",
|
|
20
|
+
type=click.Path(dir_okay=False, writable=True, path_type=Path),
|
|
21
|
+
default=None,
|
|
22
|
+
help="Optional file path to write smftools logs.",
|
|
23
|
+
)
|
|
24
|
+
@click.option(
|
|
25
|
+
"--log-level",
|
|
26
|
+
type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
|
|
27
|
+
default="INFO",
|
|
28
|
+
show_default=True,
|
|
29
|
+
help="Logging level for smftools output.",
|
|
30
|
+
)
|
|
31
|
+
def cli(log_file: Path | None, log_level: str):
|
|
16
32
|
"""Command-line interface for smftools."""
|
|
17
|
-
|
|
33
|
+
level = getattr(logging, log_level.upper(), logging.INFO)
|
|
34
|
+
setup_logging(level=level, log_file=log_file)
|
|
35
|
+
|
|
18
36
|
|
|
19
37
|
####### Load anndata from raw data ###########
|
|
20
38
|
@cli.command()
|
|
@@ -22,32 +40,44 @@ def cli():
|
|
|
22
40
|
def load(config_path):
|
|
23
41
|
"""Load and process data from CONFIG_PATH."""
|
|
24
42
|
load_adata(config_path)
|
|
43
|
+
|
|
44
|
+
|
|
25
45
|
##########################################
|
|
26
46
|
|
|
47
|
+
|
|
27
48
|
####### Preprocessing ###########
|
|
28
49
|
@cli.command()
|
|
29
50
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
30
51
|
def preprocess(config_path):
|
|
31
52
|
"""Preprocess data from CONFIG_PATH."""
|
|
32
53
|
preprocess_adata(config_path)
|
|
54
|
+
|
|
55
|
+
|
|
33
56
|
##########################################
|
|
34
57
|
|
|
58
|
+
|
|
35
59
|
####### Spatial ###########
|
|
36
60
|
@cli.command()
|
|
37
61
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
38
62
|
def spatial(config_path):
|
|
39
63
|
"""Process data from CONFIG_PATH."""
|
|
40
64
|
spatial_adata(config_path)
|
|
65
|
+
|
|
66
|
+
|
|
41
67
|
##########################################
|
|
42
68
|
|
|
69
|
+
|
|
43
70
|
####### HMM ###########
|
|
44
71
|
@cli.command()
|
|
45
72
|
@click.argument("config_path", type=click.Path(exists=True))
|
|
46
73
|
def hmm(config_path):
|
|
47
74
|
"""Process data from CONFIG_PATH."""
|
|
48
75
|
hmm_adata(config_path)
|
|
76
|
+
|
|
77
|
+
|
|
49
78
|
##########################################
|
|
50
79
|
|
|
80
|
+
|
|
51
81
|
####### batch command ###########
|
|
52
82
|
@cli.command()
|
|
53
83
|
@click.argument(
|
|
@@ -126,7 +156,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
126
156
|
dtype=str,
|
|
127
157
|
)
|
|
128
158
|
except Exception as e:
|
|
129
|
-
raise click.ClickException(
|
|
159
|
+
raise click.ClickException(
|
|
160
|
+
f"Failed to read {config_table} as headerless list: {e}"
|
|
161
|
+
) from e
|
|
130
162
|
|
|
131
163
|
config_series = df[column]
|
|
132
164
|
else:
|
|
@@ -137,12 +169,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
137
169
|
)
|
|
138
170
|
config_series = df[column]
|
|
139
171
|
|
|
140
|
-
config_paths = (
|
|
141
|
-
config_series.dropna()
|
|
142
|
-
.map(str)
|
|
143
|
-
.map(lambda p: Path(p).expanduser())
|
|
144
|
-
.tolist()
|
|
145
|
-
)
|
|
172
|
+
config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
|
|
146
173
|
|
|
147
174
|
# ----------------------------
|
|
148
175
|
# Validate config paths
|
|
@@ -163,9 +190,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
163
190
|
|
|
164
191
|
func = task_funcs[task]
|
|
165
192
|
|
|
166
|
-
click.echo(
|
|
167
|
-
f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
|
|
168
|
-
)
|
|
193
|
+
click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
|
|
169
194
|
|
|
170
195
|
# ----------------------------
|
|
171
196
|
# Loop over paths
|
|
@@ -178,13 +203,16 @@ def batch(task, config_table: Path, column: str, sep: str | None):
|
|
|
178
203
|
click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
|
|
179
204
|
|
|
180
205
|
try:
|
|
181
|
-
func(str(cfg))
|
|
206
|
+
func(str(cfg)) # underlying functions take a string path
|
|
182
207
|
except Exception as e:
|
|
183
208
|
click.echo(f" ERROR on {cfg}: {e}")
|
|
184
209
|
|
|
185
210
|
click.echo("Batch processing complete.")
|
|
211
|
+
|
|
212
|
+
|
|
186
213
|
##########################################
|
|
187
214
|
|
|
215
|
+
|
|
188
216
|
####### concatenate command ###########
|
|
189
217
|
@cli.command("concatenate")
|
|
190
218
|
@click.argument(
|
|
@@ -244,9 +272,9 @@ def concatenate_cmd(
|
|
|
244
272
|
|
|
245
273
|
Two modes:
|
|
246
274
|
|
|
247
|
-
smftools concatenate out.h5ad --input-dir ./dir
|
|
275
|
+
smftools concatenate out.h5ad.gz --input-dir ./dir
|
|
248
276
|
|
|
249
|
-
smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
|
|
277
|
+
smftools concatenate out.h5ad.gz --csv-path paths.csv --csv-column h5ad_path
|
|
250
278
|
|
|
251
279
|
TXT input also works (one file path per line).
|
|
252
280
|
|
|
@@ -266,170 +294,57 @@ def concatenate_cmd(
|
|
|
266
294
|
delete_inputs=delete,
|
|
267
295
|
restore_backups=restore,
|
|
268
296
|
)
|
|
269
|
-
click.echo(f"
|
|
297
|
+
click.echo(f"Concatenated file written to: {out}")
|
|
270
298
|
|
|
271
299
|
except Exception as e:
|
|
272
300
|
raise click.ClickException(str(e)) from e
|
|
301
|
+
|
|
302
|
+
|
|
273
303
|
##########################################
|
|
274
304
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
#
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
# raise click.ClickException("Config CSV contains only a header row.")
|
|
322
|
-
|
|
323
|
-
# # Build dict; last occurrence of a key wins
|
|
324
|
-
# cfg = {}
|
|
325
|
-
# for k, v in zip(df["key"], df["value"]):
|
|
326
|
-
# cfg[k] = v
|
|
327
|
-
|
|
328
|
-
# # Validate required keys
|
|
329
|
-
# missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
|
|
330
|
-
# if missing:
|
|
331
|
-
# raise click.ClickException(
|
|
332
|
-
# "Missing required keys in CSV: "
|
|
333
|
-
# + ", ".join(missing)
|
|
334
|
-
# + "\nExpected keys:\n - "
|
|
335
|
-
# + "\n - ".join(REQUIRED_KEYS)
|
|
336
|
-
# + "\nOptional keys:\n - "
|
|
337
|
-
# + "\n - ".join(OPTIONAL_KEYS)
|
|
338
|
-
# )
|
|
339
|
-
|
|
340
|
-
# return cfg
|
|
341
|
-
|
|
342
|
-
# def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
|
|
343
|
-
# """Decide on the output .h5ad path based on CSV; create directories if needed."""
|
|
344
|
-
# merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
|
|
345
|
-
# if not merged_filename.endswith(".h5ad"):
|
|
346
|
-
# merged_filename += ".h5ad"
|
|
347
|
-
|
|
348
|
-
# output_path_raw = cfg.get("output_path", "").strip()
|
|
349
|
-
|
|
350
|
-
# if not output_path_raw:
|
|
351
|
-
# out_dir = Path.cwd() / "merged_output"
|
|
352
|
-
# out_dir.mkdir(parents=True, exist_ok=True)
|
|
353
|
-
# return out_dir / merged_filename
|
|
354
|
-
|
|
355
|
-
# output_path = Path(output_path_raw)
|
|
356
|
-
|
|
357
|
-
# if output_path.suffix.lower() == ".h5ad":
|
|
358
|
-
# output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
359
|
-
# return output_path
|
|
360
|
-
|
|
361
|
-
# # Treat as directory
|
|
362
|
-
# output_path.mkdir(parents=True, exist_ok=True)
|
|
363
|
-
# return output_path / merged_filename
|
|
364
|
-
|
|
365
|
-
# def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
|
|
366
|
-
|
|
367
|
-
# if backups:
|
|
368
|
-
# click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
|
|
369
|
-
# return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
|
|
370
|
-
# else:
|
|
371
|
-
# click.echo(f"Loading {label} from {primary} with backups disabled ...")
|
|
372
|
-
# return safe_read_h5ad(primary, restore_backups=False)
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
# @cli.command()
|
|
376
|
-
# @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
|
|
377
|
-
# def merge_barcoded_anndatas(config_path: Path):
|
|
378
|
-
# """
|
|
379
|
-
# Merge two AnnData objects from the same experiment that were demultiplexed
|
|
380
|
-
# under different end-barcoding requirements, using a 1-row CSV for config.
|
|
381
|
-
|
|
382
|
-
# CSV must include:
|
|
383
|
-
# - adata_single_path
|
|
384
|
-
# - adata_double_path
|
|
385
|
-
|
|
386
|
-
# Optional columns:
|
|
387
|
-
# - adata_single_backups_path
|
|
388
|
-
# - adata_double_backups_path
|
|
389
|
-
# - output_path (file or directory; default: ./merged_output/)
|
|
390
|
-
# - merged_filename (default: merged_<single>__<double>.h5ad)
|
|
391
|
-
|
|
392
|
-
# Example CSV:
|
|
393
|
-
|
|
394
|
-
# adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
|
|
395
|
-
# /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
|
|
396
|
-
# """
|
|
397
|
-
# try:
|
|
398
|
-
# cfg = _read_config_csv(config_path)
|
|
399
|
-
|
|
400
|
-
# single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
|
|
401
|
-
# double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
|
|
402
|
-
|
|
403
|
-
# for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
|
|
404
|
-
# if not p.exists():
|
|
405
|
-
# raise click.ClickException(f"{label} does not exist: {p}")
|
|
406
|
-
|
|
407
|
-
# single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
|
|
408
|
-
# double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
|
|
409
|
-
|
|
410
|
-
# if single_backups and not single_backups.exists():
|
|
411
|
-
# raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
|
|
412
|
-
# if double_backups and not double_backups.exists():
|
|
413
|
-
# raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
|
|
414
|
-
|
|
415
|
-
# output_path = _resolve_output_path(cfg, single_path, double_path)
|
|
416
|
-
|
|
417
|
-
# # Load
|
|
418
|
-
# adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
|
|
419
|
-
# adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
|
|
420
|
-
|
|
421
|
-
# click.echo("Merging AnnDatas ...")
|
|
422
|
-
# merged = merge_barcoded_anndatas_core(adata_single, adata_double)
|
|
423
|
-
|
|
424
|
-
# click.echo(f"Writing merged AnnData to: {output_path}")
|
|
425
|
-
# backup_dir = output_path.cwd() / "merged_backups"
|
|
426
|
-
# safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
|
|
427
|
-
|
|
428
|
-
# click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
|
|
429
|
-
|
|
430
|
-
# except click.ClickException:
|
|
431
|
-
# raise
|
|
432
|
-
# except Exception as e:
|
|
433
|
-
# # Surface unexpected errors cleanly
|
|
434
|
-
# raise click.ClickException(f"Unexpected error: {e}") from e
|
|
435
|
-
################################################################################################################
|
|
305
|
+
|
|
306
|
+
####### subsample pod5 command ###########
|
|
307
|
+
@cli.command("subsample-pod5")
|
|
308
|
+
@click.argument(
|
|
309
|
+
"pod5_path",
|
|
310
|
+
type=click.Path(exists=True, path_type=Path),
|
|
311
|
+
)
|
|
312
|
+
@click.option(
|
|
313
|
+
"--read-names",
|
|
314
|
+
"-r",
|
|
315
|
+
type=click.Path(exists=True, path_type=Path),
|
|
316
|
+
default=None,
|
|
317
|
+
help="Text file with one read_id per line.",
|
|
318
|
+
)
|
|
319
|
+
@click.option(
|
|
320
|
+
"--n-reads",
|
|
321
|
+
"-n",
|
|
322
|
+
type=int,
|
|
323
|
+
default=None,
|
|
324
|
+
help="Randomly subsample N reads.",
|
|
325
|
+
)
|
|
326
|
+
@click.option(
|
|
327
|
+
"--outdir",
|
|
328
|
+
"-o",
|
|
329
|
+
type=click.Path(path_type=Path, file_okay=False),
|
|
330
|
+
required=True,
|
|
331
|
+
help="Output directory for subsampled POD5.",
|
|
332
|
+
)
|
|
333
|
+
def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
|
|
334
|
+
"""
|
|
335
|
+
Subsample POD5 file(s) by read ID list or random sampling.
|
|
336
|
+
"""
|
|
337
|
+
|
|
338
|
+
# --- Validate mutually exclusive options ---
|
|
339
|
+
if (read_names is None and n_reads is None) or (read_names and n_reads):
|
|
340
|
+
raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
|
|
341
|
+
|
|
342
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
|
343
|
+
|
|
344
|
+
subsample_arg = str(read_names) if read_names else n_reads
|
|
345
|
+
|
|
346
|
+
subsample_pod5(
|
|
347
|
+
pod5_path=str(pod5_path),
|
|
348
|
+
read_name_path=subsample_arg,
|
|
349
|
+
output_directory=str(outdir),
|
|
350
|
+
)
|
smftools/config/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .experiment_config import
|
|
1
|
+
from .experiment_config import ExperimentConfig, LoadExperimentConfig
|
smftools/config/conversion.yaml
CHANGED
|
@@ -9,6 +9,17 @@ conversion_types:
|
|
|
9
9
|
# Read QC Params
|
|
10
10
|
read_mod_filtering_use_other_c_as_background: True
|
|
11
11
|
|
|
12
|
+
# Spatial Analysis - Autocorr params
|
|
13
|
+
autocorr_site_types:
|
|
14
|
+
- "GpC"
|
|
15
|
+
|
|
16
|
+
# Spatial Analysis - Clustermap params
|
|
17
|
+
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
18
|
+
clustermap_cmap_c: "coolwarm"
|
|
19
|
+
clustermap_cmap_gpc: "coolwarm"
|
|
20
|
+
clustermap_cmap_cpg: "viridis"
|
|
21
|
+
clustermap_cmap_a: "coolwarm"
|
|
22
|
+
|
|
12
23
|
######## smftools hmm params #########
|
|
13
24
|
# HMM
|
|
14
25
|
cpg: True # whether to use the default HMM endogenous CpG patch params
|
|
@@ -18,21 +29,21 @@ hmm_feature_sets:
|
|
|
18
29
|
footprint:
|
|
19
30
|
state: "Non-Modified"
|
|
20
31
|
features:
|
|
21
|
-
small_bound_stretch: [
|
|
22
|
-
medium_bound_stretch: [
|
|
23
|
-
putative_nucleosome: [
|
|
32
|
+
small_bound_stretch: [6, 40]
|
|
33
|
+
medium_bound_stretch: [40, 100]
|
|
34
|
+
putative_nucleosome: [100, 200]
|
|
24
35
|
large_bound_stretch: [200, inf]
|
|
25
36
|
accessible:
|
|
26
37
|
state: "Modified"
|
|
27
38
|
features:
|
|
28
39
|
small_accessible_patch: [3, 20]
|
|
29
40
|
mid_accessible_patch: [20, 40]
|
|
30
|
-
|
|
31
|
-
|
|
41
|
+
large_accessible_patch: [40, 110]
|
|
42
|
+
nucleosome_depleted_region: [110, inf]
|
|
32
43
|
cpg:
|
|
33
44
|
state: "Modified"
|
|
34
45
|
features:
|
|
35
46
|
cpg_patch: [0, inf]
|
|
36
47
|
|
|
37
48
|
hmm_merge_layer_features:
|
|
38
|
-
- ["
|
|
49
|
+
- ["all_accessible_features", 60]
|
smftools/config/deaminase.yaml
CHANGED
|
@@ -7,6 +7,8 @@ conversion_types:
|
|
|
7
7
|
|
|
8
8
|
mod_target_bases:
|
|
9
9
|
- "C"
|
|
10
|
+
enzyme_target_bases:
|
|
11
|
+
- "C"
|
|
10
12
|
|
|
11
13
|
######## smftools preprocess params #########
|
|
12
14
|
read_mod_filtering_gpc_thresholds:
|
|
@@ -15,7 +17,7 @@ read_mod_filtering_gpc_thresholds:
|
|
|
15
17
|
read_mod_filtering_cpg_thresholds:
|
|
16
18
|
- null
|
|
17
19
|
- null
|
|
18
|
-
|
|
20
|
+
read_mod_filtering_c_thresholds:
|
|
19
21
|
- 0.01
|
|
20
22
|
- 0.99
|
|
21
23
|
read_mod_filtering_a_thresholds:
|
|
@@ -26,16 +28,16 @@ read_mod_filtering_use_other_c_as_background: False
|
|
|
26
28
|
|
|
27
29
|
# Duplicate Detection Params
|
|
28
30
|
duplicate_detection_site_types:
|
|
29
|
-
- "
|
|
31
|
+
- "C"
|
|
30
32
|
|
|
31
33
|
######## smftools analyze params #########
|
|
32
34
|
# Autocorrelation params
|
|
33
35
|
autocorr_site_types:
|
|
34
|
-
- "
|
|
36
|
+
- "C"
|
|
35
37
|
|
|
36
38
|
# Correlation matrix params
|
|
37
39
|
correlation_matrix_site_types:
|
|
38
|
-
- "
|
|
40
|
+
- "C_site"
|
|
39
41
|
|
|
40
42
|
# ######## smftools hmm params #########
|
|
41
43
|
cpg: False # whether to use the default HMM endogenous CpG patch params
|
|
@@ -45,17 +47,17 @@ hmm_feature_sets:
|
|
|
45
47
|
footprint:
|
|
46
48
|
state: "Non-Modified"
|
|
47
49
|
features:
|
|
48
|
-
small_bound_stretch: [
|
|
49
|
-
medium_bound_stretch: [
|
|
50
|
-
putative_nucleosome: [
|
|
50
|
+
small_bound_stretch: [6, 40]
|
|
51
|
+
medium_bound_stretch: [40, 100]
|
|
52
|
+
putative_nucleosome: [100, 200]
|
|
51
53
|
large_bound_stretch: [200, inf]
|
|
52
54
|
accessible:
|
|
53
55
|
state: "Modified"
|
|
54
56
|
features:
|
|
55
57
|
small_accessible_patch: [3, 20]
|
|
56
58
|
mid_accessible_patch: [20, 40]
|
|
57
|
-
|
|
58
|
-
|
|
59
|
+
large_accessible_patch: [40, 110]
|
|
60
|
+
nucleosome_depleted_region: [110, inf]
|
|
59
61
|
|
|
60
62
|
hmm_merge_layer_features:
|
|
61
|
-
- ["
|
|
63
|
+
- ["all_accessible_features", 60]
|