smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py ADDED
@@ -0,0 +1,435 @@
+ import click
+ import pandas as pd
+ from pathlib import Path
+ from typing import Dict, Optional, Sequence
+
+ from .cli.load_adata import load_adata
+ from .cli.cli_flows import flow_I
+ from .cli.preprocess_adata import preprocess_adata
+ from .cli.spatial_adata import spatial_adata
+ from .cli.hmm_adata import hmm_adata
+
+ from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
+
+ @click.group()
+ def cli():
+     """Command-line interface for smftools."""
+     pass
+
+ ####### Load anndata from raw data ###########
+ @cli.command()
+ @click.argument("config_path", type=click.Path(exists=True))
+ def load(config_path):
+     """Load and process data from CONFIG_PATH."""
+     load_adata(config_path)
+ ##########################################
+
+ ####### Preprocessing ###########
+ @cli.command()
+ @click.argument("config_path", type=click.Path(exists=True))
+ def preprocess(config_path):
+     """Preprocess data from CONFIG_PATH."""
+     preprocess_adata(config_path)
+ ##########################################
+
+ ####### Spatial ###########
+ @cli.command()
+ @click.argument("config_path", type=click.Path(exists=True))
+ def spatial(config_path):
+     """Run spatial analyses on data from CONFIG_PATH."""
+     spatial_adata(config_path)
+ ##########################################
+
+ ####### HMM ###########
+ @cli.command()
+ @click.argument("config_path", type=click.Path(exists=True))
+ def hmm(config_path):
+     """Run HMM analyses on data from CONFIG_PATH."""
+     hmm_adata(config_path)
+ ##########################################
+
+ ####### batch command ###########
+ @cli.command()
+ @click.argument(
+     "task",
+     type=click.Choice(["load", "preprocess", "spatial", "hmm"], case_sensitive=False),
+ )
+ @click.argument(
+     "config_table",
+     type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
+ )
+ @click.option(
+     "--column",
+     "-c",
+     default="config_path",
+     show_default=True,
+     help="Column name containing config paths (ignored for plain TXT).",
+ )
+ @click.option(
+     "--sep",
+     default=None,
+     help="Field separator: default auto-detect (.tsv -> '\\t', .csv -> ',', others treated as TXT).",
+ )
+ def batch(task, config_table: Path, column: str, sep: str | None):
+     """
+     Run a TASK (load, preprocess, spatial, hmm) on multiple CONFIG_PATHs
+     listed in a CSV/TSV or plain TXT file.
+
+     Plain text format: one config path per line, no header.
+     """
+
+     # ----------------------------
+     # Decide file type
+     # ----------------------------
+     suffix = config_table.suffix.lower()
+
+     # TXT mode → each line is a config path
+     if suffix in {".txt", ".list"}:
+         paths = []
+         with config_table.open() as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     paths.append(Path(line).expanduser())
+
+         if not paths:
+             raise click.ClickException(f"No config paths found in text file: {config_table}")
+
+         config_paths = paths
+
+     else:
+         # CSV / TSV mode
+         # auto-detect separator if not provided
+         if sep is None:
+             if suffix in {".tsv", ".tab"}:
+                 sep = "\t"
+             else:
+                 sep = ","
+
+         try:
+             df = pd.read_csv(config_table, sep=sep, dtype=str)
+         except Exception as e:
+             raise click.ClickException(f"Failed to read table {config_table}: {e}") from e
+
+         if df.empty:
+             raise click.ClickException(f"Config table is empty: {config_table}")
+
+         # If table has no header or only one column, treat it as raw paths
+         if df.shape[1] == 1 and column not in df.columns:
+             # re-read as headerless single-column list, so we don't drop the first path
+             try:
+                 df = pd.read_csv(
+                     config_table,
+                     sep=sep,
+                     header=None,
+                     names=[column],
+                     dtype=str,
+                 )
+             except Exception as e:
+                 raise click.ClickException(f"Failed to read {config_table} as headerless list: {e}") from e
+
+             config_series = df[column]
+         else:
+             if column not in df.columns:
+                 raise click.ClickException(
+                     f"Column '{column}' not found in {config_table}. "
+                     f"Available columns: {', '.join(df.columns)}"
+                 )
+             config_series = df[column]
+
+         config_paths = (
+             config_series.dropna()
+             .map(str)
+             .map(lambda p: Path(p).expanduser())
+             .tolist()
+         )
+
+     # ----------------------------
+     # Validate config paths
+     # ----------------------------
+     if not config_paths:
+         raise click.ClickException("No config paths found.")
+
+     # ----------------------------
+     # Map task to function
+     # ----------------------------
+     task = task.lower()
+     task_funcs = {
+         "load": load_adata,
+         "preprocess": preprocess_adata,
+         "spatial": spatial_adata,
+         "hmm": hmm_adata,
+     }
+
+     func = task_funcs[task]
+
+     click.echo(
+         f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
+     )
+
+     # ----------------------------
+     # Loop over paths
+     # ----------------------------
+     for i, cfg in enumerate(config_paths, start=1):
+         if not cfg.exists():
+             click.echo(f"[{i}/{len(config_paths)}] SKIP (missing): {cfg}")
+             continue
+
+         click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
+
+         try:
+             func(str(cfg))  # underlying functions take a string path
+         except Exception as e:
+             click.echo(f"  ERROR on {cfg}: {e}")
+
+     click.echo("Batch processing complete.")
+ ##########################################
+
+ ####### concatenate command ###########
+ @cli.command("concatenate")
+ @click.argument(
+     "output_path",
+     type=click.Path(path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "--input-dir",
+     "-d",
+     type=click.Path(path_type=Path, file_okay=False),
+     default=None,
+     help="Directory containing .h5ad/.h5ad.gz files to concatenate.",
+ )
+ @click.option(
+     "--csv-path",
+     "-c",
+     type=click.Path(path_type=Path, dir_okay=False),
+     default=None,
+     help="CSV/TSV/TXT containing file paths of h5ad files.",
+ )
+ @click.option(
+     "--csv-column",
+     "-C",
+     default="h5ad_path",
+     help="Column in the CSV containing file paths (ignored for TXT).",
+     show_default=True,
+ )
+ @click.option(
+     "--suffix",
+     "-s",
+     multiple=True,
+     default=[".h5ad", ".h5ad.gz"],
+     help="Allowed file suffixes (repeatable).",
+     show_default=True,
+ )
+ @click.option(
+     "--delete",
+     is_flag=True,
+     help="Delete input .h5ad files after concatenation.",
+ )
+ @click.option(
+     "--restore",
+     is_flag=True,
+     help="Restore .h5ad backups during reading.",
+ )
+ def concatenate_cmd(
+     output_path: Path,
+     input_dir: Path | None,
+     csv_path: Path | None,
+     csv_column: str,
+     suffix: Sequence[str],
+     delete: bool,
+     restore: bool,
+ ):
+     """
+     Concatenate multiple .h5ad files into a single output file.
+
+     Two modes:
+
+         smftools concatenate out.h5ad --input-dir ./dir
+
+         smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
+
+     TXT input also works (one file path per line).
+
+     Uses safe_read_h5ad() and safe_write_h5ad().
+     """
+
+     if input_dir and csv_path:
+         raise click.ClickException("Provide only ONE of --input-dir or --csv-path.")
+
+     try:
+         out = concatenate_h5ads(
+             output_path=output_path,
+             input_dir=input_dir,
+             csv_path=csv_path,
+             csv_column=csv_column,
+             file_suffixes=tuple(suffix),
+             delete_inputs=delete,
+             restore_backups=restore,
+         )
+         click.echo(f"✓ Concatenated file written to: {out}")
+
+     except Exception as e:
+         raise click.ClickException(str(e)) from e
+ ##########################################
+
+ ####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
+ # REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
+ # OPTIONAL_KEYS = (
+ #     "adata_single_backups_path",
+ #     "adata_double_backups_path",
+ #     "output_path",
+ #     "merged_filename",
+ # )
+
+ # def _read_config_csv(csv_path: Path) -> Dict[str, str]:
+ #     """
+ #     Read a multi-row, two-column CSV of key,value pairs into a dict.
+
+ #     Supported features:
+ #       - Optional header ("key,value") or none.
+ #       - Comments starting with '#' and blank lines are ignored.
+ #       - If duplicate keys occur, the last one wins.
+ #       - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
+ #     """
+ #     try:
+ #         # Read as two columns regardless of header; comments ignored.
+ #         df = pd.read_csv(
+ #             csv_path,
+ #             dtype=str,
+ #             comment="#",
+ #             header=None,  # treat everything as rows; we'll normalize below
+ #             usecols=[0, 1],
+ #             names=["key", "value"]
+ #         )
+ #     except Exception as e:
+ #         raise click.ClickException(f"Failed to read CSV: {e}") from e
+
+ #     # Drop completely empty rows
+ #     df = df.fillna("").astype(str)
+ #     df["key"] = df["key"].str.strip()
+ #     df["value"] = df["value"].str.strip()
+ #     df = df[(df["key"] != "") & (df["key"].notna())]
+
+ #     if df.empty:
+ #         raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
+
+ #     # Remove an optional header row if present
+ #     if df.iloc[0]["key"].lower() in {"key", "keys"}:
+ #         df = df.iloc[1:]
+ #         df = df[(df["key"] != "") & (df["key"].notna())]
+ #         if df.empty:
+ #             raise click.ClickException("Config CSV contains only a header row.")
+
+ #     # Build dict; last occurrence of a key wins
+ #     cfg = {}
+ #     for k, v in zip(df["key"], df["value"]):
+ #         cfg[k] = v
+
+ #     # Validate required keys
+ #     missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
+ #     if missing:
+ #         raise click.ClickException(
+ #             "Missing required keys in CSV: "
+ #             + ", ".join(missing)
+ #             + "\nExpected keys:\n - "
+ #             + "\n - ".join(REQUIRED_KEYS)
+ #             + "\nOptional keys:\n - "
+ #             + "\n - ".join(OPTIONAL_KEYS)
+ #         )
+
+ #     return cfg
+
+ # def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
+ #     """Decide on the output .h5ad path based on CSV; create directories if needed."""
+ #     merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
+ #     if not merged_filename.endswith(".h5ad"):
+ #         merged_filename += ".h5ad"
+
+ #     output_path_raw = cfg.get("output_path", "").strip()
+
+ #     if not output_path_raw:
+ #         out_dir = Path.cwd() / "merged_output"
+ #         out_dir.mkdir(parents=True, exist_ok=True)
+ #         return out_dir / merged_filename
+
+ #     output_path = Path(output_path_raw)
+
+ #     if output_path.suffix.lower() == ".h5ad":
+ #         output_path.parent.mkdir(parents=True, exist_ok=True)
+ #         return output_path
+
+ #     # Treat as directory
+ #     output_path.mkdir(parents=True, exist_ok=True)
+ #     return output_path / merged_filename
+
+ # def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
+
+ #     if backups:
+ #         click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
+ #         return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
+ #     else:
+ #         click.echo(f"Loading {label} from {primary} with backups disabled ...")
+ #         return safe_read_h5ad(primary, restore_backups=False)
+
+
+ # @cli.command()
+ # @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
+ # def merge_barcoded_anndatas(config_path: Path):
+ #     """
+ #     Merge two AnnData objects from the same experiment that were demultiplexed
+ #     under different end-barcoding requirements, using a 1-row CSV for config.
+
+ #     CSV must include:
+ #       - adata_single_path
+ #       - adata_double_path
+
+ #     Optional columns:
+ #       - adata_single_backups_path
+ #       - adata_double_backups_path
+ #       - output_path (file or directory; default: ./merged_output/)
+ #       - merged_filename (default: merged_<single>__<double>.h5ad)
+
+ #     Example CSV:
+
+ #         adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
+ #         /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
+ #     """
+ #     try:
+ #         cfg = _read_config_csv(config_path)
+
+ #         single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
+ #         double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
+
+ #         for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
+ #             if not p.exists():
+ #                 raise click.ClickException(f"{label} does not exist: {p}")
+
+ #         single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
+ #         double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
+
+ #         if single_backups and not single_backups.exists():
+ #             raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
+ #         if double_backups and not double_backups.exists():
+ #             raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
+
+ #         output_path = _resolve_output_path(cfg, single_path, double_path)
+
+ #         # Load
+ #         adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
+ #         adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
+
+ #         click.echo("Merging AnnDatas ...")
+ #         merged = merge_barcoded_anndatas_core(adata_single, adata_double)
+
+ #         click.echo(f"Writing merged AnnData to: {output_path}")
+ #         backup_dir = output_path.cwd() / "merged_backups"
+ #         safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
+
+ #         click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
+
+ #     except click.ClickException:
+ #         raise
+ #     except Exception as e:
+ #         # Surface unexpected errors cleanly
+ #         raise click.ClickException(f"Unexpected error: {e}") from e
+ ################################################################################################################
smftools/config/conversion.yaml CHANGED
@@ -1,11 +1,15 @@
  # Conversion (Bisulfite/APOBEC) footprinting defaults
  extends: default
+
+ ######## smftools load params #########
  conversion_types:
  - '5mC' # 5mC

+ ######## smftools preprocess params #########
  # Read QC Params
  read_mod_filtering_use_other_c_as_background: True

+ ######## smftools hmm params #########
  # HMM
  cpg: True # whether to use the default HMM endogenous CpG patch params
  hmm_methbases:
@@ -14,16 +18,17 @@ hmm_feature_sets:
    footprint:
      state: "Non-Modified"
      features:
-       small_bound_stretch: [0, 20]
-       medium_bound_stretch: [20, 50]
-       putative_nucleosome: [50, 200]
+       small_bound_stretch: [10, 30]
+       medium_bound_stretch: [30, 110]
+       putative_nucleosome: [110, 200]
        large_bound_stretch: [200, inf]
    accessible:
      state: "Modified"
      features:
-       small_accessible_patch: [0, 20]
-       mid_accessible_patch: [20, 80]
-       large_accessible_patch: [80, inf]
+       small_accessible_patch: [3, 20]
+       mid_accessible_patch: [20, 40]
+       mid_large_accessible_patch: [40, 130]
+       large_accessible_patch: [130, inf]
    cpg:
      state: "Modified"
      features:
smftools/config/deaminase.yaml CHANGED
@@ -1,11 +1,14 @@
  # Deaminase footprinting defaults
  extends: default
+
+ ######## smftools load params #########
  conversion_types:
  - '5mC' # 5mC

  mod_target_bases:
  - "C"

+ ######## smftools preprocess params #########
  read_mod_filtering_gpc_thresholds:
  - null
  - null
@@ -25,6 +28,7 @@ read_mod_filtering_use_other_c_as_background: False
  duplicate_detection_site_types:
  - "any_C"

+ ######## smftools analyze params #########
  # Autocorrelation params
  autocorr_site_types:
  - "any_C"
@@ -33,7 +37,7 @@ autocorr_site_types:
  correlation_matrix_site_types:
  - "any_C_site"

- # HMM
+ # ######## smftools hmm params #########
  cpg: False # whether to use the default HMM endogenous CpG patch params
  hmm_methbases:
  - "C"
@@ -41,16 +45,17 @@ hmm_feature_sets:
    footprint:
      state: "Non-Modified"
      features:
-       small_bound_stretch: [0, 25]
-       medium_bound_stretch: [25, 80]
-       putative_nucleosome: [80, 200]
+       small_bound_stretch: [10, 30]
+       medium_bound_stretch: [30, 110]
+       putative_nucleosome: [110, 200]
        large_bound_stretch: [200, inf]
    accessible:
      state: "Modified"
      features:
-       small_accessible_patch: [0, 20]
-       mid_accessible_patch: [20, 100]
-       large_accessible_patch: [100, inf]
+       small_accessible_patch: [3, 20]
+       mid_accessible_patch: [20, 40]
+       mid_large_accessible_patch: [40, 130]
+       large_accessible_patch: [130, inf]

  hmm_merge_layer_features:
  - ["C_all_accessible_features", 80]
smftools/config/default.yaml CHANGED
@@ -1,3 +1,13 @@
+ # General
+ sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
+ sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
+ sample_name_col_for_plotting: 'Barcode'
+
+ # Compute params
+ threads: 4
+ device: "auto"
+
+ ######## smftools load params #########
  # Generic i/o
  bam_suffix: ".bam"
  recursive_input_search: True
@@ -7,16 +17,12 @@ strands:
  - top
  conversions:
  - unconverted
- sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
- sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
  fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
  input_already_demuxed: False # If the input files are already demultiplexed.
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
-
- # Compute params
- threads: 4
- device: "auto"
+ delete_intermediate_bams: True # Whether to delete intermediate BAM files.
+ delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.

  # Sequencing modality and general experiment params
  smf_modality: 'conversion' # conversion, deaminase, direct
@@ -70,11 +76,11 @@ aligner_args:
    dorado:
      ont:
      - "--mm2-opts"
-     - "-N"
-     - "5"
+     - "-N 5"

  # Sorted BAM and BED specific handling
  make_bigwigs: False # Whether to make coverage bigwigs
+ make_beds: False # Whether to make beds from the aligned bams

  # Nanopore specific demultiplexing
  barcode_both_ends: False # dorado demultiplexing
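The aligner_args edit above folds the two minimap2 tokens into one list entry. At the argv level, the difference is whether the minimap2 options travel as a single value for --mm2-opts or as stray extra tokens, as sketched below (how smftools assembles the final command line is an assumption, not shown in this diff).

# Token-grouping sketch (illustrative only; downstream argv assembly assumed).
old_args = ["--mm2-opts", "-N", "5"]  # "-N" and "5" arrive as separate argv
                                      # tokens, so "-N" can be misread as a
                                      # flag of the outer tool
new_args = ["--mm2-opts", "-N 5"]     # the minimap2 options stay together as
                                      # the single value of --mm2-opts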
@@ -85,24 +91,25 @@ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall w
  reference_column: 'Reference_strand'
  sample_column: 'Barcode'

- # Preprocessing - Read length, quality, and mapping filtering params
+ ######## smftools preprocess params #########
+ # Read length, quality, and mapping filtering params
  read_coord_filter:
  - null
  - null
  read_len_filter_thresholds:
- - 200
+ - 100
  - null
  read_len_to_ref_ratio_filter_thresholds:
- - 0.8
+ - 0.5
  - null
  read_quality_filter_thresholds:
- - 20
+ - 15
  - null
  read_mapping_quality_filter_thresholds:
  - null
  - null

- # Preprocessing - Read modification filtering params
+ # Read modification filtering params
  read_mod_filtering_gpc_thresholds:
  - 0.025
  - 0.975
@@ -116,9 +123,9 @@ read_mod_filtering_a_thresholds:
  - 0.025
  - 0.975
  read_mod_filtering_use_other_c_as_background: False
- min_valid_fraction_positions_in_read_vs_ref: 0.8
+ min_valid_fraction_positions_in_read_vs_ref: 0.5

- # Preprocessing - Duplicate detection params
+ # Duplicate detection params
  duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
  - "GpC"
  - "CpG"
@@ -133,11 +140,10 @@ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicog
  duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
  duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.

- # Preprocessing - Complexity analysis params
-
- # General Plotting params
- sample_name_col_for_plotting: 'Barcode'
+ # Position QC params
+ position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column

+ ######## smftools analyze params #########
  # Basic Analysis - QC Plotting params
  rows_per_qc_histogram_grid: 12

@@ -169,6 +175,7 @@ correlation_matrix_cmaps:
  correlation_matrix_site_types:
  - "GpC_site"

+ ######## smftools hmm params #########
  # HMM params
  hmm_n_states: 2 # Number of HMM states
  hmm_init_emission_probs:
@@ -197,19 +204,23 @@ hmm_feature_sets:
    footprint:
      state: "Non-Modified"
      features:
-       small_bound_stretch: [0, 25]
-       medium_bound_stretch: [25, 80]
-       putative_nucleosome: [80, 200]
+       small_bound_stretch: [10, 40]
+       medium_bound_stretch: [40, 110]
+       putative_nucleosome: [110, 200]
        large_bound_stretch: [200, inf]
    accessible:
      state: "Modified"
      features:
-       small_accessible_patch: [0, 20]
-       mid_accessible_patch: [20, 100]
-       large_accessible_patch: [100, inf]
+       small_accessible_patch: [3, 20]
+       mid_accessible_patch: [20, 40]
+       mid_large_accessible_patch: [40, 110]
+       large_accessible_patch: [110, inf]
  hmm_merge_layer_features:
  - [null, 80]

+ # Pipeline control flow - load adata
+ force_redo_load_adata: False # Whether to perform load adata command from start
+
  # Pipeline control flow - Preprocessing and QC
  force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
  force_reload_sample_sheet: True # Whether to force redo sample sheet loading
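The hmm_feature_sets edits in all three configs re-partition the run-length bins rather than change behavior: a new mid_large_accessible_patch bin is introduced, and small_bound_stretch now starts at 10 instead of 0, so very short bound runs fall outside every bin. A minimal sketch of how such [low, high] bins would classify a run of consecutive HMM states, assuming half-open intervals (the inf upper bounds suggest this, but the boundary convention is not visible in the diff):

# Sketch of feature binning under the new default.yaml footprint bins.
# Half-open [low, high) intervals are an assumption, not stated in this diff.
FOOTPRINT_BINS = {
    "small_bound_stretch": (10, 40),
    "medium_bound_stretch": (40, 110),
    "putative_nucleosome": (110, 200),
    "large_bound_stretch": (200, float("inf")),
}

def classify_run(length: int) -> str | None:
    """Map a run of consecutive 'Non-Modified' positions to a feature name."""
    for name, (low, high) in FOOTPRINT_BINS.items():
        if low <= length < high:
            return name
    return None  # runs shorter than 10 bp now fall outside every bin

print(classify_run(147))  # -> putative_nucleosome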