smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +18 -2
  4. smftools/cli/hmm_adata.py +18 -1
  5. smftools/cli/latent_adata.py +522 -67
  6. smftools/cli/load_adata.py +2 -2
  7. smftools/cli/preprocess_adata.py +32 -93
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +23 -109
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +41 -5
  12. smftools/config/conversion.yaml +0 -10
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +49 -13
  15. smftools/config/experiment_config.py +96 -3
  16. smftools/constants.py +4 -0
  17. smftools/hmm/call_hmm_peaks.py +1 -1
  18. smftools/informatics/binarize_converted_base_identities.py +2 -89
  19. smftools/informatics/converted_BAM_to_adata.py +53 -13
  20. smftools/informatics/h5ad_functions.py +83 -0
  21. smftools/informatics/modkit_extract_to_adata.py +4 -0
  22. smftools/plotting/__init__.py +26 -12
  23. smftools/plotting/autocorrelation_plotting.py +22 -4
  24. smftools/plotting/chimeric_plotting.py +1893 -0
  25. smftools/plotting/classifiers.py +28 -14
  26. smftools/plotting/general_plotting.py +58 -3362
  27. smftools/plotting/hmm_plotting.py +1586 -2
  28. smftools/plotting/latent_plotting.py +804 -0
  29. smftools/plotting/plotting_utils.py +243 -0
  30. smftools/plotting/position_stats.py +16 -8
  31. smftools/plotting/preprocess_plotting.py +281 -0
  32. smftools/plotting/qc_plotting.py +8 -3
  33. smftools/plotting/spatial_plotting.py +1134 -0
  34. smftools/plotting/variant_plotting.py +1231 -0
  35. smftools/preprocessing/__init__.py +3 -0
  36. smftools/preprocessing/append_base_context.py +1 -1
  37. smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
  38. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  39. smftools/preprocessing/append_variant_call_layer.py +480 -0
  40. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  41. smftools/preprocessing/invert_adata.py +1 -0
  42. smftools/readwrite.py +109 -85
  43. smftools/tools/__init__.py +6 -0
  44. smftools/tools/calculate_knn.py +121 -0
  45. smftools/tools/calculate_nmf.py +18 -7
  46. smftools/tools/calculate_pca.py +180 -0
  47. smftools/tools/calculate_umap.py +70 -154
  48. smftools/tools/position_stats.py +4 -4
  49. smftools/tools/rolling_nn_distance.py +640 -3
  50. smftools/tools/sequence_alignment.py +140 -0
  51. smftools/tools/tensor_factorization.py +52 -4
  52. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
  53. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
  54. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  55. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  56. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,423 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple
6
+
7
+ import anndata as ad
8
+
9
+ from smftools.constants import LOGGING_DIR, VARIANT_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
def variant_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for variant analyses.

    Called by: `smftools variant <config_path>`

    Responsibilities:
    - Load the experiment config and resolve the per-stage AnnData paths.
    - Skip when a variant AnnData already exists, unless
      ``cfg.force_redo_variant_analyses`` is truthy.
    - Otherwise pick the first existing stage AnnData as the starting point
      and call `variant_adata_core(...)`.

    Parameters:
        config_path: Path to the experiment config file.

    Returns:
        - ``(None, variant_path)`` when an existing variant AnnData was found
          and the stage was skipped.
        - ``(None, None)`` when no stage AnnData exists on disk.
        - Otherwise the ``(adata, variant_path)`` pair returned by
          `variant_adata_core`.
    """
    from ..readwrite import safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    # 1) Load config and resolve the stage paths
    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)
    variant_path = paths.variant

    # 2) Stage-skipping logic: report the *variant* path for the skipped stage
    #    (previously the spatial path was returned here by mistake).
    if not getattr(cfg, "force_redo_variant_analyses", False) and variant_path.exists():
        logger.info(f"Variant AnnData found: {variant_path}\nSkipping smftools variant")
        return None, variant_path

    # 3) Decide which AnnData to use as the *starting point* for analyses.
    #    Candidates are checked in priority order; the first one on disk wins.
    candidates = (
        paths.hmm,
        paths.latent,
        paths.spatial,
        paths.chimeric,
        variant_path,
        paths.pp_dedup,
        paths.pp,
    )
    source_path = next((path for path in candidates if path.exists()), None)
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for variant analyses (need at least preprocessed)."
        )
        return None, None

    # safe_read_h5ad returns a pair; only the AnnData (first item) is used here.
    start_adata, _ = safe_read_h5ad(source_path)

    # 4) Run the core
    adata_variant, variant_path = variant_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )

    return adata_variant, variant_path
94
+
95
+
96
def _find_converted_column(unconverted_col: str, var_columns) -> str | None:
    """Find the converted FASTA column corresponding to an unconverted one.

    Unconverted columns follow the pattern ``{chromosome}_{strand}_strand_FASTA_base``.
    Converted columns follow ``{chromosome}_{conversion}_{strand}_{strand}_strand_FASTA_base``
    (e.g. ``6B6_5mC_top_top_strand_FASTA_base`` for unconverted ``6B6_top_strand_FASTA_base``).

    Parameters:
        unconverted_col: Name of the unconverted FASTA column.
        var_columns: Iterable of column names to search.

    Returns:
        The first matching converted column name, or ``None`` when
        ``unconverted_col`` does not follow the expected pattern or nothing matches.
    """
    suffix = "_strand_FASTA_base"
    if not unconverted_col.endswith(suffix):
        return None
    stem = unconverted_col[: -len(suffix)]  # e.g. "6B6_top"
    # Parse strand from end of stem: "6B6_top" -> strand="top", chrom="6B6"
    for strand in ("top", "bottom"):
        strand_tag = f"_{strand}"
        if not stem.endswith(strand_tag):
            continue
        chrom = stem[: -len(strand_tag)]
        # The strand appears twice: once in the record name, once in the suffix.
        prefix = f"{chrom}_"
        end = f"_{strand}_{strand}{suffix}"
        candidates = [
            c
            for c in var_columns
            if c.startswith(prefix) and c.endswith(end) and c != unconverted_col
        ]
        if len(candidates) > 1:
            logger.info(
                "Multiple converted column candidates for '%s': %s",
                unconverted_col,
                candidates,
            )
        # A stem can only end with one strand tag, so the search stops here.
        return candidates[0] if candidates else None
    return None


def variant_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core variant analysis pipeline.

    Assumes:
    - `cfg` is the ExperimentConfig.

    Does:
    - Set up output/logging directories and logging.
    - Optionally load sample sheet metadata.
    - Annotate reference mismatch positions (when two references are configured)
      and append per-position mismatch frequency sites.
    - Append per-read variant call and variant segment layers.
    - Plot mismatch base frequencies, integer-encoding clustermaps and
      variant segment clustermaps (each skipped when its output directory exists).
    - Save AnnData (only when `paths.variant` does not already exist).

    Parameters:
        adata: AnnData to annotate; passed to every annotation/plotting step.
        cfg: Experiment configuration object.
        paths: Stage paths object; only `paths.variant` is used here.
        source_adata_path: Path `adata` was loaded from, recorded in metadata.
        config_path: Config file path, recorded in metadata.

    Returns:
        ``(adata, paths.variant)``.
    """
    from datetime import datetime

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_mismatch_base_frequency_by_position,
        plot_sequence_integer_encoding_clustermaps,
        plot_variant_segment_clustermaps,
    )
    from ..preprocessing import (
        append_mismatch_frequency_sites,
        append_sequence_mismatch_annotations,
        append_variant_call_layer,
        append_variant_segment_layer,
        load_sample_sheet,
    )
    from ..readwrite import make_dirs
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # Single clock read so the date and time parts of the log name always agree.
    now = datetime.now()
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M%S")
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    output_directory = Path(cfg.output_directory)
    variant_directory = output_directory / VARIANT_DIR
    logging_directory = variant_directory / LOGGING_DIR
    plot_root = variant_directory / "deduplicated"

    make_dirs([output_directory, variant_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # ============================================================
    # 1) Reference variant position annotation
    # ============================================================
    # `or` guards against the attribute being present but set to None/empty,
    # which would otherwise fail on unpacking.
    seq1_col, seq2_col = getattr(cfg, "references_to_align_for_variant_annotation", None) or (
        None,
        None,
    )
    have_reference_pair = bool(seq1_col and seq2_col)
    if have_reference_pair:
        append_sequence_mismatch_annotations(adata, seq1_col, seq2_col)

    ############################################### Append mismatch frequency per position ###############################################
    append_mismatch_frequency_sites(
        adata,
        ref_column=cfg.reference_column,
        mismatch_layer=cfg.mismatch_frequency_layer,
        read_span_layer=cfg.mismatch_frequency_read_span_layer,
        mismatch_frequency_range=cfg.mismatch_frequency_range,
        bypass=cfg.bypass_append_mismatch_frequency_sites,
        force_redo=cfg.force_redo_append_mismatch_frequency_sites,
    )

    # ============================================================
    # 2) Per-read variant call layer at reference mismatch sites
    # ============================================================
    if have_reference_pair:
        # For conversion SMF, derive converted column names so variant calling
        # compares read bases against the converted reference (which reads are mapped to).
        # e.g. "6B6_top_strand_FASTA_base" -> "6B6_5mC_top_top_strand_FASTA_base"
        seq1_conv = _find_converted_column(seq1_col, adata.var.columns)
        seq2_conv = _find_converted_column(seq2_col, adata.var.columns)
        if seq1_conv and seq2_conv:
            logger.info("Using converted columns: '%s', '%s'", seq1_conv, seq2_conv)

        append_variant_call_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            seq1_converted_column=seq1_conv,
            seq2_converted_column=seq2_conv,
            read_span_layer=cfg.mismatch_frequency_read_span_layer,
            reference_col=cfg.reference_column,
        )

        append_variant_segment_layer(
            adata,
            seq1_column=seq1_col,
            seq2_column=seq2_col,
            read_span_layer=cfg.mismatch_frequency_read_span_layer,
            reference_col=cfg.reference_column,
        )

    ############################################### Plot mismatch base frequencies ###############################################
    if cfg.mismatch_frequency_layer not in adata.layers:
        logger.debug(
            "Mismatch layer '%s' not found; skipping mismatch base frequency plots.",
            cfg.mismatch_frequency_layer,
        )
    elif not adata.uns.get("mismatch_integer_encoding_map"):
        logger.debug("Mismatch encoding map not found; skipping mismatch base frequency plots.")
    else:
        mismatch_base_freq_dir = plot_root / "01_mismatch_base_frequency_plots"
        # NOTE(review): the redo switch consulted here is force_redo_preprocessing,
        # not force_redo_variant_analyses - confirm this is intended.
        if mismatch_base_freq_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{mismatch_base_freq_dir} already exists. Skipping mismatch base frequency plots."
            )
        else:
            make_dirs([mismatch_base_freq_dir])
            plot_mismatch_base_frequency_by_position(
                adata,
                sample_col=cfg.sample_name_col_for_plotting,
                reference_col=cfg.reference_column,
                mismatch_layer=cfg.mismatch_frequency_layer,
                read_span_layer=cfg.mismatch_frequency_read_span_layer,
                exclude_mod_sites=True,  # cfg.mismatch_base_frequency_exclude_mod_sites,
                mod_site_bases=cfg.mod_target_bases,
                save_path=mismatch_base_freq_dir,
                plot_zscores=True,
            )

    ############################################### Plot integer sequence encoding clustermaps ###############################################
    def _plot_encoding_clustermaps(save_dir: Path, **extra_kwargs) -> None:
        """Create `save_dir` and draw the integer-encoding clustermaps into it."""
        make_dirs([save_dir])
        plot_sequence_integer_encoding_clustermaps(
            adata,
            sample_col=cfg.sample_name_col_for_plotting,
            reference_col=cfg.reference_column,
            demux_types=cfg.clustermap_demux_types_to_plot,
            min_quality=None,
            min_length=None,
            min_mapped_length_to_reference_length_ratio=None,
            sort_by="none",
            max_unknown_fraction=0.5,
            save_path=save_dir,
            show_position_axis=True,
            **extra_kwargs,
        )

    if "sequence_integer_encoding" not in adata.layers:
        logger.debug(
            "sequence_integer_encoding layer not found; skipping integer encoding clustermaps."
        )
    else:
        seq_clustermap_dir = plot_root / "02_sequence_integer_encoding_clustermaps"
        if seq_clustermap_dir.is_dir() and not cfg.force_redo_preprocessing:
            logger.debug(
                f"{seq_clustermap_dir} already exists. Skipping sequence integer encoding clustermaps."
            )
        else:
            _plot_encoding_clustermaps(seq_clustermap_dir)

        # NOTE(review): nesting of this block under the `else` above was
        # reconstructed from a whitespace-flattened listing - confirm against
        # the packaged source.
        if "mismatch_integer_encoding" in adata.layers:
            mismatch_clustermap_dir = (
                plot_root / "03_mismatch_integer_encoding_clustermaps_no_mod_sites"
            )
            if mismatch_clustermap_dir.is_dir():
                logger.debug(
                    f"{mismatch_clustermap_dir} already exists. "
                    "Skipping mismatch clustermaps without mod sites."
                )
            else:
                _plot_encoding_clustermaps(
                    mismatch_clustermap_dir,
                    exclude_mod_sites=True,
                    mod_site_bases=cfg.mod_target_bases,
                )

    # ============================================================
    # 4) Variant segment clustermaps
    # ============================================================
    if have_reference_pair:
        segment_layer_name = f"{seq1_col}__{seq2_col}_variant_segments"
        if segment_layer_name in adata.layers:
            # (output folder, skip message, extra plotting kwargs) per plot flavour
            segment_plots = (
                (
                    "04_variant_segment_clustermaps",
                    "Variant segment clustermaps already exist at %s; skipping.",
                    {},
                ),
                (
                    "05_variant_segment_clustermaps_with_mismatch_type",
                    "Variant segment mismatch-type clustermaps already exist at %s; skipping.",
                    {"mismatch_type_obs_col": "chimeric_variant_sites_type"},
                ),
            )
            for dir_name, skip_message, extra_kwargs in segment_plots:
                segment_dir = plot_root / dir_name
                if segment_dir.exists():
                    logger.info(skip_message, segment_dir)
                    continue
                make_dirs([segment_dir])
                plot_variant_segment_clustermaps(
                    adata,
                    seq1_column=seq1_col,
                    seq2_column=seq2_col,
                    sample_col=cfg.sample_name_col_for_plotting,
                    reference_col=cfg.reference_column,
                    variant_segment_layer=segment_layer_name,
                    read_span_layer=cfg.mismatch_frequency_read_span_layer,
                    save_path=segment_dir,
                    ref1_marker_color=getattr(cfg, "variant_overlay_seq1_color", "white"),
                    ref2_marker_color=getattr(cfg, "variant_overlay_seq2_color", "black"),
                    marker_size=getattr(cfg, "variant_overlay_marker_size", 4.0),
                    show_position_axis=True,
                    **extra_kwargs,
                )

    # ============================================================
    # 5) Save AnnData
    # ============================================================
    # NOTE(review): an existing variant file is never overwritten here, even when
    # the stage was re-run via force_redo_variant_analyses - confirm this is intended.
    if not paths.variant.exists():
        logger.info("Saving variant AnnData")
        record_smftools_metadata(
            adata,
            step_name="variant",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=paths.variant,
        )
        write_gz_h5ad(adata, paths.variant)

    return adata, paths.variant
smftools/cli_entry.py CHANGED
@@ -7,11 +7,14 @@ from typing import Sequence
7
7
  import click
8
8
  import pandas as pd
9
9
 
10
+ from .cli.chimeric_adata import chimeric_adata
10
11
  from .cli.hmm_adata import hmm_adata
11
12
  from .cli.latent_adata import latent_adata
12
13
  from .cli.load_adata import load_adata
13
14
  from .cli.preprocess_adata import preprocess_adata
15
+ from .cli.recipes import full_flow
14
16
  from .cli.spatial_adata import spatial_adata
17
+ from .cli.variant_adata import variant_adata
15
18
  from .informatics.pod5_functions import subsample_pod5
16
19
  from .logging_utils import get_logger, setup_logging
17
20
  from .readwrite import concatenate_h5ads
@@ -64,7 +67,7 @@ def cli(log_file: Path | None, log_level: str):
64
67
  @cli.command()
65
68
  @click.argument("config_path", type=click.Path(exists=True))
66
69
  def load(config_path):
67
- """Load and process data from CONFIG_PATH."""
70
+ """Load raw data into AnnData."""
68
71
  load_adata(config_path)
69
72
 
70
73
 
@@ -75,7 +78,7 @@ def load(config_path):
75
78
  @cli.command()
76
79
  @click.argument("config_path", type=click.Path(exists=True))
77
80
  def preprocess(config_path):
78
- """Preprocess data from CONFIG_PATH."""
81
+ """Preprocessing."""
79
82
  preprocess_adata(config_path)
80
83
 
81
84
 
@@ -86,7 +89,7 @@ def preprocess(config_path):
86
89
  @cli.command()
87
90
  @click.argument("config_path", type=click.Path(exists=True))
88
91
  def spatial(config_path):
89
- """Process data from CONFIG_PATH."""
92
+ """Spatial signal analysis"""
90
93
  spatial_adata(config_path)
91
94
 
92
95
 
@@ -97,7 +100,7 @@ def spatial(config_path):
97
100
  @cli.command()
98
101
  @click.argument("config_path", type=click.Path(exists=True))
99
102
  def hmm(config_path):
100
- """Process data from CONFIG_PATH."""
103
+ """HMM feature annotations and plotting"""
101
104
  hmm_adata(config_path)
102
105
 
103
106
 
@@ -108,13 +111,46 @@ def hmm(config_path):
108
111
  @cli.command()
109
112
  @click.argument("config_path", type=click.Path(exists=True))
110
113
  def latent(config_path):
111
- """Process data from CONFIG_PATH."""
114
+ """Latent representations of signal"""
112
115
  latent_adata(config_path)
113
116
 
114
117
 
115
118
  ##########################################
116
119
 
117
120
 
121
+ ####### Variant ###########
122
# `variant` subcommand of the `cli` group: takes one CONFIG_PATH argument
# (declared with click.Path(exists=True)) and forwards it to variant_adata().
# The value returned by variant_adata() is discarded. The docstring below is
# left as-is because Click uses it as the command's help text.
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def variant(config_path):
    """Sequence variation analyses"""
    variant_adata(config_path)
127
+
128
+
129
+ ##########################################
130
+
131
+
132
+ ####### Chimeric ###########
133
# `chimeric` subcommand of the `cli` group: takes one CONFIG_PATH argument
# (declared with click.Path(exists=True)) and forwards it to chimeric_adata().
# The value returned by chimeric_adata() is discarded. The docstring below is
# left as-is because Click uses it as the command's help text.
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def chimeric(config_path):
    """Finding putative PCR chimeras"""
    chimeric_adata(config_path)
138
+
139
+
140
+ ##########################################
141
+
142
+
143
+ ####### Recipes ###########
144
# `full` subcommand of the `cli` group: takes one CONFIG_PATH argument
# (declared with click.Path(exists=True)) and forwards it to full_flow().
# The docstring below is left as-is because Click uses it as the command's
# help text; it lists the stage names of the workflow.
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def full(config_path):
    """Workflow: load preprocess spatial variant chimeric hmm latent"""
    full_flow(config_path)
149
+
150
+
151
+ ##########################################
152
+
153
+
118
154
  ####### batch command ###########
119
155
  @cli.command()
120
156
  @click.argument(
@@ -15,16 +15,6 @@ autocorr_site_types:
15
15
 
16
16
  # Spatial Analysis - Clustermap params
17
17
  layer_for_clustermap_plotting: 'nan0_0minus1'
18
- rolling_nn_layer: "nan0_0minus1"
19
- rolling_nn_plot_layer: "nan0_0minus1"
20
- rolling_nn_window: 30
21
- rolling_nn_step: 2
22
- rolling_nn_min_overlap: 20
23
- rolling_nn_return_fraction: true
24
- rolling_nn_obsm_key: "rolling_nn_dist"
25
- rolling_nn_site_types:
26
- - "GpC"
27
- - "CpG"
28
18
  clustermap_cmap_c: "coolwarm"
29
19
  clustermap_cmap_gpc: "coolwarm"
30
20
  clustermap_cmap_cpg: "viridis"
@@ -39,6 +39,9 @@ autocorr_site_types:
39
39
  correlation_matrix_site_types:
40
40
  - "C_site"
41
41
 
42
+ rolling_nn_site_types:
43
+ - "C"
44
+
42
45
  # ######## smftools hmm params #########
43
46
  cpg: False # whether to use the default HMM endogenous CpG patch params
44
47
  hmm_methbases:
@@ -110,7 +110,7 @@ read_len_to_ref_ratio_filter_thresholds:
110
110
  - null
111
111
  - null
112
112
  read_quality_filter_thresholds:
113
- - 15
113
+ - 10
114
114
  - null
115
115
  read_mapping_quality_filter_thresholds:
116
116
  - null
@@ -130,7 +130,7 @@ read_mod_filtering_a_thresholds:
130
130
  - 0.025
131
131
  - 0.975
132
132
  read_mod_filtering_use_other_c_as_background: False
133
- min_valid_fraction_positions_in_read_vs_ref: 0.5
133
+ min_valid_fraction_positions_in_read_vs_ref: 0.2
134
134
 
135
135
  # Plotting params for read length histograms
136
136
  obs_to_plot_pp_qc:
@@ -162,12 +162,13 @@ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical cl
162
162
  duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
163
163
 
164
164
  # Position QC params
165
- position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
165
+ position_max_nan_threshold: 0.8 # The maximum amount of nans to tolerate in a column
166
166
  mismatch_frequency_range:
167
167
  - 0.01
168
168
  - 0.99
169
169
  mismatch_frequency_layer: "mismatch_integer_encoding"
170
170
  mismatch_frequency_read_span_layer: "read_span_mask"
171
+ mismatch_base_frequency_exclude_mod_sites: True
171
172
 
172
173
  ######## smftools spatial params #########
173
174
  invert_adata: False # Whether to invert the AnnData along the positions axis.
@@ -186,13 +187,56 @@ clustermap_cmap_gpc: "coolwarm"
186
187
  clustermap_cmap_cpg: "coolwarm"
187
188
  clustermap_cmap_a: "coolwarm"
188
189
  spatial_clustermap_sortby: "gpc"
190
+
191
+ # Clustermap variant params
192
+ overlay_variant_calls: false
193
+ variant_overlay_seq1_color: "black"
194
+ variant_overlay_seq2_color: "white"
195
+ variant_overlay_marker_size: 4.0
196
+
197
+ # Spatial analysis - Rolling NN Hamming
198
+ rolling_nn_layer: "nan0_0minus1"
199
+ rolling_nn_plot_layer: "nan0_0minus1"
200
+ rolling_nn_plot_layers:
201
+ - "nan0_0minus1"
202
+ - "zero_hamming_distance_spans"
203
+ rolling_nn_window: 10
204
+ rolling_nn_step: 1
205
+ rolling_nn_min_overlap: 8
206
+ rolling_nn_return_fraction: true
207
+ rolling_nn_obsm_key: "rolling_nn_dist"
189
208
  rolling_nn_site_types:
190
209
  - "GpC"
191
210
  - "CpG"
192
-
193
- # Spatial Analysis - UMAP/Leiden params
211
+ rolling_nn_write_zero_pairs_csvs: true
212
+ rolling_nn_zero_pairs_uns_key: null
213
+ rolling_nn_zero_pairs_segments_key: null
214
+ rolling_nn_zero_pairs_layer_key: null
215
+ rolling_nn_zero_pairs_refine: true
216
+ rolling_nn_zero_pairs_max_nan_run: 2
217
+ rolling_nn_zero_pairs_merge_gap: 1
218
+ rolling_nn_zero_pairs_max_segments_per_read: 2
219
+ rolling_nn_zero_pairs_max_overlap: 5
220
+ rolling_nn_zero_pairs_layer_overlap_mode: "sum"
221
+ rolling_nn_zero_pairs_layer_overlap_value: null
222
+ rolling_nn_zero_pairs_keep_uns: true
223
+ rolling_nn_zero_pairs_segments_keep_uns: true
224
+ rolling_nn_zero_pairs_top_segments_per_read: 3
225
+ rolling_nn_zero_pairs_top_segments_max_overlap: 5
226
+ rolling_nn_zero_pairs_top_segments_min_span: 300
227
+ rolling_nn_zero_pairs_top_segments_write_csvs: true
228
+ rolling_nn_zero_pairs_segment_histogram_bins: 30
229
+
230
+ # Cross-sample rolling NN analysis
231
+ cross_sample_analysis: true
232
+ cross_sample_grouping_col: null
233
+ cross_sample_random_seed: 42
234
+ delta_hamming_chimeric_span_threshold: 200
235
+
236
+ # Latent Analysis - UMAP/Leiden params
194
237
  layer_for_umap_plotting: 'nan_half'
195
238
  umap_layers_to_plot:
239
+ - "leiden"
196
240
  - "mapped_length"
197
241
  - "Raw_modification_signal"
198
242
 
@@ -279,21 +323,13 @@ hmm_merge_layer_features:
279
323
  - ["all_accessible_features", 60]
280
324
  clustermap_cmap_hmm: "coolwarm"
281
325
  hmm_clustermap_feature_layers:
282
- - all_accessible_features
283
326
  - all_accessible_features_merged
284
- - small_accessible_patch
285
- - mid_accessible_patch
286
- - large_accessible_patch
287
- - large_accessible_patch_merged
288
- - nucleosome_depleted_region
289
327
  - nucleosome_depleted_region_merged
290
328
  - small_bound_stretch
291
329
  - medium_bound_stretch
292
330
  - putative_nucleosome
293
- - large_bound_stretch
294
331
  - all_footprint_features
295
332
  hmm_clustermap_length_layers:
296
- - all_accessible_features
297
333
  - all_accessible_features_merged
298
334
  - all_footprint_features
299
335
  hmm_clustermap_sortby: "hmm"