smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from pathlib import Path
4
5
  from typing import Optional, Tuple
5
6
 
6
7
  import anndata as ad
7
8
 
8
- from smftools.logging_utils import get_logger
9
+ from smftools.constants import LOGGING_DIR, SEQUENCE_INTEGER_ENCODING, SPATIAL_DIR
10
+ from smftools.logging_utils import get_logger, setup_logging
9
11
  from smftools.optional_imports import require
10
12
 
11
13
  logger = get_logger(__name__)
@@ -35,15 +37,13 @@ def spatial_adata(
35
37
  Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
36
38
  """
37
39
  from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
38
- from .helpers import get_adata_paths
39
- from .load_adata import load_adata
40
- from .preprocess_adata import preprocess_adata
40
+ from .helpers import get_adata_paths, load_experiment_config
41
41
 
42
42
  # 1) Load the experiment config and resolve the canonical AnnData paths
43
- loaded_adata, loaded_path, cfg = load_adata(config_path)
43
+ cfg = load_experiment_config(config_path)
44
+
44
45
  paths = get_adata_paths(cfg)
45
46
 
46
- raw_path = paths.raw
47
47
  pp_path = paths.pp
48
48
  pp_dedup_path = paths.pp_dedup
49
49
  spatial_path = paths.spatial
@@ -51,47 +51,34 @@ def spatial_adata(
51
51
 
52
52
  # Stage-skipping logic for spatial
53
53
  if not getattr(cfg, "force_redo_spatial_analyses", False):
54
- # If HMM exists, it's the most processed stage — reuse it.
55
- if hmm_path.exists():
56
- logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
57
- return None, hmm_path
58
-
59
54
  # If spatial exists, we consider spatial analyses already done.
60
55
  if spatial_path.exists():
61
56
  logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
62
57
  return None, spatial_path
63
58
 
64
- # 2) Ensure preprocessing has been run
65
- # This will create pp/pp_dedup as needed or return them if they already exist.
66
- pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
67
- config_path
68
- )
69
-
70
59
  # Helper to load an AnnData from disk
71
60
  def _load(path: Path):
72
- if loaded_adata is not None and loaded_path == path:
73
- return loaded_adata
74
61
  adata, _ = safe_read_h5ad(path)
75
62
  return adata
76
63
 
77
64
  # 3) Decide which AnnData to use as the *starting point* for spatial analyses
78
- # Prefer in-memory pp_dedup_adata when preprocess_adata just ran.
79
- if pp_dedup_adata is not None:
80
- start_adata = pp_dedup_adata
81
- source_path = pp_dedup_adata_path_ret
65
+ if hmm_path.exists():
66
+ start_adata = _load(hmm_path)
67
+ source_path = hmm_path
68
+ elif spatial_path.exists():
69
+ start_adata = _load(spatial_path)
70
+ source_path = spatial_path
71
+ elif pp_dedup_path.exists():
72
+ start_adata = _load(pp_dedup_path)
73
+ source_path = pp_dedup_path
74
+ elif pp_path.exists():
75
+ start_adata = _load(pp_path)
76
+ source_path = pp_path
82
77
  else:
83
- if pp_dedup_path.exists():
84
- start_adata = _load(pp_dedup_path)
85
- source_path = pp_dedup_path
86
- elif pp_path.exists():
87
- start_adata = _load(pp_path)
88
- source_path = pp_path
89
- elif raw_path.exists():
90
- start_adata = _load(raw_path)
91
- source_path = raw_path
92
- else:
93
- logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
94
- return None, None
78
+ logger.warning(
79
+ "No suitable AnnData found for spatial analyses (need at least preprocessed)."
80
+ )
81
+ return None, None
95
82
 
96
83
  # 4) Run the spatial core
97
84
  adata_spatial, spatial_path = spatial_adata_core(
@@ -99,15 +86,10 @@ def spatial_adata(
99
86
  cfg=cfg,
100
87
  spatial_adata_path=spatial_path,
101
88
  pp_adata_path=pp_path,
102
- pp_dup_rem_adata_path=pp_dedup_path,
103
- pp_adata_in_memory=pp_adata,
104
89
  source_adata_path=source_path,
105
90
  config_path=config_path,
106
91
  )
107
92
 
108
- # 5) Register spatial path in summary CSV
109
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
110
-
111
93
  return adata_spatial, spatial_path
112
94
 
113
95
 
@@ -116,8 +98,6 @@ def spatial_adata_core(
116
98
  cfg,
117
99
  spatial_adata_path: Path,
118
100
  pp_adata_path: Path,
119
- pp_dup_rem_adata_path: Path,
120
- pp_adata_in_memory: Optional[ad.AnnData] = None,
121
101
  source_adata_path: Optional[Path] = None,
122
102
  config_path: Optional[str] = None,
123
103
  ) -> Tuple[ad.AnnData, Path]:
@@ -129,8 +109,6 @@ def spatial_adata_core(
129
109
  - `cfg` is the ExperimentConfig.
130
110
  - `spatial_adata_path` and `pp_adata_path` are canonical paths
131
111
  from `get_adata_paths`.
132
- - `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
133
- the same run of `preprocess_adata`, to avoid re-reading from disk.
134
112
 
135
113
  Does:
136
114
  - Optional sample sheet load.
@@ -152,17 +130,17 @@ def spatial_adata_core(
152
130
  """
153
131
  import os
154
132
  import warnings
133
+ from datetime import datetime
155
134
  from pathlib import Path
156
135
 
157
136
  import numpy as np
158
137
  import pandas as pd
159
138
 
160
- sc = require("scanpy", extra="scanpy", purpose="spatial analyses")
161
-
162
139
  from ..metadata import record_smftools_metadata
163
140
  from ..plotting import (
164
141
  combined_raw_clustermap,
165
142
  plot_rolling_grid,
143
+ plot_rolling_nn_and_layer,
166
144
  plot_spatial_autocorr_grid,
167
145
  )
168
146
  from ..preprocessing import (
@@ -171,11 +149,12 @@ def spatial_adata_core(
171
149
  reindex_references_adata,
172
150
  )
173
151
  from ..readwrite import make_dirs, safe_read_h5ad
174
- from ..tools import calculate_umap
152
+ from ..tools import rolling_window_nn_distance
175
153
  from ..tools.position_stats import (
176
154
  compute_positionwise_statistics,
177
155
  plot_positionwise_matrices,
178
156
  )
157
+ from ..tools.rolling_nn_distance import assign_rolling_nn_results
179
158
  from ..tools.spatial_autocorrelation import (
180
159
  analyze_autocorr_matrix,
181
160
  binary_autocorrelation_with_spacing,
@@ -187,8 +166,24 @@ def spatial_adata_core(
187
166
  # -----------------------------
188
167
  # General setup
189
168
  # -----------------------------
169
+ date_str = datetime.today().strftime("%y%m%d")
170
+ now = datetime.now()
171
+ time_str = now.strftime("%H%M%S")
172
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
173
+
190
174
  output_directory = Path(cfg.output_directory)
191
- make_dirs([output_directory])
175
+ spatial_directory = output_directory / SPATIAL_DIR
176
+ logging_directory = spatial_directory / LOGGING_DIR
177
+
178
+ make_dirs([output_directory, spatial_directory])
179
+
180
+ if cfg.emit_log_file:
181
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
182
+ make_dirs([logging_directory])
183
+ else:
184
+ log_file = None
185
+
186
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
192
187
 
193
188
  smf_modality = cfg.smf_modality
194
189
  if smf_modality == "conversion":
@@ -196,8 +191,6 @@ def spatial_adata_core(
196
191
  else:
197
192
  deaminase = True
198
193
 
199
- first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
200
-
201
194
  # -----------------------------
202
195
  # Optional sample sheet metadata
203
196
  # -----------------------------
@@ -231,7 +224,6 @@ def spatial_adata_core(
231
224
  else:
232
225
  reindex_suffix = None
233
226
 
234
- pp_dir = output_directory / "preprocessed"
235
227
  references = adata.obs[cfg.reference_column].cat.categories
236
228
 
237
229
  # ============================================================
@@ -241,7 +233,7 @@ def spatial_adata_core(
241
233
  preprocessed_version_available = pp_adata_path.exists()
242
234
 
243
235
  if preprocessed_version_available:
244
- pp_clustermap_dir = pp_dir / "06_clustermaps"
236
+ pp_clustermap_dir = spatial_directory / "06_clustermaps"
245
237
 
246
238
  if pp_clustermap_dir.is_dir() and not getattr(
247
239
  cfg, "force_redo_spatial_analyses", False
@@ -250,12 +242,9 @@ def spatial_adata_core(
250
242
  f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
251
243
  )
252
244
  else:
253
- make_dirs([pp_dir, pp_clustermap_dir])
245
+ make_dirs([spatial_directory, pp_clustermap_dir])
254
246
 
255
- if first_pp_run and (pp_adata_in_memory is not None):
256
- pp_adata = pp_adata_in_memory
257
- else:
258
- pp_adata, _ = safe_read_h5ad(pp_adata_path)
247
+ pp_adata, _ = safe_read_h5ad(pp_adata_path)
259
248
 
260
249
  # -----------------------------
261
250
  # Optional sample sheet metadata
@@ -304,7 +293,7 @@ def spatial_adata_core(
304
293
  0
305
294
  ],
306
295
  min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
307
- demux_types=("double", "already"),
296
+ demux_types=cfg.clustermap_demux_types_to_plot,
308
297
  bins=None,
309
298
  sample_mapping=None,
310
299
  save_path=pp_clustermap_dir,
@@ -314,19 +303,18 @@ def spatial_adata_core(
314
303
  )
315
304
 
316
305
  # ============================================================
317
- # 2) Clustermaps + UMAP on *deduplicated* preprocessed AnnData
306
+ # 2) Clustermaps on *deduplicated* preprocessed AnnData
318
307
  # ============================================================
319
- pp_dir_dedup = pp_dir / "deduplicated"
320
- pp_clustermap_dir_dedup = pp_dir_dedup / "06_clustermaps"
321
- pp_umap_dir = pp_dir_dedup / "07_umaps"
308
+ spatial_dir_dedup = spatial_directory / "deduplicated"
309
+ clustermap_dir_dedup = spatial_dir_dedup / "06_clustermaps"
322
310
 
323
311
  # Clustermaps on deduplicated adata
324
- if pp_clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
312
+ if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
325
313
  logger.debug(
326
- f"{pp_clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
314
+ f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
327
315
  )
328
316
  else:
329
- make_dirs([pp_dir_dedup, pp_clustermap_dir_dedup])
317
+ make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
330
318
  combined_raw_clustermap(
331
319
  adata,
332
320
  sample_col=cfg.sample_name_col_for_plotting,
@@ -346,53 +334,113 @@ def spatial_adata_core(
346
334
  0
347
335
  ],
348
336
  min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
349
- demux_types=("double", "already"),
337
+ demux_types=cfg.clustermap_demux_types_to_plot,
350
338
  bins=None,
351
339
  sample_mapping=None,
352
- save_path=pp_clustermap_dir_dedup,
340
+ save_path=clustermap_dir_dedup,
353
341
  sort_by=cfg.spatial_clustermap_sortby,
354
342
  deaminase=deaminase,
355
343
  index_col_suffix=reindex_suffix,
356
344
  )
357
345
 
358
- # UMAP / Leiden
359
- if pp_umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
360
- logger.debug(f"{pp_umap_dir} already exists. Skipping UMAP plotting.")
346
+ # ============================================================
347
+ # 2b) Rolling NN distances + layer clustermaps
348
+ # ============================================================
349
+ pp_rolling_nn_dir = spatial_dir_dedup / "06b_rolling_nn_clustermaps"
350
+
351
+ if pp_rolling_nn_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
352
+ logger.debug(f"{pp_rolling_nn_dir} already exists. Skipping rolling NN distance plots.")
361
353
  else:
362
- make_dirs([pp_umap_dir])
363
-
364
- var_filters = []
365
- if smf_modality == "direct":
366
- for ref in references:
367
- for base in cfg.mod_target_bases:
368
- var_filters.append(f"{ref}_{base}_site")
369
- elif deaminase:
370
- for ref in references:
371
- var_filters.append(f"{ref}_C_site")
372
- else:
373
- for ref in references:
374
- for base in cfg.mod_target_bases:
375
- var_filters.append(f"{ref}_{base}_site")
376
-
377
- adata = calculate_umap(
378
- adata,
379
- layer=cfg.layer_for_umap_plotting,
380
- var_filters=var_filters,
381
- n_pcs=10,
382
- knn_neighbors=15,
354
+ make_dirs([pp_rolling_nn_dir])
355
+ samples = (
356
+ adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
383
357
  )
358
+ references = adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
384
359
 
385
- sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
360
+ for reference in references:
361
+ for sample in samples:
362
+ mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (
363
+ adata.obs[cfg.reference_column] == reference
364
+ )
365
+ if not mask.any():
366
+ continue
367
+
368
+ subset = adata[mask]
369
+ site_mask = (
370
+ adata.var[[f"{reference}_{st}_site" for st in cfg.rolling_nn_site_types]]
371
+ .fillna(False)
372
+ .any(axis=1)
373
+ )
374
+ subset = subset[:, site_mask].copy()
375
+ try:
376
+ rolling_values, rolling_starts = rolling_window_nn_distance(
377
+ subset,
378
+ layer=cfg.rolling_nn_layer,
379
+ window=cfg.rolling_nn_window,
380
+ step=cfg.rolling_nn_step,
381
+ min_overlap=cfg.rolling_nn_min_overlap,
382
+ return_fraction=cfg.rolling_nn_return_fraction,
383
+ store_obsm=cfg.rolling_nn_obsm_key,
384
+ )
385
+ except Exception as exc:
386
+ logger.warning(
387
+ "Rolling NN distance computation failed for sample=%s ref=%s: %s",
388
+ sample,
389
+ reference,
390
+ exc,
391
+ )
392
+ continue
386
393
 
387
- sc.settings.figdir = pp_umap_dir
388
- umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
389
- umap_layers += cfg.umap_layers_to_plot
390
- sc.pl.umap(adata, color=umap_layers, show=False, save=True)
394
+ safe_sample = str(sample).replace(os.sep, "_")
395
+ safe_ref = str(reference).replace(os.sep, "_")
396
+ parent_obsm_key = f"{cfg.rolling_nn_obsm_key}__{safe_ref}"
397
+ try:
398
+ assign_rolling_nn_results(
399
+ adata,
400
+ subset,
401
+ rolling_values,
402
+ rolling_starts,
403
+ obsm_key=parent_obsm_key,
404
+ window=cfg.rolling_nn_window,
405
+ step=cfg.rolling_nn_step,
406
+ min_overlap=cfg.rolling_nn_min_overlap,
407
+ return_fraction=cfg.rolling_nn_return_fraction,
408
+ layer=cfg.rolling_nn_layer,
409
+ )
410
+ except Exception as exc:
411
+ logger.warning(
412
+ "Failed to merge rolling NN results for sample=%s ref=%s: %s",
413
+ sample,
414
+ reference,
415
+ exc,
416
+ )
417
+ adata.uns.setdefault(f"{cfg.rolling_nn_obsm_key}_reference_map", {})[reference] = (
418
+ parent_obsm_key
419
+ )
420
+ out_png = pp_rolling_nn_dir / f"{safe_sample}__{safe_ref}.png"
421
+ title = f"{sample} {reference}"
422
+ try:
423
+ plot_rolling_nn_and_layer(
424
+ subset,
425
+ obsm_key=cfg.rolling_nn_obsm_key,
426
+ layer_key=cfg.rolling_nn_plot_layer,
427
+ max_nan_fraction=cfg.position_max_nan_threshold,
428
+ var_valid_fraction_col=f"{reference}_valid_fraction",
429
+ title=title,
430
+ save_name=out_png,
431
+ )
432
+ except Exception as exc:
433
+ logger.warning(
434
+ "Failed rolling NN plot for sample=%s ref=%s: %s",
435
+ sample,
436
+ reference,
437
+ exc,
438
+ )
391
439
 
392
440
  # ============================================================
393
441
  # 3) Spatial autocorrelation + rolling metrics
394
442
  # ============================================================
395
- pp_autocorr_dir = pp_dir_dedup / "08_autocorrelations"
443
+ pp_autocorr_dir = spatial_dir_dedup / "08_autocorrelations"
396
444
 
397
445
  if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
398
446
  logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
@@ -735,10 +783,10 @@ def spatial_adata_core(
735
783
  # ============================================================
736
784
  # 4) Pearson / correlation matrices
737
785
  # ============================================================
738
- pp_corr_dir = pp_dir_dedup / "09_correlation_matrices"
786
+ corr_dir = spatial_dir_dedup / "09_correlation_matrices"
739
787
 
740
- if pp_corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
741
- logger.debug(f"{pp_corr_dir} already exists. Skipping correlation matrix plotting.")
788
+ if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
789
+ logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
742
790
  else:
743
791
  compute_positionwise_statistics(
744
792
  adata,
@@ -763,7 +811,7 @@ def spatial_adata_core(
763
811
  cmaps=cfg.correlation_matrix_cmaps,
764
812
  vmin=None,
765
813
  vmax=None,
766
- output_dir=pp_corr_dir,
814
+ output_dir=corr_dir,
767
815
  output_key="positionwise_result",
768
816
  )
769
817
 
smftools/cli_entry.py CHANGED
@@ -8,6 +8,7 @@ import click
8
8
  import pandas as pd
9
9
 
10
10
  from .cli.hmm_adata import hmm_adata
11
+ from .cli.latent_adata import latent_adata
11
12
  from .cli.load_adata import load_adata
12
13
  from .cli.preprocess_adata import preprocess_adata
13
14
  from .cli.spatial_adata import spatial_adata
@@ -103,6 +104,17 @@ def hmm(config_path):
103
104
  ##########################################
104
105
 
105
106
 
107
+ ####### Latent ###########
108
+ @cli.command()
109
+ @click.argument("config_path", type=click.Path(exists=True))
110
+ def latent(config_path):
111
+ """Process data from CONFIG_PATH."""
112
+ latent_adata(config_path)
113
+
114
+
115
+ ##########################################
116
+
117
+
106
118
  ####### batch command ###########
107
119
  @cli.command()
108
120
  @click.argument(
@@ -15,6 +15,16 @@ autocorr_site_types:
15
15
 
16
16
  # Spatial Analysis - Clustermap params
17
17
  layer_for_clustermap_plotting: 'nan0_0minus1'
18
+ rolling_nn_layer: "nan0_0minus1"
19
+ rolling_nn_plot_layer: "nan0_0minus1"
20
+ rolling_nn_window: 30
21
+ rolling_nn_step: 2
22
+ rolling_nn_min_overlap: 20
23
+ rolling_nn_return_fraction: true
24
+ rolling_nn_obsm_key: "rolling_nn_dist"
25
+ rolling_nn_site_types:
26
+ - "GpC"
27
+ - "CpG"
18
28
  clustermap_cmap_c: "coolwarm"
19
29
  clustermap_cmap_gpc: "coolwarm"
20
30
  clustermap_cmap_cpg: "viridis"
@@ -46,4 +56,4 @@ hmm_feature_sets:
46
56
  cpg_patch: [0, inf]
47
57
 
48
58
  hmm_merge_layer_features:
49
- - ["all_accessible_features", 60]
59
+ - ["all_accessible_features", 60]
@@ -18,8 +18,9 @@ conversions:
18
18
  fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
19
19
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
20
20
  input_already_demuxed: False # If the input files are already demultiplexed.
21
+
21
22
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
22
- delete_intermediate_bams: True # Whether to delete intermediate BAM files.
23
+ delete_intermediate_bams: False # Whether to delete intermediate BAM files.
23
24
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
24
25
 
25
26
  # Sequencing modality and general experiment params
@@ -77,6 +78,7 @@ aligner_args:
77
78
  # Sorted BAM and BED specific handling
78
79
  make_bigwigs: False # Whether to make coverage bigwigs
79
80
  make_beds: False # Whether to make beds from the aligned bams
81
+ annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
80
82
  samtools_backend: auto # auto|python|cli for samtools-compatible operations
81
83
  bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
82
84
  bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
@@ -90,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
90
92
  reference_column: 'Reference_strand'
91
93
  sample_column: 'Experiment_name_and_barcode'
92
94
 
95
+ # Plotting params
96
+ clustermap_demux_types_to_plot:
97
+ - "single"
98
+ - "double"
99
+ - "already"
100
+
93
101
  ######## smftools preprocess params #########
94
102
  # Read length, quality, and mapping filtering params
95
103
  read_coord_filter:
@@ -140,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
140
148
  - "CpG"
141
149
  - "ambiguous_GpC_CpG"
142
150
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
151
+ duplicate_detection_demux_types_to_use:
152
+ - "single"
153
+ - "double"
154
+ - "already"
143
155
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
144
156
  - Fraction_C_site_modified
145
157
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
@@ -151,6 +163,11 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
151
163
 
152
164
  # Position QC params
153
165
  position_max_nan_threshold: 0.1 # The maximum fraction of NaN values to tolerate in a position (column)
166
+ mismatch_frequency_range:
167
+ - 0.01
168
+ - 0.99
169
+ mismatch_frequency_layer: "mismatch_integer_encoding"
170
+ mismatch_frequency_read_span_layer: "read_span_mask"
154
171
 
155
172
  ######## smftools spatial params #########
156
173
  invert_adata: False # Whether to invert the AnnData along the positions axis.
@@ -169,6 +186,9 @@ clustermap_cmap_gpc: "coolwarm"
169
186
  clustermap_cmap_cpg: "coolwarm"
170
187
  clustermap_cmap_a: "coolwarm"
171
188
  spatial_clustermap_sortby: "gpc"
189
+ rolling_nn_site_types:
190
+ - "GpC"
191
+ - "CpG"
172
192
 
173
193
  # Spatial Analysis - UMAP/Leiden params
174
194
  layer_for_umap_plotting: 'nan_half'
@@ -243,6 +263,18 @@ hmm_feature_sets:
243
263
  mid_accessible_patch: [20, 40]
244
264
  large_accessible_patch: [40, 110]
245
265
  nucleosome_depleted_region: [110, inf]
266
+ hmm_feature_colormaps:
267
+ small_accessible_patch: "#A5D6A7"
268
+ mid_accessible_patch: "#2E7D32"
269
+ large_accessible_patch: "#006400"
270
+ nucleosome_depleted_region: "#00441B"
271
+ all_accessible_features: "#2E7D32"
272
+ small_bound_stretch: "#1E88E5"
273
+ medium_bound_stretch: "#6A1B9A"
274
+ large_bound_stretch: "#FB8C00"
275
+ putative_nucleosome: "#6D4C41"
276
+ all_footprint_features: "#6A1B9A"
277
+ cpg_patch: "#6D4C41"
246
278
  hmm_merge_layer_features:
247
279
  - ["all_accessible_features", 60]
248
280
  clustermap_cmap_hmm: "coolwarm"
@@ -259,6 +291,11 @@ hmm_clustermap_feature_layers:
259
291
  - medium_bound_stretch
260
292
  - putative_nucleosome
261
293
  - large_bound_stretch
294
+ - all_footprint_features
295
+ hmm_clustermap_length_layers:
296
+ - all_accessible_features
297
+ - all_accessible_features_merged
298
+ - all_footprint_features
262
299
  hmm_clustermap_sortby: "hmm"
263
300
  hmm_peak_feature_configs:
264
301
  all_accessible_features: