smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,26 @@
1
1
  # experiment_config.py
2
2
  from __future__ import annotations
3
+
3
4
  import ast
4
5
  import json
5
6
  import warnings
6
- from dataclasses import dataclass, field, asdict
7
+ from dataclasses import asdict, dataclass, field
7
8
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
10
+
11
+ from smftools.constants import (
12
+ BAM_SUFFIX,
13
+ BARCODE_BOTH_ENDS,
14
+ CONVERSIONS,
15
+ MOD_LIST,
16
+ MOD_MAP,
17
+ REF_COL,
18
+ SAMPLE_COL,
19
+ SPLIT_DIR,
20
+ STRANDS,
21
+ TRIM,
22
+ )
23
+
9
24
  from .discover_input_files import discover_input_files
10
25
 
11
26
  # Optional dependency for YAML handling
@@ -14,8 +29,8 @@ try:
14
29
  except Exception:
15
30
  yaml = None
16
31
 
17
- import pandas as pd
18
32
  import numpy as np
33
+ import pandas as pd
19
34
 
20
35
 
21
36
  # -------------------------
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
81
96
  except Exception:
82
97
  return fallback
83
98
 
99
+
84
100
  def _try_json_or_literal(s: Any) -> Any:
85
101
  """Try parse JSON or python literal; otherwise return original string."""
86
102
  if s is None:
@@ -123,8 +139,8 @@ def resolve_aligner_args(
123
139
  """
124
140
  # builtin defaults (aligner -> args)
125
141
  builtin_defaults = {
126
- "minimap2": ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
127
- "dorado": ['--mm2-opts', '-N', '5'],
142
+ "minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
143
+ "dorado": ["--mm2-opts", "-N", "5"],
128
144
  }
129
145
  if default_by_aligner is None:
130
146
  default_by_aligner = builtin_defaults
@@ -214,7 +230,7 @@ def resolve_aligner_args(
214
230
  return list(default_by_aligner.get(key_align, []))
215
231
 
216
232
 
217
- # HMM default params and hepler functions
233
+ # HMM default params and helper functions
218
234
  def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
219
235
  """
220
236
  Normalize user-provided `hmm_feature_sets` into canonical structure:
@@ -276,6 +292,59 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
276
292
  return canonical
277
293
 
278
294
 
295
+ def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
296
+ """
297
+ Normalize user-provided `hmm_peak_feature_configs` into:
298
+ {
299
+ layer_name: {
300
+ "min_distance": int,
301
+ "peak_width": int,
302
+ "peak_prominence": float,
303
+ "peak_threshold": float,
304
+ "rolling_window": int,
305
+ },
306
+ ...
307
+ }
308
+
309
+ Accepts dict, JSON/string, None. Returns {} for empty input.
310
+ """
311
+ if raw is None:
312
+ return {}
313
+
314
+ parsed = raw
315
+ if isinstance(raw, str):
316
+ parsed = _try_json_or_literal(raw)
317
+ if not isinstance(parsed, dict):
318
+ return {}
319
+
320
+ defaults = {
321
+ "min_distance": 200,
322
+ "peak_width": 200,
323
+ "peak_prominence": 0.2,
324
+ "peak_threshold": 0.8,
325
+ "rolling_window": 1,
326
+ }
327
+
328
+ out: Dict[str, dict] = {}
329
+ for layer, conf in parsed.items():
330
+ if conf is None:
331
+ conf = {}
332
+ if not isinstance(conf, dict):
333
+ # allow shorthand like 300 -> interpreted as peak_width
334
+ conf = {"peak_width": conf}
335
+
336
+ full = defaults.copy()
337
+ full.update(conf)
338
+ out[str(layer)] = {
339
+ "min_distance": int(full["min_distance"]),
340
+ "peak_width": int(full["peak_width"]),
341
+ "peak_prominence": float(full["peak_prominence"]),
342
+ "peak_threshold": float(full["peak_threshold"]),
343
+ "rolling_window": int(full["rolling_window"]),
344
+ }
345
+ return out
346
+
347
+
279
348
  # -------------------------
280
349
  # LoadExperimentConfig
281
350
  # -------------------------
@@ -313,12 +382,12 @@ class LoadExperimentConfig:
313
382
  df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
314
383
  # normalize column names
315
384
  df.columns = [c.strip() for c in df.columns]
316
- if 'variable' not in df.columns:
385
+ if "variable" not in df.columns:
317
386
  raise ValueError("Config CSV must contain a 'variable' column.")
318
- if 'value' not in df.columns:
319
- df['value'] = ''
320
- if 'type' not in df.columns:
321
- df['type'] = ''
387
+ if "value" not in df.columns:
388
+ df["value"] = ""
389
+ if "type" not in df.columns:
390
+ df["type"] = ""
322
391
  return df
323
392
 
324
393
  @staticmethod
@@ -337,9 +406,9 @@ class LoadExperimentConfig:
337
406
 
338
407
  def parse_bool(s: str):
339
408
  s2 = s.strip().lower()
340
- if s2 in ('1', 'true', 't', 'yes', 'y', 'on'):
409
+ if s2 in ("1", "true", "t", "yes", "y", "on"):
341
410
  return True
342
- if s2 in ('0', 'false', 'f', 'no', 'n', 'off'):
411
+ if s2 in ("0", "false", "f", "no", "n", "off"):
343
412
  return False
344
413
  raise ValueError(f"Cannot parse boolean from '{s}'")
345
414
 
@@ -359,18 +428,18 @@ class LoadExperimentConfig:
359
428
  except Exception:
360
429
  pass
361
430
  # fallback split
362
- parts = [p.strip() for p in s.strip("()[] ").split(',') if p.strip() != ""]
431
+ parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
363
432
  return parts
364
433
 
365
- if hint in ('int', 'integer'):
434
+ if hint in ("int", "integer"):
366
435
  return int(v)
367
- if hint in ('float', 'double'):
436
+ if hint in ("float", "double"):
368
437
  return float(v)
369
- if hint in ('bool', 'boolean'):
438
+ if hint in ("bool", "boolean"):
370
439
  return parse_bool(v)
371
- if hint in ('list', 'array'):
440
+ if hint in ("list", "array"):
372
441
  return parse_list_like(v)
373
- if hint in ('string', 'str'):
442
+ if hint in ("string", "str"):
374
443
  return v
375
444
 
376
445
  # infer
@@ -396,27 +465,31 @@ class LoadExperimentConfig:
396
465
  return lit
397
466
  except Exception:
398
467
  pass
399
- if (',' in v) and (not any(ch in v for ch in '{}[]()')):
400
- return [p.strip() for p in v.split(',') if p.strip() != ""]
468
+ if ("," in v) and (not any(ch in v for ch in "{}[]()")):
469
+ return [p.strip() for p in v.split(",") if p.strip() != ""]
401
470
  return v
402
471
 
403
472
  def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
404
473
  parsed: Dict[str, Any] = {}
405
474
  for idx, row in df.iterrows():
406
- name = str(row['variable']).strip()
475
+ name = str(row["variable"]).strip()
407
476
  if name == "":
408
477
  continue
409
- raw_val = row.get('value', "")
410
- raw_type = row.get('type', "")
478
+ raw_val = row.get("value", "")
479
+ raw_type = row.get("type", "")
411
480
  if pd.isna(raw_val) or str(raw_val).strip() == "":
412
481
  raw_val = None
413
482
  try:
414
483
  parsed_val = self._parse_value_as_type(raw_val, raw_type)
415
484
  except Exception as e:
416
- warnings.warn(f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value.")
485
+ warnings.warn(
486
+ f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
487
+ )
417
488
  parsed_val = None if raw_val is None else raw_val
418
489
  if name in parsed:
419
- warnings.warn(f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value.")
490
+ warnings.warn(
491
+ f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
492
+ )
420
493
  parsed[name] = parsed_val
421
494
  return parsed
422
495
 
@@ -424,7 +497,7 @@ class LoadExperimentConfig:
424
497
  """Return parsed config as a pandas DataFrame (variable, value)."""
425
498
  rows = []
426
499
  for k, v in self.var_dict.items():
427
- rows.append({'variable': k, 'value': v})
500
+ rows.append({"variable": k, "value": v})
428
501
  return pd.DataFrame(rows)
429
502
 
430
503
 
@@ -592,17 +665,17 @@ class ExperimentConfig:
592
665
  input_data_path: Optional[str] = None
593
666
  output_directory: Optional[str] = None
594
667
  fasta: Optional[str] = None
595
- bam_suffix: str = ".bam"
668
+ bam_suffix: str = BAM_SUFFIX
596
669
  recursive_input_search: bool = True
597
670
  input_type: Optional[str] = None
598
671
  input_files: Optional[List[Path]] = None
599
- split_dir: str = "demultiplexed_BAMs"
672
+ split_dir: str = SPLIT_DIR
600
673
  split_path: Optional[str] = None
601
- strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
602
- conversions: List[str] = field(default_factory=lambda: ["unconverted"])
674
+ strands: List[str] = field(default_factory=lambda: STRANDS)
675
+ conversions: List[str] = field(default_factory=lambda: CONVERSIONS)
603
676
  fasta_regions_of_interest: Optional[str] = None
604
677
  sample_sheet_path: Optional[str] = None
605
- sample_sheet_mapping_column: Optional[str] = 'Barcode'
678
+ sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
606
679
  experiment_name: Optional[str] = None
607
680
  input_already_demuxed: bool = False
608
681
  summary_file: Optional[Path] = None
@@ -612,7 +685,7 @@ class ExperimentConfig:
612
685
  fastq_auto_pairing: bool = True
613
686
 
614
687
  # Remove intermediate file options
615
- delete_intermediate_bams: bool = True
688
+ delete_intermediate_bams: bool = False
616
689
  delete_intermediate_tsvs: bool = True
617
690
 
618
691
  # Conversion/Deamination file handling
@@ -638,8 +711,8 @@ class ExperimentConfig:
638
711
  model_dir: Optional[str] = None
639
712
  barcode_kit: Optional[str] = None
640
713
  model: str = "hac"
641
- barcode_both_ends: bool = False
642
- trim: bool = False
714
+ barcode_both_ends: bool = BARCODE_BOTH_ENDS
715
+ trim: bool = TRIM
643
716
  # General basecalling params
644
717
  filter_threshold: float = 0.8
645
718
  # Modified basecalling specific params
@@ -647,53 +720,102 @@ class ExperimentConfig:
647
720
  m5C_threshold: float = 0.7
648
721
  hm5C_threshold: float = 0.7
649
722
  thresholds: List[float] = field(default_factory=list)
650
- mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
723
+ mod_list: List[str] = field(
724
+ default_factory=lambda: list(MOD_LIST)
725
+ ) # Dorado modified basecalling codes
726
+ mod_map: Dict[str, str] = field(
727
+ default_factory=lambda: dict(MOD_MAP)
728
+ ) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
651
729
 
652
730
  # Alignment params
653
- mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
654
- aligner: str = "minimap2"
731
+ mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
732
+ align_from_bam: bool = (
733
+ False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
734
+ )
735
+ aligner: str = "dorado"
655
736
  aligner_args: Optional[List[str]] = None
656
737
  make_bigwigs: bool = False
657
738
  make_beds: bool = False
658
739
 
659
740
  # Anndata structure
660
- reference_column: Optional[str] = 'Reference_strand'
661
- sample_column: Optional[str] = 'Barcode'
741
+ reference_column: Optional[str] = REF_COL
742
+ sample_column: Optional[str] = SAMPLE_COL
662
743
 
663
744
  # General Plotting
664
- sample_name_col_for_plotting: Optional[str] = 'Barcode'
745
+ sample_name_col_for_plotting: Optional[str] = "Barcode"
665
746
  rows_per_qc_histogram_grid: int = 12
666
747
 
667
748
  # Preprocessing - Read length and quality filter params
668
749
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
669
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
670
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
671
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
672
- read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
750
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(
751
+ default_factory=lambda: [100, None]
752
+ )
753
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
754
+ default_factory=lambda: [0.4, 1.5]
755
+ )
756
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(
757
+ default_factory=lambda: [15, None]
758
+ )
759
+ read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
760
+ default_factory=lambda: [None, None]
761
+ )
762
+
763
+ # Preprocessing - Optional reindexing params
764
+ reindexing_offsets: Dict[str, int] = field(default_factory=dict)
765
+ reindexed_var_suffix: Optional[str] = "reindexed"
673
766
 
674
767
  # Preprocessing - Direct mod detection binarization params
675
- fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
676
- binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
677
- positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
678
- negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
679
- infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
680
- inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
681
- fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
768
+ fit_position_methylation_thresholds: Optional[bool] = (
769
+ False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
770
+ )
771
+ binarize_on_fixed_methlyation_threshold: Optional[float] = (
772
+ 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
773
+ )
774
+ positive_control_sample_methylation_fitting: Optional[str] = (
775
+ None # A positive control Sample_name to use for fully modified template data
776
+ )
777
+ negative_control_sample_methylation_fitting: Optional[str] = (
778
+ None # A negative control Sample_name to use for fully unmodified template data
779
+ )
780
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = (
781
+ 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
782
+ )
783
+ inference_variable_sample_methylation_fitting: Optional[str] = (
784
+ "Raw_modification_signal" # The obs column value used for the percentile metric above.
785
+ )
786
+ fit_j_threshold: Optional[float] = (
787
+ 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
788
+ )
682
789
  output_binary_layer_name: Optional[str] = "binarized_methylation"
683
790
 
684
791
  # Preprocessing - Read modification filter params
685
792
  read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
686
793
  read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
687
- read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
794
+ read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
688
795
  read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
689
796
  read_mod_filtering_use_other_c_as_background: bool = True
690
797
  min_valid_fraction_positions_in_read_vs_ref: float = 0.2
691
798
 
799
+ # Preprocessing - plotting params
800
+ obs_to_plot_pp_qc: List[str] = field(
801
+ default_factory=lambda: [
802
+ "read_length",
803
+ "mapped_length",
804
+ "read_quality",
805
+ "mapping_quality",
806
+ "mapped_length_to_reference_length_ratio",
807
+ "mapped_length_to_read_length_ratio",
808
+ "Raw_modification_signal",
809
+ ]
810
+ )
811
+
692
812
  # Preprocessing - Duplicate detection params
693
- duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
813
+ duplicate_detection_site_types: List[str] = field(
814
+ default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
815
+ )
694
816
  duplicate_detection_distance_threshold: float = 0.07
695
- hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
696
- duplicate_detection_keep_best_metric: str ='read_quality'
817
+ hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
818
+ duplicate_detection_keep_best_metric: str = "read_quality"
697
819
  duplicate_detection_window_size_for_hamming_neighbors: int = 50
698
820
  duplicate_detection_min_overlapping_positions: int = 20
699
821
  duplicate_detection_do_hierarchical: bool = True
@@ -703,28 +825,38 @@ class ExperimentConfig:
703
825
  # Preprocessing - Position QC
704
826
  position_max_nan_threshold: float = 0.1
705
827
 
706
- # Basic Analysis - Clustermap params
707
- layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
708
-
709
- # Basic Analysis - UMAP/Leiden params
710
- layer_for_umap_plotting: Optional[str] = 'nan_half'
711
- umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
712
-
713
- # Basic Analysis - Spatial Autocorrelation params
828
+ # Spatial Analysis - Clustermap params
829
+ layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
830
+ clustermap_cmap_c: Optional[str] = "coolwarm"
831
+ clustermap_cmap_gpc: Optional[str] = "coolwarm"
832
+ clustermap_cmap_cpg: Optional[str] = "coolwarm"
833
+ clustermap_cmap_a: Optional[str] = "coolwarm"
834
+ spatial_clustermap_sortby: Optional[str] = "gpc"
835
+
836
+ # Spatial Analysis - UMAP/Leiden params
837
+ layer_for_umap_plotting: Optional[str] = "nan_half"
838
+ umap_layers_to_plot: List[str] = field(
839
+ default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
840
+ )
841
+
842
+ # Spatial Analysis - Spatial Autocorrelation params
843
+ autocorr_normalization_method: str = "pearson"
714
844
  rows_per_qc_autocorr_grid: int = 12
715
845
  autocorr_rolling_window_size: int = 25
716
846
  autocorr_max_lag: int = 800
717
- autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
847
+ autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])
718
848
 
719
- # Basic Analysis - Correlation Matrix params
720
- correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
721
- correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
722
- correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
849
+ # Spatial Analysis - Correlation Matrix params
850
+ correlation_matrix_types: List[str] = field(
851
+ default_factory=lambda: ["pearson", "binary_covariance"]
852
+ )
853
+ correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
854
+ correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
723
855
 
724
856
  # HMM params
725
857
  hmm_n_states: int = 2
726
- hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
727
- hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
858
+ hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
859
+ hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
728
860
  hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
729
861
  hmm_eps: float = 1e-8
730
862
  hmm_dtype: str = "float64"
@@ -732,12 +864,29 @@ class ExperimentConfig:
732
864
  hmm_batch_size: int = 1024
733
865
  hmm_use_viterbi: bool = False
734
866
  hmm_device: Optional[str] = None
735
- hmm_methbases: Optional[List[str]] = None # if None, HMM.annotate_adata will fall back to mod_target_bases
867
+ hmm_methbases: Optional[List[str]] = (
868
+ None # if None, HMM.annotate_adata will fall back to mod_target_bases
869
+ )
870
+ # HMM fitting/application strategy
871
+ hmm_fit_strategy: str = "per_group" # "per_group" | "shared_transitions"
872
+ hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
873
+ hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
874
+ # Shared-transitions adaptation behavior
875
+ hmm_adapt_emissions: bool = True
876
+ hmm_adapt_startprobs: bool = True
877
+ hmm_emission_adapt_iters: int = 5
878
+ hmm_emission_adapt_tol: float = 1e-4
736
879
  footprints: Optional[bool] = True
737
880
  accessible_patches: Optional[bool] = True
738
881
  cpg: Optional[bool] = False
739
882
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
740
- hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
883
+ hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
884
+ clustermap_cmap_hmm: Optional[str] = "coolwarm"
885
+ hmm_clustermap_feature_layers: List[str] = field(
886
+ default_factory=lambda: ["all_accessible_features"]
887
+ )
888
+ hmm_clustermap_sortby: Optional[str] = "hmm"
889
+ hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
741
890
 
742
891
  # Pipeline control flow - load adata
743
892
  force_redo_load_adata: bool = False
@@ -760,11 +909,11 @@ class ExperimentConfig:
760
909
  force_redo_filter_reads_on_modification_thresholds: bool = False
761
910
  bypass_flag_duplicate_reads: bool = False
762
911
  force_redo_flag_duplicate_reads: bool = False
763
- bypass_complexity_analysis: bool = False
912
+ bypass_complexity_analysis: bool = False
764
913
  force_redo_complexity_analysis: bool = False
765
914
 
766
- # Pipeline control flow - Basic Analyses
767
- force_redo_basic_analyses: bool = False
915
+ # Pipeline control flow - Spatial Analyses
916
+ force_redo_spatial_analyses: bool = False
768
917
  bypass_basic_clustermaps: bool = False
769
918
  force_redo_basic_clustermaps: bool = False
770
919
  bypass_basic_umap: bool = False
@@ -840,7 +989,9 @@ class ExperimentConfig:
840
989
  defaults_loaded = dict(defaults_map[modality] or {})
841
990
  defaults_source_chain = [f"defaults_map['{modality}']"]
842
991
  elif defaults_dir is not None:
843
- defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(defaults_dir, modality)
992
+ defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
993
+ defaults_dir, modality
994
+ )
844
995
 
845
996
  # If CSV asks to extend defaults, load those and merge
846
997
  merged = dict(defaults_loaded or {})
@@ -855,7 +1006,11 @@ class ExperimentConfig:
855
1006
  else:
856
1007
  ext_list = []
857
1008
  for ext in ext_list:
858
- ext_defaults, ext_sources = (load_defaults_with_inheritance(defaults_dir, ext) if defaults_dir else ({}, []))
1009
+ ext_defaults, ext_sources = (
1010
+ load_defaults_with_inheritance(defaults_dir, ext)
1011
+ if defaults_dir
1012
+ else ({}, [])
1013
+ )
859
1014
  merged = deep_merge(merged, ext_defaults)
860
1015
  for s in ext_sources:
861
1016
  if s not in defaults_source_chain:
@@ -885,34 +1040,40 @@ class ExperimentConfig:
885
1040
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
886
1041
 
887
1042
  # Input file types and path handling
888
- input_data_path = Path(merged['input_data_path'])
1043
+ input_data_path = Path(merged["input_data_path"])
889
1044
 
890
1045
  # Detect the input filetype
891
1046
  if input_data_path.is_file():
892
- suffix = input_data_path.suffix.lower()
893
- suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
894
-
895
- # recognize multi-suffix cases like .fastq.gz or .fq.gz
896
- if any(s in ['.pod5', '.p5'] for s in suffixes):
897
- input_type = "pod5"
898
- input_files = [Path(input_data_path)]
899
- elif any(s in ['.fast5', '.f5'] for s in suffixes):
900
- input_type = "fast5"
901
- input_files = [Path(input_data_path)]
902
- elif any(s in ['.fastq', '.fq'] for s in suffixes):
903
- input_type = "fastq"
904
- input_files = [Path(input_data_path)]
905
- elif any(s in ['.bam'] for s in suffixes):
906
- input_type = "bam"
907
- input_files = [Path(input_data_path)]
908
- elif any(s in ['.h5ad', ".h5"] for s in suffixes):
909
- input_type = "h5ad"
910
- input_files = [Path(input_data_path)]
911
- else:
912
- print("Error detecting input file type")
1047
+ suffix = input_data_path.suffix.lower()
1048
+ suffixes = [
1049
+ s.lower() for s in input_data_path.suffixes
1050
+ ] # handles multi-part extensions
1051
+
1052
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
1053
+ if any(s in [".pod5", ".p5"] for s in suffixes):
1054
+ input_type = "pod5"
1055
+ input_files = [Path(input_data_path)]
1056
+ elif any(s in [".fast5", ".f5"] for s in suffixes):
1057
+ input_type = "fast5"
1058
+ input_files = [Path(input_data_path)]
1059
+ elif any(s in [".fastq", ".fq"] for s in suffixes):
1060
+ input_type = "fastq"
1061
+ input_files = [Path(input_data_path)]
1062
+ elif any(s in [".bam"] for s in suffixes):
1063
+ input_type = "bam"
1064
+ input_files = [Path(input_data_path)]
1065
+ elif any(s in [".h5ad", ".h5"] for s in suffixes):
1066
+ input_type = "h5ad"
1067
+ input_files = [Path(input_data_path)]
1068
+ else:
1069
+ print("Error detecting input file type")
913
1070
 
914
1071
  elif input_data_path.is_dir():
915
- found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
1072
+ found = discover_input_files(
1073
+ input_data_path,
1074
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1075
+ recursive=merged["recursive_input_search"],
1076
+ )
916
1077
 
917
1078
  if found["input_is_pod5"]:
918
1079
  input_type = "pod5"
@@ -930,15 +1091,22 @@ class ExperimentConfig:
930
1091
  input_type = "h5ad"
931
1092
  input_files = found["h5ad_paths"]
932
1093
 
933
- print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
1094
+ print(
1095
+ f"Found {found['all_files_searched']} files; "
1096
+ f"fastq={len(found['fastq_paths'])}, "
1097
+ f"bam={len(found['bam_paths'])}, "
1098
+ f"pod5={len(found['pod5_paths'])}, "
1099
+ f"fast5={len(found['fast5_paths'])}, "
1100
+ f"h5ad={len(found['h5ad_paths'])}"
1101
+ )
934
1102
 
935
1103
  # summary file output path
936
- output_dir = Path(merged['output_directory'])
937
- summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
1104
+ output_dir = Path(merged["output_directory"])
1105
+ summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
938
1106
  summary_file = output_dir / summary_file_basename
939
1107
 
940
1108
  # Demultiplexing output path
941
- split_dir = merged.get("split_dir", "demultiplexed_BAMs")
1109
+ split_dir = merged.get("split_dir", SPLIT_DIR)
942
1110
  split_path = output_dir / split_dir
943
1111
 
944
1112
  # final normalization
@@ -962,7 +1130,14 @@ class ExperimentConfig:
962
1130
  merged["hm5C_threshold"],
963
1131
  ]
964
1132
 
965
- for bkey in ("barcode_both_ends", "trim", "input_already_demuxed", "make_bigwigs", "skip_unclassified", "delete_batch_hdfs"):
1133
+ for bkey in (
1134
+ "barcode_both_ends",
1135
+ "trim",
1136
+ "input_already_demuxed",
1137
+ "make_bigwigs",
1138
+ "skip_unclassified",
1139
+ "delete_batch_hdfs",
1140
+ ):
966
1141
  if bkey in merged:
967
1142
  merged[bkey] = _parse_bool(merged[bkey])
968
1143
 
@@ -971,16 +1146,19 @@ class ExperimentConfig:
971
1146
  if "threads" in merged:
972
1147
  tval = _parse_numeric(merged.get("threads", None), None)
973
1148
  merged["threads"] = None if tval is None else int(tval)
974
-
1149
+
975
1150
  if "aligner_args" in merged and merged.get("aligner_args") is None:
976
1151
  merged.pop("aligner_args", None)
977
1152
 
978
1153
  # --- Resolve aligner_args into concrete list for the chosen aligner ---
979
- merged['aligner_args'] = resolve_aligner_args(merged)
1154
+ merged["aligner_args"] = resolve_aligner_args(merged)
980
1155
 
981
1156
  if "mod_list" in merged:
982
1157
  merged["mod_list"] = _parse_list(merged.get("mod_list"))
983
1158
 
1159
+ # Preprocessing args
1160
+ obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
1161
+
984
1162
  # HMM feature set handling
985
1163
  if "hmm_feature_sets" in merged:
986
1164
  merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
@@ -988,11 +1166,22 @@ class ExperimentConfig:
988
1166
  # allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
989
1167
  maybe_fs = {}
990
1168
  if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
991
- maybe_fs["footprint"] = {"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")), "state": merged.get("hmm_footprint_state", "Non-Modified")}
1169
+ maybe_fs["footprint"] = {
1170
+ "features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
1171
+ "state": merged.get("hmm_footprint_state", "Non-Modified"),
1172
+ }
992
1173
  if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
993
- maybe_fs["accessible"] = {"features": merged.get("hmm_accessible_ranges", merged.get("accessible_ranges")), "state": merged.get("hmm_accessible_state", "Modified")}
1174
+ maybe_fs["accessible"] = {
1175
+ "features": merged.get(
1176
+ "hmm_accessible_ranges", merged.get("accessible_ranges")
1177
+ ),
1178
+ "state": merged.get("hmm_accessible_state", "Modified"),
1179
+ }
994
1180
  if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
995
- maybe_fs["cpg"] = {"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")), "state": merged.get("hmm_cpg_state", "Modified")}
1181
+ maybe_fs["cpg"] = {
1182
+ "features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
1183
+ "state": merged.get("hmm_cpg_state", "Modified"),
1184
+ }
996
1185
  if maybe_fs:
997
1186
  merged.setdefault("hmm_feature_sets", {})
998
1187
  for k, v in maybe_fs.items():
@@ -1013,158 +1202,278 @@ class ExperimentConfig:
1013
1202
  if not hmm_methbases: # None or []
1014
1203
  hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
1015
1204
  if not hmm_methbases:
1016
- hmm_methbases = ['C']
1205
+ hmm_methbases = ["C"]
1017
1206
  hmm_methbases = list(hmm_methbases)
1018
1207
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
1208
+ hmm_clustermap_feature_layers = _parse_list(
1209
+ merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
1210
+ )
1211
+
1212
+ hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
1213
+ hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
1214
+ hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
1215
+
1216
+ hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
1217
+ hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
1218
+ hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
1219
+ hmm_emission_adapt_tol = float(
1220
+ _parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
1221
+ )
1222
+
1223
+ # HMM peak feature configs (for call_hmm_peaks)
1224
+ merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
1225
+ merged.get("hmm_peak_feature_configs", {})
1226
+ )
1227
+ hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
1019
1228
 
1020
1229
  # instantiate dataclass
1021
1230
  instance = cls(
1022
- smf_modality = merged.get("smf_modality"),
1023
- input_data_path = input_data_path,
1024
- recursive_input_search = merged.get("recursive_input_search"),
1025
- input_type = input_type,
1026
- input_files = input_files,
1027
- output_directory = output_dir,
1028
- summary_file = summary_file,
1029
- fasta = merged.get("fasta"),
1030
- sequencer = merged.get("sequencer"),
1031
- model_dir = merged.get("model_dir"),
1032
- barcode_kit = merged.get("barcode_kit"),
1033
- fastq_barcode_map = merged.get("fastq_barcode_map"),
1034
- fastq_auto_pairing = merged.get("fastq_auto_pairing"),
1035
- bam_suffix = merged.get("bam_suffix", ".bam"),
1036
- split_dir = split_dir,
1037
- split_path = split_path,
1038
- strands = merged.get("strands", ["bottom","top"]),
1039
- conversions = merged.get("conversions", ["unconverted"]),
1040
- fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
1041
- mapping_threshold = float(merged.get("mapping_threshold", 0.01)),
1042
- experiment_name = merged.get("experiment_name"),
1043
- model = merged.get("model", "hac"),
1044
- barcode_both_ends = merged.get("barcode_both_ends", False),
1045
- trim = merged.get("trim", False),
1046
- input_already_demuxed = merged.get("input_already_demuxed", False),
1047
- threads = merged.get("threads"),
1048
- sample_sheet_path = merged.get("sample_sheet_path"),
1049
- sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1050
- delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
1051
- delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
1052
- aligner = merged.get("aligner", "minimap2"),
1053
- aligner_args = merged.get("aligner_args", None),
1054
- device = merged.get("device", "auto"),
1055
- make_bigwigs = merged.get("make_bigwigs", False),
1056
- make_beds = merged.get("make_beds", False),
1057
- delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
1058
- mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
1059
- enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
1060
- conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
1061
- filter_threshold = merged.get("filter_threshold", 0.8),
1062
- m6A_threshold = merged.get("m6A_threshold", 0.7),
1063
- m5C_threshold = merged.get("m5C_threshold", 0.7),
1064
- hm5C_threshold = merged.get("hm5C_threshold", 0.7),
1065
- thresholds = merged.get("thresholds", []),
1066
- mod_list = merged.get("mod_list", ["5mC_5hmC","6mA"]),
1067
- batch_size = merged.get("batch_size", 4),
1068
- skip_unclassified = merged.get("skip_unclassified", True),
1069
- delete_batch_hdfs = merged.get("delete_batch_hdfs", True),
1070
- reference_column = merged.get("reference_column", 'Reference_strand'),
1071
- sample_column = merged.get("sample_column", 'Barcode'),
1072
- sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1073
- fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1074
- binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1075
- positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1076
- negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1077
- infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1078
- inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1079
- fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1080
- output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
1081
- layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
1082
- layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
1083
- umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
1084
- rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
1085
- rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
1086
- autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
1087
- autocorr_max_lag = merged.get("autocorr_max_lag", 800),
1088
- autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'any_C']),
1089
- hmm_n_states = merged.get("hmm_n_states", 2),
1090
- hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
1091
- hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
1092
- hmm_init_start_probs = merged.get("hmm_init_start_probs",[0.5, 0.5]),
1093
- hmm_eps = merged.get("hmm_eps", 1e-8),
1094
- hmm_dtype = merged.get("hmm_dtype", "float64"),
1095
- hmm_feature_sets = hmm_feature_sets,
1096
- hmm_annotation_threshold = hmm_annotation_threshold,
1097
- hmm_batch_size = hmm_batch_size,
1098
- hmm_use_viterbi = hmm_use_viterbi,
1099
- hmm_methbases = hmm_methbases,
1100
- hmm_device = hmm_device,
1101
- hmm_merge_layer_features = hmm_merge_layer_features,
1102
- footprints = merged.get("footprints", None),
1103
- accessible_patches = merged.get("accessible_patches", None),
1104
- cpg = merged.get("cpg", None),
1105
- read_coord_filter = merged.get("read_coord_filter", [None, None]),
1106
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1107
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1108
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1109
- read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1110
- read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1111
- read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
1112
- read_mod_filtering_any_c_thresholds = merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
1113
- read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
1114
- read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
1115
- min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
1116
- duplicate_detection_site_types = merged.get("duplicate_detection_site_types", ['GpC', 'CpG', 'ambiguous_GpC_CpG']),
1117
- duplicate_detection_distance_threshold = merged.get("duplicate_detection_distance_threshold", 0.07),
1118
- duplicate_detection_keep_best_metric = merged.get("duplicate_detection_keep_best_metric", "read_quality"),
1119
- duplicate_detection_window_size_for_hamming_neighbors = merged.get("duplicate_detection_window_size_for_hamming_neighbors", 50),
1120
- duplicate_detection_min_overlapping_positions = merged.get("duplicate_detection_min_overlapping_positions", 20),
1121
- duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1122
- duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1123
- duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1124
- position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1125
- correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1126
- correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1127
- correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1128
- hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
1129
- force_redo_load_adata = merged.get("force_redo_load_adata", False),
1130
- force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1131
- force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1132
- bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
1133
- force_redo_add_read_length_and_mapping_qc = merged.get("force_redo_add_read_length_and_mapping_qc", False),
1134
- bypass_clean_nan = merged.get("bypass_clean_nan", False),
1135
- force_redo_clean_nan = merged.get("force_redo_clean_nan", False),
1136
- bypass_append_base_context = merged.get("bypass_append_base_context", False),
1137
- force_redo_append_base_context = merged.get("force_redo_append_base_context", False),
1138
- invert_adata = merged.get("invert_adata", False),
1139
- bypass_append_binary_layer_by_base_context = merged.get("bypass_append_binary_layer_by_base_context", False),
1140
- force_redo_append_binary_layer_by_base_context = merged.get("force_redo_append_binary_layer_by_base_context", False),
1141
- bypass_calculate_read_modification_stats = merged.get("bypass_calculate_read_modification_stats", False),
1142
- force_redo_calculate_read_modification_stats = merged.get("force_redo_calculate_read_modification_stats", False),
1143
- bypass_filter_reads_on_modification_thresholds = merged.get("bypass_filter_reads_on_modification_thresholds", False),
1144
- force_redo_filter_reads_on_modification_thresholds = merged.get("force_redo_filter_reads_on_modification_thresholds", False),
1145
- bypass_flag_duplicate_reads = merged.get("bypass_flag_duplicate_reads", False),
1146
- force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1147
- bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1148
- force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1149
- force_redo_basic_analyses = merged.get("force_redo_basic_analyses", False),
1150
- bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1151
- force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1152
- bypass_basic_umap = merged.get("bypass_basic_umap", False),
1153
- force_redo_basic_umap = merged.get("force_redo_basic_umap", False),
1154
- bypass_spatial_autocorr_calculations = merged.get("bypass_spatial_autocorr_calculations", False),
1155
- force_redo_spatial_autocorr_calculations = merged.get("force_redo_spatial_autocorr_calculations", False),
1156
- bypass_spatial_autocorr_plotting = merged.get("bypass_spatial_autocorr_plotting", False),
1157
- force_redo_spatial_autocorr_plotting = merged.get("force_redo_spatial_autocorr_plotting", False),
1158
- bypass_matrix_corr_calculations = merged.get("bypass_matrix_corr_calculations", False),
1159
- force_redo_matrix_corr_calculations = merged.get("force_redo_matrix_corr_calculations", False),
1160
- bypass_matrix_corr_plotting = merged.get("bypass_matrix_corr_plotting", False),
1161
- force_redo_matrix_corr_plotting = merged.get("force_redo_matrix_corr_plotting", False),
1162
- bypass_hmm_fit = merged.get("bypass_hmm_fit", False),
1163
- force_redo_hmm_fit = merged.get("force_redo_hmm_fit", False),
1164
- bypass_hmm_apply = merged.get("bypass_hmm_apply", False),
1165
- force_redo_hmm_apply = merged.get("force_redo_hmm_apply", False),
1166
-
1167
- config_source = config_source or "<var_dict>",
1231
+ smf_modality=merged.get("smf_modality"),
1232
+ input_data_path=input_data_path,
1233
+ recursive_input_search=merged.get("recursive_input_search"),
1234
+ input_type=input_type,
1235
+ input_files=input_files,
1236
+ output_directory=output_dir,
1237
+ summary_file=summary_file,
1238
+ fasta=merged.get("fasta"),
1239
+ sequencer=merged.get("sequencer"),
1240
+ model_dir=merged.get("model_dir"),
1241
+ barcode_kit=merged.get("barcode_kit"),
1242
+ fastq_barcode_map=merged.get("fastq_barcode_map"),
1243
+ fastq_auto_pairing=merged.get("fastq_auto_pairing"),
1244
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1245
+ split_dir=split_dir,
1246
+ split_path=split_path,
1247
+ strands=merged.get("strands", STRANDS),
1248
+ conversions=merged.get("conversions", CONVERSIONS),
1249
+ fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
1250
+ mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
1251
+ experiment_name=merged.get("experiment_name"),
1252
+ model=merged.get("model", "hac"),
1253
+ barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
1254
+ trim=merged.get("trim", TRIM),
1255
+ input_already_demuxed=merged.get("input_already_demuxed", False),
1256
+ threads=merged.get("threads"),
1257
+ sample_sheet_path=merged.get("sample_sheet_path"),
1258
+ sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
1259
+ delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
1260
+ delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
1261
+ align_from_bam=merged.get("align_from_bam", False),
1262
+ aligner=merged.get("aligner", "minimap2"),
1263
+ aligner_args=merged.get("aligner_args", None),
1264
+ device=merged.get("device", "auto"),
1265
+ make_bigwigs=merged.get("make_bigwigs", False),
1266
+ make_beds=merged.get("make_beds", False),
1267
+ delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
1268
+ mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
1269
+ enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
1270
+ conversion_types=merged.get("conversions", ["unconverted"])
1271
+ + merged.get("conversion_types", ["5mC"]),
1272
+ filter_threshold=merged.get("filter_threshold", 0.8),
1273
+ m6A_threshold=merged.get("m6A_threshold", 0.7),
1274
+ m5C_threshold=merged.get("m5C_threshold", 0.7),
1275
+ hm5C_threshold=merged.get("hm5C_threshold", 0.7),
1276
+ thresholds=merged.get("thresholds", []),
1277
+ mod_list=merged.get("mod_list", list(MOD_LIST)),
1278
+ mod_map=merged.get("mod_map", list(MOD_MAP)),
1279
+ batch_size=merged.get("batch_size", 4),
1280
+ skip_unclassified=merged.get("skip_unclassified", True),
1281
+ delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
1282
+ reference_column=merged.get("reference_column", REF_COL),
1283
+ sample_column=merged.get("sample_column", SAMPLE_COL),
1284
+ sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
1285
+ obs_to_plot_pp_qc=obs_to_plot_pp_qc,
1286
+ fit_position_methylation_thresholds=merged.get(
1287
+ "fit_position_methylation_thresholds", False
1288
+ ),
1289
+ binarize_on_fixed_methlyation_threshold=merged.get(
1290
+ "binarize_on_fixed_methlyation_threshold", 0.7
1291
+ ),
1292
+ positive_control_sample_methylation_fitting=merged.get(
1293
+ "positive_control_sample_methylation_fitting", None
1294
+ ),
1295
+ negative_control_sample_methylation_fitting=merged.get(
1296
+ "negative_control_sample_methylation_fitting", None
1297
+ ),
1298
+ infer_on_percentile_sample_methylation_fitting=merged.get(
1299
+ "infer_on_percentile_sample_methylation_fitting", 10
1300
+ ),
1301
+ inference_variable_sample_methylation_fitting=merged.get(
1302
+ "inference_variable_sample_methylation_fitting", "Raw_modification_signal"
1303
+ ),
1304
+ fit_j_threshold=merged.get("fit_j_threshold", 0.5),
1305
+ output_binary_layer_name=merged.get(
1306
+ "output_binary_layer_name", "binarized_methylation"
1307
+ ),
1308
+ reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
1309
+ reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
1310
+ layer_for_clustermap_plotting=merged.get(
1311
+ "layer_for_clustermap_plotting", "nan0_0minus1"
1312
+ ),
1313
+ clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
1314
+ clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
1315
+ clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
1316
+ clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
1317
+ spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
1318
+ layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
1319
+ umap_layers_to_plot=merged.get(
1320
+ "umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
1321
+ ),
1322
+ rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
1323
+ rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
1324
+ autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
1325
+ autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
1326
+ autocorr_max_lag=merged.get("autocorr_max_lag", 800),
1327
+ autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
1328
+ hmm_n_states=merged.get("hmm_n_states", 2),
1329
+ hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
1330
+ hmm_init_transition_probs=merged.get(
1331
+ "hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
1332
+ ),
1333
+ hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
1334
+ hmm_eps=merged.get("hmm_eps", 1e-8),
1335
+ hmm_fit_strategy=hmm_fit_strategy,
1336
+ hmm_shared_scope=hmm_shared_scope,
1337
+ hmm_groupby=hmm_groupby,
1338
+ hmm_adapt_emissions=hmm_adapt_emissions,
1339
+ hmm_adapt_startprobs=hmm_adapt_startprobs,
1340
+ hmm_emission_adapt_iters=hmm_emission_adapt_iters,
1341
+ hmm_emission_adapt_tol=hmm_emission_adapt_tol,
1342
+ hmm_dtype=merged.get("hmm_dtype", "float64"),
1343
+ hmm_feature_sets=hmm_feature_sets,
1344
+ hmm_annotation_threshold=hmm_annotation_threshold,
1345
+ hmm_batch_size=hmm_batch_size,
1346
+ hmm_use_viterbi=hmm_use_viterbi,
1347
+ hmm_methbases=hmm_methbases,
1348
+ hmm_device=hmm_device,
1349
+ hmm_merge_layer_features=hmm_merge_layer_features,
1350
+ clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
1351
+ hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
1352
+ hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
1353
+ hmm_peak_feature_configs=hmm_peak_feature_configs,
1354
+ footprints=merged.get("footprints", None),
1355
+ accessible_patches=merged.get("accessible_patches", None),
1356
+ cpg=merged.get("cpg", None),
1357
+ read_coord_filter=merged.get("read_coord_filter", [None, None]),
1358
+ read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
1359
+ read_len_to_ref_ratio_filter_thresholds=merged.get(
1360
+ "read_len_to_ref_ratio_filter_thresholds", [0.3, None]
1361
+ ),
1362
+ read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
1363
+ read_mapping_quality_filter_thresholds=merged.get(
1364
+ "read_mapping_quality_filter_thresholds", [None, None]
1365
+ ),
1366
+ read_mod_filtering_gpc_thresholds=merged.get(
1367
+ "read_mod_filtering_gpc_thresholds", [0.025, 0.975]
1368
+ ),
1369
+ read_mod_filtering_cpg_thresholds=merged.get(
1370
+ "read_mod_filtering_cpg_thresholds", [0.0, 1.0]
1371
+ ),
1372
+ read_mod_filtering_c_thresholds=merged.get(
1373
+ "read_mod_filtering_c_thresholds", [0.025, 0.975]
1374
+ ),
1375
+ read_mod_filtering_a_thresholds=merged.get(
1376
+ "read_mod_filtering_a_thresholds", [0.025, 0.975]
1377
+ ),
1378
+ read_mod_filtering_use_other_c_as_background=merged.get(
1379
+ "read_mod_filtering_use_other_c_as_background", True
1380
+ ),
1381
+ min_valid_fraction_positions_in_read_vs_ref=merged.get(
1382
+ "min_valid_fraction_positions_in_read_vs_ref", 0.2
1383
+ ),
1384
+ duplicate_detection_site_types=merged.get(
1385
+ "duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
1386
+ ),
1387
+ duplicate_detection_distance_threshold=merged.get(
1388
+ "duplicate_detection_distance_threshold", 0.07
1389
+ ),
1390
+ duplicate_detection_keep_best_metric=merged.get(
1391
+ "duplicate_detection_keep_best_metric", "read_quality"
1392
+ ),
1393
+ duplicate_detection_window_size_for_hamming_neighbors=merged.get(
1394
+ "duplicate_detection_window_size_for_hamming_neighbors", 50
1395
+ ),
1396
+ duplicate_detection_min_overlapping_positions=merged.get(
1397
+ "duplicate_detection_min_overlapping_positions", 20
1398
+ ),
1399
+ duplicate_detection_do_hierarchical=merged.get(
1400
+ "duplicate_detection_do_hierarchical", True
1401
+ ),
1402
+ duplicate_detection_hierarchical_linkage=merged.get(
1403
+ "duplicate_detection_hierarchical_linkage", "average"
1404
+ ),
1405
+ duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
1406
+ position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
1407
+ correlation_matrix_types=merged.get(
1408
+ "correlation_matrix_types", ["pearson", "binary_covariance"]
1409
+ ),
1410
+ correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1411
+ correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
1412
+ hamming_vs_metric_keys=merged.get(
1413
+ "hamming_vs_metric_keys", ["Fraction_C_site_modified"]
1414
+ ),
1415
+ force_redo_load_adata=merged.get("force_redo_load_adata", False),
1416
+ force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
1417
+ force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
1418
+ bypass_add_read_length_and_mapping_qc=merged.get(
1419
+ "bypass_add_read_length_and_mapping_qc", False
1420
+ ),
1421
+ force_redo_add_read_length_and_mapping_qc=merged.get(
1422
+ "force_redo_add_read_length_and_mapping_qc", False
1423
+ ),
1424
+ bypass_clean_nan=merged.get("bypass_clean_nan", False),
1425
+ force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
1426
+ bypass_append_base_context=merged.get("bypass_append_base_context", False),
1427
+ force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
1428
+ invert_adata=merged.get("invert_adata", False),
1429
+ bypass_append_binary_layer_by_base_context=merged.get(
1430
+ "bypass_append_binary_layer_by_base_context", False
1431
+ ),
1432
+ force_redo_append_binary_layer_by_base_context=merged.get(
1433
+ "force_redo_append_binary_layer_by_base_context", False
1434
+ ),
1435
+ bypass_calculate_read_modification_stats=merged.get(
1436
+ "bypass_calculate_read_modification_stats", False
1437
+ ),
1438
+ force_redo_calculate_read_modification_stats=merged.get(
1439
+ "force_redo_calculate_read_modification_stats", False
1440
+ ),
1441
+ bypass_filter_reads_on_modification_thresholds=merged.get(
1442
+ "bypass_filter_reads_on_modification_thresholds", False
1443
+ ),
1444
+ force_redo_filter_reads_on_modification_thresholds=merged.get(
1445
+ "force_redo_filter_reads_on_modification_thresholds", False
1446
+ ),
1447
+ bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
1448
+ force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
1449
+ bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
1450
+ force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
1451
+ force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
1452
+ bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
1453
+ force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
1454
+ bypass_basic_umap=merged.get("bypass_basic_umap", False),
1455
+ force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
1456
+ bypass_spatial_autocorr_calculations=merged.get(
1457
+ "bypass_spatial_autocorr_calculations", False
1458
+ ),
1459
+ force_redo_spatial_autocorr_calculations=merged.get(
1460
+ "force_redo_spatial_autocorr_calculations", False
1461
+ ),
1462
+ bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
1463
+ force_redo_spatial_autocorr_plotting=merged.get(
1464
+ "force_redo_spatial_autocorr_plotting", False
1465
+ ),
1466
+ bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
1467
+ force_redo_matrix_corr_calculations=merged.get(
1468
+ "force_redo_matrix_corr_calculations", False
1469
+ ),
1470
+ bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
1471
+ force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
1472
+ bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
1473
+ force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
1474
+ bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
1475
+ force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
1476
+ config_source=config_source or "<var_dict>",
1168
1477
  )
1169
1478
 
1170
1479
  report = {
@@ -1191,13 +1500,25 @@ class ExperimentConfig:
1191
1500
  Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
1192
1501
  Additional kwargs passed to from_var_dict().
1193
1502
  """
1194
- loader = LoadExperimentConfig(csv_input) if not isinstance(csv_input, pd.DataFrame) else LoadExperimentConfig(pd.DataFrame(csv_input))
1503
+ loader = (
1504
+ LoadExperimentConfig(csv_input)
1505
+ if not isinstance(csv_input, pd.DataFrame)
1506
+ else LoadExperimentConfig(pd.DataFrame(csv_input))
1507
+ )
1195
1508
  var_dict = loader.var_dict
1196
- return cls.from_var_dict(var_dict, date_str=date_str, config_source=config_source, defaults_dir=defaults_dir, defaults_map=defaults_map, **kwargs)
1509
+ return cls.from_var_dict(
1510
+ var_dict,
1511
+ date_str=date_str,
1512
+ config_source=config_source,
1513
+ defaults_dir=defaults_dir,
1514
+ defaults_map=defaults_map,
1515
+ **kwargs,
1516
+ )
1197
1517
 
1198
1518
  # -------------------------
1199
1519
  # validation & serialization
1200
1520
  # -------------------------
1521
+ @staticmethod
1201
1522
  def _validate_hmm_features_structure(hfs: dict) -> List[str]:
1202
1523
  errs = []
1203
1524
  if not isinstance(hfs, dict):
@@ -1205,7 +1526,9 @@ class ExperimentConfig:
1205
1526
  return errs
1206
1527
  for g, info in hfs.items():
1207
1528
  if not isinstance(info, dict):
1208
- errs.append(f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'.")
1529
+ errs.append(
1530
+ f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
1531
+ )
1209
1532
  continue
1210
1533
  feats = info.get("features")
1211
1534
  if not isinstance(feats, dict) or len(feats) == 0:
@@ -1215,7 +1538,9 @@ class ExperimentConfig:
1215
1538
  try:
1216
1539
  lo, hi = float(rng[0]), float(rng[1])
1217
1540
  if lo < 0 or hi <= lo:
1218
- errs.append(f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}.")
1541
+ errs.append(
1542
+ f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
1543
+ )
1219
1544
  except Exception:
1220
1545
  errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
1221
1546
  return errs
@@ -1248,13 +1573,18 @@ class ExperimentConfig:
1248
1573
 
1249
1574
  if not (0.0 <= float(self.mapping_threshold) <= 1.0):
1250
1575
  errors.append("mapping_threshold must be in [0,1].")
1251
- for t in (self.filter_threshold, self.m6A_threshold, self.m5C_threshold, self.hm5C_threshold):
1576
+ for t in (
1577
+ self.filter_threshold,
1578
+ self.m6A_threshold,
1579
+ self.m5C_threshold,
1580
+ self.hm5C_threshold,
1581
+ ):
1252
1582
  if not (0.0 <= float(t) <= 1.0):
1253
1583
  errors.append(f"threshold value {t} must be in [0,1].")
1254
1584
 
1255
1585
  if raise_on_error and errors:
1256
1586
  raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
1257
-
1587
+
1258
1588
  errs = _validate_hmm_features_structure(self.hmm_feature_sets)
1259
1589
  errors.extend(errs)
1260
1590