smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,26 @@
1
1
  # experiment_config.py
2
2
  from __future__ import annotations
3
+
3
4
  import ast
4
5
  import json
5
6
  import warnings
6
- from dataclasses import dataclass, field, asdict
7
+ from dataclasses import asdict, dataclass, field
7
8
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
10
+
11
+ from smftools.constants import (
12
+ BAM_SUFFIX,
13
+ BARCODE_BOTH_ENDS,
14
+ CONVERSIONS,
15
+ MOD_LIST,
16
+ MOD_MAP,
17
+ REF_COL,
18
+ SAMPLE_COL,
19
+ SPLIT_DIR,
20
+ STRANDS,
21
+ TRIM,
22
+ )
23
+
9
24
  from .discover_input_files import discover_input_files
10
25
 
11
26
  # Optional dependency for YAML handling
@@ -14,8 +29,8 @@ try:
14
29
  except Exception:
15
30
  yaml = None
16
31
 
17
- import pandas as pd
18
32
  import numpy as np
33
+ import pandas as pd
19
34
 
20
35
 
21
36
  # -------------------------
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
81
96
  except Exception:
82
97
  return fallback
83
98
 
99
+
84
100
  def _try_json_or_literal(s: Any) -> Any:
85
101
  """Try parse JSON or python literal; otherwise return original string."""
86
102
  if s is None:
@@ -123,8 +139,8 @@ def resolve_aligner_args(
123
139
  """
124
140
  # builtin defaults (aligner -> args)
125
141
  builtin_defaults = {
126
- "minimap2": ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
127
- "dorado": ['--mm2-opts', '-N', '5'],
142
+ "minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
143
+ "dorado": ["--mm2-opts", "-N", "5"],
128
144
  }
129
145
  if default_by_aligner is None:
130
146
  default_by_aligner = builtin_defaults
@@ -275,6 +291,7 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
275
291
  canonical[grp] = {"features": feats, "state": state}
276
292
  return canonical
277
293
 
294
+
278
295
  def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
279
296
  """
280
297
  Normalize user-provided `hmm_peak_feature_configs` into:
@@ -365,12 +382,12 @@ class LoadExperimentConfig:
365
382
  df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
366
383
  # normalize column names
367
384
  df.columns = [c.strip() for c in df.columns]
368
- if 'variable' not in df.columns:
385
+ if "variable" not in df.columns:
369
386
  raise ValueError("Config CSV must contain a 'variable' column.")
370
- if 'value' not in df.columns:
371
- df['value'] = ''
372
- if 'type' not in df.columns:
373
- df['type'] = ''
387
+ if "value" not in df.columns:
388
+ df["value"] = ""
389
+ if "type" not in df.columns:
390
+ df["type"] = ""
374
391
  return df
375
392
 
376
393
  @staticmethod
@@ -389,9 +406,9 @@ class LoadExperimentConfig:
389
406
 
390
407
  def parse_bool(s: str):
391
408
  s2 = s.strip().lower()
392
- if s2 in ('1', 'true', 't', 'yes', 'y', 'on'):
409
+ if s2 in ("1", "true", "t", "yes", "y", "on"):
393
410
  return True
394
- if s2 in ('0', 'false', 'f', 'no', 'n', 'off'):
411
+ if s2 in ("0", "false", "f", "no", "n", "off"):
395
412
  return False
396
413
  raise ValueError(f"Cannot parse boolean from '{s}'")
397
414
 
@@ -411,18 +428,18 @@ class LoadExperimentConfig:
411
428
  except Exception:
412
429
  pass
413
430
  # fallback split
414
- parts = [p.strip() for p in s.strip("()[] ").split(',') if p.strip() != ""]
431
+ parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
415
432
  return parts
416
433
 
417
- if hint in ('int', 'integer'):
434
+ if hint in ("int", "integer"):
418
435
  return int(v)
419
- if hint in ('float', 'double'):
436
+ if hint in ("float", "double"):
420
437
  return float(v)
421
- if hint in ('bool', 'boolean'):
438
+ if hint in ("bool", "boolean"):
422
439
  return parse_bool(v)
423
- if hint in ('list', 'array'):
440
+ if hint in ("list", "array"):
424
441
  return parse_list_like(v)
425
- if hint in ('string', 'str'):
442
+ if hint in ("string", "str"):
426
443
  return v
427
444
 
428
445
  # infer
@@ -448,27 +465,31 @@ class LoadExperimentConfig:
448
465
  return lit
449
466
  except Exception:
450
467
  pass
451
- if (',' in v) and (not any(ch in v for ch in '{}[]()')):
452
- return [p.strip() for p in v.split(',') if p.strip() != ""]
468
+ if ("," in v) and (not any(ch in v for ch in "{}[]()")):
469
+ return [p.strip() for p in v.split(",") if p.strip() != ""]
453
470
  return v
454
471
 
455
472
  def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
456
473
  parsed: Dict[str, Any] = {}
457
474
  for idx, row in df.iterrows():
458
- name = str(row['variable']).strip()
475
+ name = str(row["variable"]).strip()
459
476
  if name == "":
460
477
  continue
461
- raw_val = row.get('value', "")
462
- raw_type = row.get('type', "")
478
+ raw_val = row.get("value", "")
479
+ raw_type = row.get("type", "")
463
480
  if pd.isna(raw_val) or str(raw_val).strip() == "":
464
481
  raw_val = None
465
482
  try:
466
483
  parsed_val = self._parse_value_as_type(raw_val, raw_type)
467
484
  except Exception as e:
468
- warnings.warn(f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value.")
485
+ warnings.warn(
486
+ f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
487
+ )
469
488
  parsed_val = None if raw_val is None else raw_val
470
489
  if name in parsed:
471
- warnings.warn(f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value.")
490
+ warnings.warn(
491
+ f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
492
+ )
472
493
  parsed[name] = parsed_val
473
494
  return parsed
474
495
 
@@ -476,7 +497,7 @@ class LoadExperimentConfig:
476
497
  """Return parsed config as a pandas DataFrame (variable, value)."""
477
498
  rows = []
478
499
  for k, v in self.var_dict.items():
479
- rows.append({'variable': k, 'value': v})
500
+ rows.append({"variable": k, "value": v})
480
501
  return pd.DataFrame(rows)
481
502
 
482
503
 
@@ -644,17 +665,17 @@ class ExperimentConfig:
644
665
  input_data_path: Optional[str] = None
645
666
  output_directory: Optional[str] = None
646
667
  fasta: Optional[str] = None
647
- bam_suffix: str = ".bam"
668
+ bam_suffix: str = BAM_SUFFIX
648
669
  recursive_input_search: bool = True
649
670
  input_type: Optional[str] = None
650
671
  input_files: Optional[List[Path]] = None
651
- split_dir: str = "demultiplexed_BAMs"
672
+ split_dir: str = SPLIT_DIR
652
673
  split_path: Optional[str] = None
653
- strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
654
- conversions: List[str] = field(default_factory=lambda: ["unconverted"])
674
+ strands: List[str] = field(default_factory=lambda: STRANDS)
675
+ conversions: List[str] = field(default_factory=lambda: CONVERSIONS)
655
676
  fasta_regions_of_interest: Optional[str] = None
656
677
  sample_sheet_path: Optional[str] = None
657
- sample_sheet_mapping_column: Optional[str] = 'Barcode'
678
+ sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
658
679
  experiment_name: Optional[str] = None
659
680
  input_already_demuxed: bool = False
660
681
  summary_file: Optional[Path] = None
@@ -690,8 +711,8 @@ class ExperimentConfig:
690
711
  model_dir: Optional[str] = None
691
712
  barcode_kit: Optional[str] = None
692
713
  model: str = "hac"
693
- barcode_both_ends: bool = False
694
- trim: bool = False
714
+ barcode_both_ends: bool = BARCODE_BOTH_ENDS
715
+ trim: bool = TRIM
695
716
  # General basecalling params
696
717
  filter_threshold: float = 0.8
697
718
  # Modified basecalling specific params
@@ -699,44 +720,75 @@ class ExperimentConfig:
699
720
  m5C_threshold: float = 0.7
700
721
  hm5C_threshold: float = 0.7
701
722
  thresholds: List[float] = field(default_factory=list)
702
- mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
703
- mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
723
+ mod_list: List[str] = field(
724
+ default_factory=lambda: list(MOD_LIST)
725
+ ) # Dorado modified basecalling codes
726
+ mod_map: Dict[str, str] = field(
727
+ default_factory=lambda: dict(MOD_MAP)
728
+ ) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
704
729
 
705
730
  # Alignment params
706
- mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
707
- align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
731
+ mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
732
+ align_from_bam: bool = (
733
+ False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
734
+ )
708
735
  aligner: str = "dorado"
709
736
  aligner_args: Optional[List[str]] = None
710
737
  make_bigwigs: bool = False
711
738
  make_beds: bool = False
739
+ samtools_backend: str = "auto"
740
+ bedtools_backend: str = "auto"
741
+ bigwig_backend: str = "auto"
712
742
 
713
743
  # Anndata structure
714
- reference_column: Optional[str] = 'Reference_strand'
715
- sample_column: Optional[str] = 'Barcode'
744
+ reference_column: Optional[str] = REF_COL
745
+ sample_column: Optional[str] = SAMPLE_COL
716
746
 
717
747
  # General Plotting
718
- sample_name_col_for_plotting: Optional[str] = 'Barcode'
748
+ sample_name_col_for_plotting: Optional[str] = "Barcode"
719
749
  rows_per_qc_histogram_grid: int = 12
720
750
 
721
751
  # Preprocessing - Read length and quality filter params
722
752
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
723
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
724
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
725
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
726
- read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
753
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(
754
+ default_factory=lambda: [100, None]
755
+ )
756
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
757
+ default_factory=lambda: [0.4, 1.5]
758
+ )
759
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(
760
+ default_factory=lambda: [15, None]
761
+ )
762
+ read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
763
+ default_factory=lambda: [None, None]
764
+ )
727
765
 
728
766
  # Preprocessing - Optional reindexing params
729
767
  reindexing_offsets: Dict[str, int] = field(default_factory=dict)
730
768
  reindexed_var_suffix: Optional[str] = "reindexed"
731
769
 
732
770
  # Preprocessing - Direct mod detection binarization params
733
- fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
734
- binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
735
- positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
736
- negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
737
- infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
738
- inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
739
- fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
771
+ fit_position_methylation_thresholds: Optional[bool] = (
772
+ False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
773
+ )
774
+ binarize_on_fixed_methlyation_threshold: Optional[float] = (
775
+ 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
776
+ )
777
+ positive_control_sample_methylation_fitting: Optional[str] = (
778
+ None # A positive control Sample_name to use for fully modified template data
779
+ )
780
+ negative_control_sample_methylation_fitting: Optional[str] = (
781
+ None # A negative control Sample_name to use for fully unmodified template data
782
+ )
783
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = (
784
+ 10 # If positive/negative controls are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
785
+ )
786
+ inference_variable_sample_methylation_fitting: Optional[str] = (
787
+ "Raw_modification_signal" # The obs column value used for the percentile metric above.
788
+ )
789
+ fit_j_threshold: Optional[float] = (
790
+ 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
791
+ )
740
792
  output_binary_layer_name: Optional[str] = "binarized_methylation"
741
793
 
742
794
  # Preprocessing - Read modification filter params
@@ -748,13 +800,25 @@ class ExperimentConfig:
748
800
  min_valid_fraction_positions_in_read_vs_ref: float = 0.2
749
801
 
750
802
  # Preprocessing - plotting params
751
- obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
803
+ obs_to_plot_pp_qc: List[str] = field(
804
+ default_factory=lambda: [
805
+ "read_length",
806
+ "mapped_length",
807
+ "read_quality",
808
+ "mapping_quality",
809
+ "mapped_length_to_reference_length_ratio",
810
+ "mapped_length_to_read_length_ratio",
811
+ "Raw_modification_signal",
812
+ ]
813
+ )
752
814
 
753
815
  # Preprocessing - Duplicate detection params
754
- duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
816
+ duplicate_detection_site_types: List[str] = field(
817
+ default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
818
+ )
755
819
  duplicate_detection_distance_threshold: float = 0.07
756
- hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
757
- duplicate_detection_keep_best_metric: str ='read_quality'
820
+ hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
821
+ duplicate_detection_keep_best_metric: str = "read_quality"
758
822
  duplicate_detection_window_size_for_hamming_neighbors: int = 50
759
823
  duplicate_detection_min_overlapping_positions: int = 20
760
824
  duplicate_detection_do_hierarchical: bool = True
@@ -765,32 +829,37 @@ class ExperimentConfig:
765
829
  position_max_nan_threshold: float = 0.1
766
830
 
767
831
  # Spatial Analysis - Clustermap params
768
- layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
769
- clustermap_cmap_c: Optional[str] = 'coolwarm'
770
- clustermap_cmap_gpc: Optional[str] = 'coolwarm'
771
- clustermap_cmap_cpg: Optional[str] = 'coolwarm'
772
- clustermap_cmap_a: Optional[str] = 'coolwarm'
773
- spatial_clustermap_sortby: Optional[str] = 'gpc'
832
+ layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
833
+ clustermap_cmap_c: Optional[str] = "coolwarm"
834
+ clustermap_cmap_gpc: Optional[str] = "coolwarm"
835
+ clustermap_cmap_cpg: Optional[str] = "coolwarm"
836
+ clustermap_cmap_a: Optional[str] = "coolwarm"
837
+ spatial_clustermap_sortby: Optional[str] = "gpc"
774
838
 
775
839
  # Spatial Analysis - UMAP/Leiden params
776
- layer_for_umap_plotting: Optional[str] = 'nan_half'
777
- umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
840
+ layer_for_umap_plotting: Optional[str] = "nan_half"
841
+ umap_layers_to_plot: List[str] = field(
842
+ default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
843
+ )
778
844
 
779
845
  # Spatial Analysis - Spatial Autocorrelation params
846
+ autocorr_normalization_method: str = "pearson"
780
847
  rows_per_qc_autocorr_grid: int = 12
781
848
  autocorr_rolling_window_size: int = 25
782
849
  autocorr_max_lag: int = 800
783
- autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
850
+ autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])
784
851
 
785
852
  # Spatial Analysis - Correlation Matrix params
786
- correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
787
- correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
788
- correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
853
+ correlation_matrix_types: List[str] = field(
854
+ default_factory=lambda: ["pearson", "binary_covariance"]
855
+ )
856
+ correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
857
+ correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
789
858
 
790
859
  # HMM params
791
860
  hmm_n_states: int = 2
792
- hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
793
- hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
861
+ hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
862
+ hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
794
863
  hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
795
864
  hmm_eps: float = 1e-8
796
865
  hmm_dtype: str = "float64"
@@ -798,15 +867,28 @@ class ExperimentConfig:
798
867
  hmm_batch_size: int = 1024
799
868
  hmm_use_viterbi: bool = False
800
869
  hmm_device: Optional[str] = None
801
- hmm_methbases: Optional[List[str]] = None # if None, HMM.annotate_adata will fall back to mod_target_bases
870
+ hmm_methbases: Optional[List[str]] = (
871
+ None # if None, HMM.annotate_adata will fall back to mod_target_bases
872
+ )
873
+ # HMM fitting/application strategy
874
+ hmm_fit_strategy: str = "per_group" # "per_group" | "shared_transitions"
875
+ hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
876
+ hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
877
+ # Shared-transitions adaptation behavior
878
+ hmm_adapt_emissions: bool = True
879
+ hmm_adapt_startprobs: bool = True
880
+ hmm_emission_adapt_iters: int = 5
881
+ hmm_emission_adapt_tol: float = 1e-4
802
882
  footprints: Optional[bool] = True
803
883
  accessible_patches: Optional[bool] = True
804
884
  cpg: Optional[bool] = False
805
885
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
806
- hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
807
- clustermap_cmap_hmm: Optional[str] = 'coolwarm'
808
- hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
809
- hmm_clustermap_sortby: Optional[str] = 'hmm'
886
+ hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
887
+ clustermap_cmap_hmm: Optional[str] = "coolwarm"
888
+ hmm_clustermap_feature_layers: List[str] = field(
889
+ default_factory=lambda: ["all_accessible_features"]
890
+ )
891
+ hmm_clustermap_sortby: Optional[str] = "hmm"
810
892
  hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
811
893
 
812
894
  # Pipeline control flow - load adata
@@ -830,7 +912,7 @@ class ExperimentConfig:
830
912
  force_redo_filter_reads_on_modification_thresholds: bool = False
831
913
  bypass_flag_duplicate_reads: bool = False
832
914
  force_redo_flag_duplicate_reads: bool = False
833
- bypass_complexity_analysis: bool = False
915
+ bypass_complexity_analysis: bool = False
834
916
  force_redo_complexity_analysis: bool = False
835
917
 
836
918
  # Pipeline control flow - Spatial Analyses
@@ -910,7 +992,9 @@ class ExperimentConfig:
910
992
  defaults_loaded = dict(defaults_map[modality] or {})
911
993
  defaults_source_chain = [f"defaults_map['{modality}']"]
912
994
  elif defaults_dir is not None:
913
- defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(defaults_dir, modality)
995
+ defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
996
+ defaults_dir, modality
997
+ )
914
998
 
915
999
  # If CSV asks to extend defaults, load those and merge
916
1000
  merged = dict(defaults_loaded or {})
@@ -925,7 +1009,11 @@ class ExperimentConfig:
925
1009
  else:
926
1010
  ext_list = []
927
1011
  for ext in ext_list:
928
- ext_defaults, ext_sources = (load_defaults_with_inheritance(defaults_dir, ext) if defaults_dir else ({}, []))
1012
+ ext_defaults, ext_sources = (
1013
+ load_defaults_with_inheritance(defaults_dir, ext)
1014
+ if defaults_dir
1015
+ else ({}, [])
1016
+ )
929
1017
  merged = deep_merge(merged, ext_defaults)
930
1018
  for s in ext_sources:
931
1019
  if s not in defaults_source_chain:
@@ -955,34 +1043,40 @@ class ExperimentConfig:
955
1043
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
956
1044
 
957
1045
  # Input file types and path handling
958
- input_data_path = Path(merged['input_data_path'])
1046
+ input_data_path = Path(merged["input_data_path"])
959
1047
 
960
1048
  # Detect the input filetype
961
1049
  if input_data_path.is_file():
962
- suffix = input_data_path.suffix.lower()
963
- suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
964
-
965
- # recognize multi-suffix cases like .fastq.gz or .fq.gz
966
- if any(s in ['.pod5', '.p5'] for s in suffixes):
967
- input_type = "pod5"
968
- input_files = [Path(input_data_path)]
969
- elif any(s in ['.fast5', '.f5'] for s in suffixes):
970
- input_type = "fast5"
971
- input_files = [Path(input_data_path)]
972
- elif any(s in ['.fastq', '.fq'] for s in suffixes):
973
- input_type = "fastq"
974
- input_files = [Path(input_data_path)]
975
- elif any(s in ['.bam'] for s in suffixes):
976
- input_type = "bam"
977
- input_files = [Path(input_data_path)]
978
- elif any(s in ['.h5ad', ".h5"] for s in suffixes):
979
- input_type = "h5ad"
980
- input_files = [Path(input_data_path)]
981
- else:
982
- print("Error detecting input file type")
1050
+ suffix = input_data_path.suffix.lower()
1051
+ suffixes = [
1052
+ s.lower() for s in input_data_path.suffixes
1053
+ ] # handles multi-part extensions
1054
+
1055
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
1056
+ if any(s in [".pod5", ".p5"] for s in suffixes):
1057
+ input_type = "pod5"
1058
+ input_files = [Path(input_data_path)]
1059
+ elif any(s in [".fast5", ".f5"] for s in suffixes):
1060
+ input_type = "fast5"
1061
+ input_files = [Path(input_data_path)]
1062
+ elif any(s in [".fastq", ".fq"] for s in suffixes):
1063
+ input_type = "fastq"
1064
+ input_files = [Path(input_data_path)]
1065
+ elif any(s in [".bam"] for s in suffixes):
1066
+ input_type = "bam"
1067
+ input_files = [Path(input_data_path)]
1068
+ elif any(s in [".h5ad", ".h5"] for s in suffixes):
1069
+ input_type = "h5ad"
1070
+ input_files = [Path(input_data_path)]
1071
+ else:
1072
+ print("Error detecting input file type")
983
1073
 
984
1074
  elif input_data_path.is_dir():
985
- found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
1075
+ found = discover_input_files(
1076
+ input_data_path,
1077
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1078
+ recursive=merged["recursive_input_search"],
1079
+ )
986
1080
 
987
1081
  if found["input_is_pod5"]:
988
1082
  input_type = "pod5"
@@ -1010,12 +1104,12 @@ class ExperimentConfig:
1010
1104
  )
1011
1105
 
1012
1106
  # summary file output path
1013
- output_dir = Path(merged['output_directory'])
1014
- summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
1107
+ output_dir = Path(merged["output_directory"])
1108
+ summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
1015
1109
  summary_file = output_dir / summary_file_basename
1016
1110
 
1017
1111
  # Demultiplexing output path
1018
- split_dir = merged.get("split_dir", "demultiplexed_BAMs")
1112
+ split_dir = merged.get("split_dir", SPLIT_DIR)
1019
1113
  split_path = output_dir / split_dir
1020
1114
 
1021
1115
  # final normalization
@@ -1039,7 +1133,14 @@ class ExperimentConfig:
1039
1133
  merged["hm5C_threshold"],
1040
1134
  ]
1041
1135
 
1042
- for bkey in ("barcode_both_ends", "trim", "input_already_demuxed", "make_bigwigs", "skip_unclassified", "delete_batch_hdfs"):
1136
+ for bkey in (
1137
+ "barcode_both_ends",
1138
+ "trim",
1139
+ "input_already_demuxed",
1140
+ "make_bigwigs",
1141
+ "skip_unclassified",
1142
+ "delete_batch_hdfs",
1143
+ ):
1043
1144
  if bkey in merged:
1044
1145
  merged[bkey] = _parse_bool(merged[bkey])
1045
1146
 
@@ -1048,12 +1149,12 @@ class ExperimentConfig:
1048
1149
  if "threads" in merged:
1049
1150
  tval = _parse_numeric(merged.get("threads", None), None)
1050
1151
  merged["threads"] = None if tval is None else int(tval)
1051
-
1152
+
1052
1153
  if "aligner_args" in merged and merged.get("aligner_args") is None:
1053
1154
  merged.pop("aligner_args", None)
1054
1155
 
1055
1156
  # --- Resolve aligner_args into concrete list for the chosen aligner ---
1056
- merged['aligner_args'] = resolve_aligner_args(merged)
1157
+ merged["aligner_args"] = resolve_aligner_args(merged)
1057
1158
 
1058
1159
  if "mod_list" in merged:
1059
1160
  merged["mod_list"] = _parse_list(merged.get("mod_list"))
@@ -1068,11 +1169,22 @@ class ExperimentConfig:
1068
1169
  # allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
1069
1170
  maybe_fs = {}
1070
1171
  if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
1071
- maybe_fs["footprint"] = {"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")), "state": merged.get("hmm_footprint_state", "Non-Modified")}
1172
+ maybe_fs["footprint"] = {
1173
+ "features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
1174
+ "state": merged.get("hmm_footprint_state", "Non-Modified"),
1175
+ }
1072
1176
  if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
1073
- maybe_fs["accessible"] = {"features": merged.get("hmm_accessible_ranges", merged.get("accessible_ranges")), "state": merged.get("hmm_accessible_state", "Modified")}
1177
+ maybe_fs["accessible"] = {
1178
+ "features": merged.get(
1179
+ "hmm_accessible_ranges", merged.get("accessible_ranges")
1180
+ ),
1181
+ "state": merged.get("hmm_accessible_state", "Modified"),
1182
+ }
1074
1183
  if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
1075
- maybe_fs["cpg"] = {"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")), "state": merged.get("hmm_cpg_state", "Modified")}
1184
+ maybe_fs["cpg"] = {
1185
+ "features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
1186
+ "state": merged.get("hmm_cpg_state", "Modified"),
1187
+ }
1076
1188
  if maybe_fs:
1077
1189
  merged.setdefault("hmm_feature_sets", {})
1078
1190
  for k, v in maybe_fs.items():
@@ -1093,10 +1205,23 @@ class ExperimentConfig:
1093
1205
  if not hmm_methbases: # None or []
1094
1206
  hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
1095
1207
  if not hmm_methbases:
1096
- hmm_methbases = ['C']
1208
+ hmm_methbases = ["C"]
1097
1209
  hmm_methbases = list(hmm_methbases)
1098
1210
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
1099
- hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
1211
+ hmm_clustermap_feature_layers = _parse_list(
1212
+ merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
1213
+ )
1214
+
1215
+ hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
1216
+ hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
1217
+ hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
1218
+
1219
+ hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
1220
+ hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
1221
+ hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
1222
+ hmm_emission_adapt_tol = float(
1223
+ _parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
1224
+ )
1100
1225
 
1101
1226
  # HMM peak feature configs (for call_hmm_peaks)
1102
1227
  merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
@@ -1106,165 +1231,255 @@ class ExperimentConfig:
1106
1231
 
1107
1232
  # instantiate dataclass
1108
1233
  instance = cls(
1109
- smf_modality = merged.get("smf_modality"),
1110
- input_data_path = input_data_path,
1111
- recursive_input_search = merged.get("recursive_input_search"),
1112
- input_type = input_type,
1113
- input_files = input_files,
1114
- output_directory = output_dir,
1115
- summary_file = summary_file,
1116
- fasta = merged.get("fasta"),
1117
- sequencer = merged.get("sequencer"),
1118
- model_dir = merged.get("model_dir"),
1119
- barcode_kit = merged.get("barcode_kit"),
1120
- fastq_barcode_map = merged.get("fastq_barcode_map"),
1121
- fastq_auto_pairing = merged.get("fastq_auto_pairing"),
1122
- bam_suffix = merged.get("bam_suffix", ".bam"),
1123
- split_dir = split_dir,
1124
- split_path = split_path,
1125
- strands = merged.get("strands", ["bottom","top"]),
1126
- conversions = merged.get("conversions", ["unconverted"]),
1127
- fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
1128
- mapping_threshold = float(merged.get("mapping_threshold", 0.01)),
1129
- experiment_name = merged.get("experiment_name"),
1130
- model = merged.get("model", "hac"),
1131
- barcode_both_ends = merged.get("barcode_both_ends", False),
1132
- trim = merged.get("trim", False),
1133
- input_already_demuxed = merged.get("input_already_demuxed", False),
1134
- threads = merged.get("threads"),
1135
- sample_sheet_path = merged.get("sample_sheet_path"),
1136
- sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1137
- delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
1138
- delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
1139
- align_from_bam = merged.get("align_from_bam", False),
1140
- aligner = merged.get("aligner", "minimap2"),
1141
- aligner_args = merged.get("aligner_args", None),
1142
- device = merged.get("device", "auto"),
1143
- make_bigwigs = merged.get("make_bigwigs", False),
1144
- make_beds = merged.get("make_beds", False),
1145
- delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
1146
- mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
1147
- enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
1148
- conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
1149
- filter_threshold = merged.get("filter_threshold", 0.8),
1150
- m6A_threshold = merged.get("m6A_threshold", 0.7),
1151
- m5C_threshold = merged.get("m5C_threshold", 0.7),
1152
- hm5C_threshold = merged.get("hm5C_threshold", 0.7),
1153
- thresholds = merged.get("thresholds", []),
1154
- mod_list = merged.get("mod_list", ["5mC_5hmC","6mA"]),
1155
- batch_size = merged.get("batch_size", 4),
1156
- skip_unclassified = merged.get("skip_unclassified", True),
1157
- delete_batch_hdfs = merged.get("delete_batch_hdfs", True),
1158
- reference_column = merged.get("reference_column", 'Reference_strand'),
1159
- sample_column = merged.get("sample_column", 'Barcode'),
1160
- sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1161
- obs_to_plot_pp_qc = obs_to_plot_pp_qc,
1162
- fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1163
- binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1164
- positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1165
- negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1166
- infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1167
- inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1168
- fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1169
- output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
1170
- reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
1171
- reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
1172
- layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
1173
- clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
1174
- clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
1175
- clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
1176
- clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
1177
- spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
1178
- layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
1179
- umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
1180
- rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
1181
- rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
1182
- autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
1183
- autocorr_max_lag = merged.get("autocorr_max_lag", 800),
1184
- autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
1185
- hmm_n_states = merged.get("hmm_n_states", 2),
1186
- hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
1187
- hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
1188
- hmm_init_start_probs = merged.get("hmm_init_start_probs",[0.5, 0.5]),
1189
- hmm_eps = merged.get("hmm_eps", 1e-8),
1190
- hmm_dtype = merged.get("hmm_dtype", "float64"),
1191
- hmm_feature_sets = hmm_feature_sets,
1192
- hmm_annotation_threshold = hmm_annotation_threshold,
1193
- hmm_batch_size = hmm_batch_size,
1194
- hmm_use_viterbi = hmm_use_viterbi,
1195
- hmm_methbases = hmm_methbases,
1196
- hmm_device = hmm_device,
1197
- hmm_merge_layer_features = hmm_merge_layer_features,
1198
- clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
1199
- hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
1200
- hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
1201
- hmm_peak_feature_configs = hmm_peak_feature_configs,
1202
- footprints = merged.get("footprints", None),
1203
- accessible_patches = merged.get("accessible_patches", None),
1204
- cpg = merged.get("cpg", None),
1205
- read_coord_filter = merged.get("read_coord_filter", [None, None]),
1206
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1207
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1208
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1209
- read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1210
- read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1211
- read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
1212
- read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
1213
- read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
1214
- read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
1215
- min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
1216
- duplicate_detection_site_types = merged.get("duplicate_detection_site_types", ['GpC', 'CpG', 'ambiguous_GpC_CpG']),
1217
- duplicate_detection_distance_threshold = merged.get("duplicate_detection_distance_threshold", 0.07),
1218
- duplicate_detection_keep_best_metric = merged.get("duplicate_detection_keep_best_metric", "read_quality"),
1219
- duplicate_detection_window_size_for_hamming_neighbors = merged.get("duplicate_detection_window_size_for_hamming_neighbors", 50),
1220
- duplicate_detection_min_overlapping_positions = merged.get("duplicate_detection_min_overlapping_positions", 20),
1221
- duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1222
- duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1223
- duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1224
- position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1225
- correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1226
- correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1227
- correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1228
- hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
1229
- force_redo_load_adata = merged.get("force_redo_load_adata", False),
1230
- force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1231
- force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1232
- bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
1233
- force_redo_add_read_length_and_mapping_qc = merged.get("force_redo_add_read_length_and_mapping_qc", False),
1234
- bypass_clean_nan = merged.get("bypass_clean_nan", False),
1235
- force_redo_clean_nan = merged.get("force_redo_clean_nan", False),
1236
- bypass_append_base_context = merged.get("bypass_append_base_context", False),
1237
- force_redo_append_base_context = merged.get("force_redo_append_base_context", False),
1238
- invert_adata = merged.get("invert_adata", False),
1239
- bypass_append_binary_layer_by_base_context = merged.get("bypass_append_binary_layer_by_base_context", False),
1240
- force_redo_append_binary_layer_by_base_context = merged.get("force_redo_append_binary_layer_by_base_context", False),
1241
- bypass_calculate_read_modification_stats = merged.get("bypass_calculate_read_modification_stats", False),
1242
- force_redo_calculate_read_modification_stats = merged.get("force_redo_calculate_read_modification_stats", False),
1243
- bypass_filter_reads_on_modification_thresholds = merged.get("bypass_filter_reads_on_modification_thresholds", False),
1244
- force_redo_filter_reads_on_modification_thresholds = merged.get("force_redo_filter_reads_on_modification_thresholds", False),
1245
- bypass_flag_duplicate_reads = merged.get("bypass_flag_duplicate_reads", False),
1246
- force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1247
- bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1248
- force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1249
- force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
1250
- bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1251
- force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1252
- bypass_basic_umap = merged.get("bypass_basic_umap", False),
1253
- force_redo_basic_umap = merged.get("force_redo_basic_umap", False),
1254
- bypass_spatial_autocorr_calculations = merged.get("bypass_spatial_autocorr_calculations", False),
1255
- force_redo_spatial_autocorr_calculations = merged.get("force_redo_spatial_autocorr_calculations", False),
1256
- bypass_spatial_autocorr_plotting = merged.get("bypass_spatial_autocorr_plotting", False),
1257
- force_redo_spatial_autocorr_plotting = merged.get("force_redo_spatial_autocorr_plotting", False),
1258
- bypass_matrix_corr_calculations = merged.get("bypass_matrix_corr_calculations", False),
1259
- force_redo_matrix_corr_calculations = merged.get("force_redo_matrix_corr_calculations", False),
1260
- bypass_matrix_corr_plotting = merged.get("bypass_matrix_corr_plotting", False),
1261
- force_redo_matrix_corr_plotting = merged.get("force_redo_matrix_corr_plotting", False),
1262
- bypass_hmm_fit = merged.get("bypass_hmm_fit", False),
1263
- force_redo_hmm_fit = merged.get("force_redo_hmm_fit", False),
1264
- bypass_hmm_apply = merged.get("bypass_hmm_apply", False),
1265
- force_redo_hmm_apply = merged.get("force_redo_hmm_apply", False),
1266
-
1267
- config_source = config_source or "<var_dict>",
1234
+ smf_modality=merged.get("smf_modality"),
1235
+ input_data_path=input_data_path,
1236
+ recursive_input_search=merged.get("recursive_input_search"),
1237
+ input_type=input_type,
1238
+ input_files=input_files,
1239
+ output_directory=output_dir,
1240
+ summary_file=summary_file,
1241
+ fasta=merged.get("fasta"),
1242
+ sequencer=merged.get("sequencer"),
1243
+ model_dir=merged.get("model_dir"),
1244
+ barcode_kit=merged.get("barcode_kit"),
1245
+ fastq_barcode_map=merged.get("fastq_barcode_map"),
1246
+ fastq_auto_pairing=merged.get("fastq_auto_pairing"),
1247
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1248
+ split_dir=split_dir,
1249
+ split_path=split_path,
1250
+ strands=merged.get("strands", STRANDS),
1251
+ conversions=merged.get("conversions", CONVERSIONS),
1252
+ fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
1253
+ mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
1254
+ experiment_name=merged.get("experiment_name"),
1255
+ model=merged.get("model", "hac"),
1256
+ barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
1257
+ trim=merged.get("trim", TRIM),
1258
+ input_already_demuxed=merged.get("input_already_demuxed", False),
1259
+ threads=merged.get("threads"),
1260
+ sample_sheet_path=merged.get("sample_sheet_path"),
1261
+ sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
1262
+ delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
1263
+ delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
1264
+ align_from_bam=merged.get("align_from_bam", False),
1265
+ aligner=merged.get("aligner", "minimap2"),
1266
+ aligner_args=merged.get("aligner_args", None),
1267
+ device=merged.get("device", "auto"),
1268
+ make_bigwigs=merged.get("make_bigwigs", False),
1269
+ make_beds=merged.get("make_beds", False),
1270
+ samtools_backend=merged.get("samtools_backend", "auto"),
1271
+ bedtools_backend=merged.get("bedtools_backend", "auto"),
1272
+ bigwig_backend=merged.get("bigwig_backend", "auto"),
1273
+ delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
1274
+ mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
1275
+ enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
1276
+ conversion_types=merged.get("conversions", ["unconverted"])
1277
+ + merged.get("conversion_types", ["5mC"]),
1278
+ filter_threshold=merged.get("filter_threshold", 0.8),
1279
+ m6A_threshold=merged.get("m6A_threshold", 0.7),
1280
+ m5C_threshold=merged.get("m5C_threshold", 0.7),
1281
+ hm5C_threshold=merged.get("hm5C_threshold", 0.7),
1282
+ thresholds=merged.get("thresholds", []),
1283
+ mod_list=merged.get("mod_list", list(MOD_LIST)),
1284
+ mod_map=merged.get("mod_map", list(MOD_MAP)),
1285
+ batch_size=merged.get("batch_size", 4),
1286
+ skip_unclassified=merged.get("skip_unclassified", True),
1287
+ delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
1288
+ reference_column=merged.get("reference_column", REF_COL),
1289
+ sample_column=merged.get("sample_column", SAMPLE_COL),
1290
+ sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
1291
+ obs_to_plot_pp_qc=obs_to_plot_pp_qc,
1292
+ fit_position_methylation_thresholds=merged.get(
1293
+ "fit_position_methylation_thresholds", False
1294
+ ),
1295
+ binarize_on_fixed_methlyation_threshold=merged.get(
1296
+ "binarize_on_fixed_methlyation_threshold", 0.7
1297
+ ),
1298
+ positive_control_sample_methylation_fitting=merged.get(
1299
+ "positive_control_sample_methylation_fitting", None
1300
+ ),
1301
+ negative_control_sample_methylation_fitting=merged.get(
1302
+ "negative_control_sample_methylation_fitting", None
1303
+ ),
1304
+ infer_on_percentile_sample_methylation_fitting=merged.get(
1305
+ "infer_on_percentile_sample_methylation_fitting", 10
1306
+ ),
1307
+ inference_variable_sample_methylation_fitting=merged.get(
1308
+ "inference_variable_sample_methylation_fitting", "Raw_modification_signal"
1309
+ ),
1310
+ fit_j_threshold=merged.get("fit_j_threshold", 0.5),
1311
+ output_binary_layer_name=merged.get(
1312
+ "output_binary_layer_name", "binarized_methylation"
1313
+ ),
1314
+ reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
1315
+ reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
1316
+ layer_for_clustermap_plotting=merged.get(
1317
+ "layer_for_clustermap_plotting", "nan0_0minus1"
1318
+ ),
1319
+ clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
1320
+ clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
1321
+ clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
1322
+ clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
1323
+ spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
1324
+ layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
1325
+ umap_layers_to_plot=merged.get(
1326
+ "umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
1327
+ ),
1328
+ rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
1329
+ rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
1330
+ autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
1331
+ autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
1332
+ autocorr_max_lag=merged.get("autocorr_max_lag", 800),
1333
+ autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
1334
+ hmm_n_states=merged.get("hmm_n_states", 2),
1335
+ hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
1336
+ hmm_init_transition_probs=merged.get(
1337
+ "hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
1338
+ ),
1339
+ hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
1340
+ hmm_eps=merged.get("hmm_eps", 1e-8),
1341
+ hmm_fit_strategy=hmm_fit_strategy,
1342
+ hmm_shared_scope=hmm_shared_scope,
1343
+ hmm_groupby=hmm_groupby,
1344
+ hmm_adapt_emissions=hmm_adapt_emissions,
1345
+ hmm_adapt_startprobs=hmm_adapt_startprobs,
1346
+ hmm_emission_adapt_iters=hmm_emission_adapt_iters,
1347
+ hmm_emission_adapt_tol=hmm_emission_adapt_tol,
1348
+ hmm_dtype=merged.get("hmm_dtype", "float64"),
1349
+ hmm_feature_sets=hmm_feature_sets,
1350
+ hmm_annotation_threshold=hmm_annotation_threshold,
1351
+ hmm_batch_size=hmm_batch_size,
1352
+ hmm_use_viterbi=hmm_use_viterbi,
1353
+ hmm_methbases=hmm_methbases,
1354
+ hmm_device=hmm_device,
1355
+ hmm_merge_layer_features=hmm_merge_layer_features,
1356
+ clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
1357
+ hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
1358
+ hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
1359
+ hmm_peak_feature_configs=hmm_peak_feature_configs,
1360
+ footprints=merged.get("footprints", None),
1361
+ accessible_patches=merged.get("accessible_patches", None),
1362
+ cpg=merged.get("cpg", None),
1363
+ read_coord_filter=merged.get("read_coord_filter", [None, None]),
1364
+ read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
1365
+ read_len_to_ref_ratio_filter_thresholds=merged.get(
1366
+ "read_len_to_ref_ratio_filter_thresholds", [0.3, None]
1367
+ ),
1368
+ read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
1369
+ read_mapping_quality_filter_thresholds=merged.get(
1370
+ "read_mapping_quality_filter_thresholds", [None, None]
1371
+ ),
1372
+ read_mod_filtering_gpc_thresholds=merged.get(
1373
+ "read_mod_filtering_gpc_thresholds", [0.025, 0.975]
1374
+ ),
1375
+ read_mod_filtering_cpg_thresholds=merged.get(
1376
+ "read_mod_filtering_cpg_thresholds", [0.0, 1.0]
1377
+ ),
1378
+ read_mod_filtering_c_thresholds=merged.get(
1379
+ "read_mod_filtering_c_thresholds", [0.025, 0.975]
1380
+ ),
1381
+ read_mod_filtering_a_thresholds=merged.get(
1382
+ "read_mod_filtering_a_thresholds", [0.025, 0.975]
1383
+ ),
1384
+ read_mod_filtering_use_other_c_as_background=merged.get(
1385
+ "read_mod_filtering_use_other_c_as_background", True
1386
+ ),
1387
+ min_valid_fraction_positions_in_read_vs_ref=merged.get(
1388
+ "min_valid_fraction_positions_in_read_vs_ref", 0.2
1389
+ ),
1390
+ duplicate_detection_site_types=merged.get(
1391
+ "duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
1392
+ ),
1393
+ duplicate_detection_distance_threshold=merged.get(
1394
+ "duplicate_detection_distance_threshold", 0.07
1395
+ ),
1396
+ duplicate_detection_keep_best_metric=merged.get(
1397
+ "duplicate_detection_keep_best_metric", "read_quality"
1398
+ ),
1399
+ duplicate_detection_window_size_for_hamming_neighbors=merged.get(
1400
+ "duplicate_detection_window_size_for_hamming_neighbors", 50
1401
+ ),
1402
+ duplicate_detection_min_overlapping_positions=merged.get(
1403
+ "duplicate_detection_min_overlapping_positions", 20
1404
+ ),
1405
+ duplicate_detection_do_hierarchical=merged.get(
1406
+ "duplicate_detection_do_hierarchical", True
1407
+ ),
1408
+ duplicate_detection_hierarchical_linkage=merged.get(
1409
+ "duplicate_detection_hierarchical_linkage", "average"
1410
+ ),
1411
+ duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
1412
+ position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
1413
+ correlation_matrix_types=merged.get(
1414
+ "correlation_matrix_types", ["pearson", "binary_covariance"]
1415
+ ),
1416
+ correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1417
+ correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
1418
+ hamming_vs_metric_keys=merged.get(
1419
+ "hamming_vs_metric_keys", ["Fraction_C_site_modified"]
1420
+ ),
1421
+ force_redo_load_adata=merged.get("force_redo_load_adata", False),
1422
+ force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
1423
+ force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
1424
+ bypass_add_read_length_and_mapping_qc=merged.get(
1425
+ "bypass_add_read_length_and_mapping_qc", False
1426
+ ),
1427
+ force_redo_add_read_length_and_mapping_qc=merged.get(
1428
+ "force_redo_add_read_length_and_mapping_qc", False
1429
+ ),
1430
+ bypass_clean_nan=merged.get("bypass_clean_nan", False),
1431
+ force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
1432
+ bypass_append_base_context=merged.get("bypass_append_base_context", False),
1433
+ force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
1434
+ invert_adata=merged.get("invert_adata", False),
1435
+ bypass_append_binary_layer_by_base_context=merged.get(
1436
+ "bypass_append_binary_layer_by_base_context", False
1437
+ ),
1438
+ force_redo_append_binary_layer_by_base_context=merged.get(
1439
+ "force_redo_append_binary_layer_by_base_context", False
1440
+ ),
1441
+ bypass_calculate_read_modification_stats=merged.get(
1442
+ "bypass_calculate_read_modification_stats", False
1443
+ ),
1444
+ force_redo_calculate_read_modification_stats=merged.get(
1445
+ "force_redo_calculate_read_modification_stats", False
1446
+ ),
1447
+ bypass_filter_reads_on_modification_thresholds=merged.get(
1448
+ "bypass_filter_reads_on_modification_thresholds", False
1449
+ ),
1450
+ force_redo_filter_reads_on_modification_thresholds=merged.get(
1451
+ "force_redo_filter_reads_on_modification_thresholds", False
1452
+ ),
1453
+ bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
1454
+ force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
1455
+ bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
1456
+ force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
1457
+ force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
1458
+ bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
1459
+ force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
1460
+ bypass_basic_umap=merged.get("bypass_basic_umap", False),
1461
+ force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
1462
+ bypass_spatial_autocorr_calculations=merged.get(
1463
+ "bypass_spatial_autocorr_calculations", False
1464
+ ),
1465
+ force_redo_spatial_autocorr_calculations=merged.get(
1466
+ "force_redo_spatial_autocorr_calculations", False
1467
+ ),
1468
+ bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
1469
+ force_redo_spatial_autocorr_plotting=merged.get(
1470
+ "force_redo_spatial_autocorr_plotting", False
1471
+ ),
1472
+ bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
1473
+ force_redo_matrix_corr_calculations=merged.get(
1474
+ "force_redo_matrix_corr_calculations", False
1475
+ ),
1476
+ bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
1477
+ force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
1478
+ bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
1479
+ force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
1480
+ bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
1481
+ force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
1482
+ config_source=config_source or "<var_dict>",
1268
1483
  )
1269
1484
 
1270
1485
  report = {
@@ -1291,9 +1506,20 @@ class ExperimentConfig:
1291
1506
  Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
1292
1507
  Additional kwargs passed to from_var_dict().
1293
1508
  """
1294
- loader = LoadExperimentConfig(csv_input) if not isinstance(csv_input, pd.DataFrame) else LoadExperimentConfig(pd.DataFrame(csv_input))
1509
+ loader = (
1510
+ LoadExperimentConfig(csv_input)
1511
+ if not isinstance(csv_input, pd.DataFrame)
1512
+ else LoadExperimentConfig(pd.DataFrame(csv_input))
1513
+ )
1295
1514
  var_dict = loader.var_dict
1296
- return cls.from_var_dict(var_dict, date_str=date_str, config_source=config_source, defaults_dir=defaults_dir, defaults_map=defaults_map, **kwargs)
1515
+ return cls.from_var_dict(
1516
+ var_dict,
1517
+ date_str=date_str,
1518
+ config_source=config_source,
1519
+ defaults_dir=defaults_dir,
1520
+ defaults_map=defaults_map,
1521
+ **kwargs,
1522
+ )
1297
1523
 
1298
1524
  # -------------------------
1299
1525
  # validation & serialization
@@ -1306,7 +1532,9 @@ class ExperimentConfig:
1306
1532
  return errs
1307
1533
  for g, info in hfs.items():
1308
1534
  if not isinstance(info, dict):
1309
- errs.append(f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'.")
1535
+ errs.append(
1536
+ f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
1537
+ )
1310
1538
  continue
1311
1539
  feats = info.get("features")
1312
1540
  if not isinstance(feats, dict) or len(feats) == 0:
@@ -1316,7 +1544,9 @@ class ExperimentConfig:
1316
1544
  try:
1317
1545
  lo, hi = float(rng[0]), float(rng[1])
1318
1546
  if lo < 0 or hi <= lo:
1319
- errs.append(f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}.")
1547
+ errs.append(
1548
+ f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
1549
+ )
1320
1550
  except Exception:
1321
1551
  errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
1322
1552
  return errs
@@ -1349,13 +1579,18 @@ class ExperimentConfig:
1349
1579
 
1350
1580
  if not (0.0 <= float(self.mapping_threshold) <= 1.0):
1351
1581
  errors.append("mapping_threshold must be in [0,1].")
1352
- for t in (self.filter_threshold, self.m6A_threshold, self.m5C_threshold, self.hm5C_threshold):
1582
+ for t in (
1583
+ self.filter_threshold,
1584
+ self.m6A_threshold,
1585
+ self.m5C_threshold,
1586
+ self.hm5C_threshold,
1587
+ ):
1353
1588
  if not (0.0 <= float(t) <= 1.0):
1354
1589
  errors.append(f"threshold value {t} must be in [0,1].")
1355
1590
 
1356
1591
  if raise_on_error and errors:
1357
1592
  raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
1358
-
1593
+
1359
1594
  errs = _validate_hmm_features_structure(self.hmm_feature_sets)
1360
1595
  errors.extend(errs)
1361
1596