smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,26 @@
1
1
  # experiment_config.py
2
2
  from __future__ import annotations
3
+
3
4
  import ast
4
5
  import json
5
6
  import warnings
6
- from dataclasses import dataclass, field, asdict
7
+ from dataclasses import asdict, dataclass, field
7
8
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
10
+
11
+ from smftools.constants import (
12
+ BAM_SUFFIX,
13
+ BARCODE_BOTH_ENDS,
14
+ CONVERSIONS,
15
+ MOD_LIST,
16
+ MOD_MAP,
17
+ REF_COL,
18
+ SAMPLE_COL,
19
+ SPLIT_DIR,
20
+ STRANDS,
21
+ TRIM,
22
+ )
23
+
9
24
  from .discover_input_files import discover_input_files
10
25
 
11
26
  # Optional dependency for YAML handling
@@ -14,8 +29,8 @@ try:
14
29
  except Exception:
15
30
  yaml = None
16
31
 
17
- import pandas as pd
18
32
  import numpy as np
33
+ import pandas as pd
19
34
 
20
35
 
21
36
  # -------------------------
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
81
96
  except Exception:
82
97
  return fallback
83
98
 
99
+
84
100
  def _try_json_or_literal(s: Any) -> Any:
85
101
  """Try parse JSON or python literal; otherwise return original string."""
86
102
  if s is None:
@@ -123,8 +139,8 @@ def resolve_aligner_args(
123
139
  """
124
140
  # builtin defaults (aligner -> args)
125
141
  builtin_defaults = {
126
- "minimap2": ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
127
- "dorado": ['--mm2-opts', '-N', '5'],
142
+ "minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
143
+ "dorado": ["--mm2-opts", "-N", "5"],
128
144
  }
129
145
  if default_by_aligner is None:
130
146
  default_by_aligner = builtin_defaults
@@ -275,6 +291,7 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
275
291
  canonical[grp] = {"features": feats, "state": state}
276
292
  return canonical
277
293
 
294
+
278
295
  def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
279
296
  """
280
297
  Normalize user-provided `hmm_peak_feature_configs` into:
@@ -365,12 +382,12 @@ class LoadExperimentConfig:
365
382
  df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
366
383
  # normalize column names
367
384
  df.columns = [c.strip() for c in df.columns]
368
- if 'variable' not in df.columns:
385
+ if "variable" not in df.columns:
369
386
  raise ValueError("Config CSV must contain a 'variable' column.")
370
- if 'value' not in df.columns:
371
- df['value'] = ''
372
- if 'type' not in df.columns:
373
- df['type'] = ''
387
+ if "value" not in df.columns:
388
+ df["value"] = ""
389
+ if "type" not in df.columns:
390
+ df["type"] = ""
374
391
  return df
375
392
 
376
393
  @staticmethod
@@ -389,9 +406,9 @@ class LoadExperimentConfig:
389
406
 
390
407
  def parse_bool(s: str):
391
408
  s2 = s.strip().lower()
392
- if s2 in ('1', 'true', 't', 'yes', 'y', 'on'):
409
+ if s2 in ("1", "true", "t", "yes", "y", "on"):
393
410
  return True
394
- if s2 in ('0', 'false', 'f', 'no', 'n', 'off'):
411
+ if s2 in ("0", "false", "f", "no", "n", "off"):
395
412
  return False
396
413
  raise ValueError(f"Cannot parse boolean from '{s}'")
397
414
 
@@ -411,18 +428,18 @@ class LoadExperimentConfig:
411
428
  except Exception:
412
429
  pass
413
430
  # fallback split
414
- parts = [p.strip() for p in s.strip("()[] ").split(',') if p.strip() != ""]
431
+ parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
415
432
  return parts
416
433
 
417
- if hint in ('int', 'integer'):
434
+ if hint in ("int", "integer"):
418
435
  return int(v)
419
- if hint in ('float', 'double'):
436
+ if hint in ("float", "double"):
420
437
  return float(v)
421
- if hint in ('bool', 'boolean'):
438
+ if hint in ("bool", "boolean"):
422
439
  return parse_bool(v)
423
- if hint in ('list', 'array'):
440
+ if hint in ("list", "array"):
424
441
  return parse_list_like(v)
425
- if hint in ('string', 'str'):
442
+ if hint in ("string", "str"):
426
443
  return v
427
444
 
428
445
  # infer
@@ -448,27 +465,31 @@ class LoadExperimentConfig:
448
465
  return lit
449
466
  except Exception:
450
467
  pass
451
- if (',' in v) and (not any(ch in v for ch in '{}[]()')):
452
- return [p.strip() for p in v.split(',') if p.strip() != ""]
468
+ if ("," in v) and (not any(ch in v for ch in "{}[]()")):
469
+ return [p.strip() for p in v.split(",") if p.strip() != ""]
453
470
  return v
454
471
 
455
472
  def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
456
473
  parsed: Dict[str, Any] = {}
457
474
  for idx, row in df.iterrows():
458
- name = str(row['variable']).strip()
475
+ name = str(row["variable"]).strip()
459
476
  if name == "":
460
477
  continue
461
- raw_val = row.get('value', "")
462
- raw_type = row.get('type', "")
478
+ raw_val = row.get("value", "")
479
+ raw_type = row.get("type", "")
463
480
  if pd.isna(raw_val) or str(raw_val).strip() == "":
464
481
  raw_val = None
465
482
  try:
466
483
  parsed_val = self._parse_value_as_type(raw_val, raw_type)
467
484
  except Exception as e:
468
- warnings.warn(f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value.")
485
+ warnings.warn(
486
+ f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
487
+ )
469
488
  parsed_val = None if raw_val is None else raw_val
470
489
  if name in parsed:
471
- warnings.warn(f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value.")
490
+ warnings.warn(
491
+ f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
492
+ )
472
493
  parsed[name] = parsed_val
473
494
  return parsed
474
495
 
@@ -476,7 +497,7 @@ class LoadExperimentConfig:
476
497
  """Return parsed config as a pandas DataFrame (variable, value)."""
477
498
  rows = []
478
499
  for k, v in self.var_dict.items():
479
- rows.append({'variable': k, 'value': v})
500
+ rows.append({"variable": k, "value": v})
480
501
  return pd.DataFrame(rows)
481
502
 
482
503
 
@@ -644,17 +665,17 @@ class ExperimentConfig:
644
665
  input_data_path: Optional[str] = None
645
666
  output_directory: Optional[str] = None
646
667
  fasta: Optional[str] = None
647
- bam_suffix: str = ".bam"
668
+ bam_suffix: str = BAM_SUFFIX
648
669
  recursive_input_search: bool = True
649
670
  input_type: Optional[str] = None
650
671
  input_files: Optional[List[Path]] = None
651
- split_dir: str = "demultiplexed_BAMs"
672
+ split_dir: str = SPLIT_DIR
652
673
  split_path: Optional[str] = None
653
- strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
654
- conversions: List[str] = field(default_factory=lambda: ["unconverted"])
674
+ strands: List[str] = field(default_factory=lambda: STRANDS)
675
+ conversions: List[str] = field(default_factory=lambda: CONVERSIONS)
655
676
  fasta_regions_of_interest: Optional[str] = None
656
677
  sample_sheet_path: Optional[str] = None
657
- sample_sheet_mapping_column: Optional[str] = 'Barcode'
678
+ sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
658
679
  experiment_name: Optional[str] = None
659
680
  input_already_demuxed: bool = False
660
681
  summary_file: Optional[Path] = None
@@ -690,8 +711,8 @@ class ExperimentConfig:
690
711
  model_dir: Optional[str] = None
691
712
  barcode_kit: Optional[str] = None
692
713
  model: str = "hac"
693
- barcode_both_ends: bool = False
694
- trim: bool = False
714
+ barcode_both_ends: bool = BARCODE_BOTH_ENDS
715
+ trim: bool = TRIM
695
716
  # General basecalling params
696
717
  filter_threshold: float = 0.8
697
718
  # Modified basecalling specific params
@@ -699,44 +720,72 @@ class ExperimentConfig:
699
720
  m5C_threshold: float = 0.7
700
721
  hm5C_threshold: float = 0.7
701
722
  thresholds: List[float] = field(default_factory=list)
702
- mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
703
- mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
723
+ mod_list: List[str] = field(
724
+ default_factory=lambda: list(MOD_LIST)
725
+ ) # Dorado modified basecalling codes
726
+ mod_map: Dict[str, str] = field(
727
+ default_factory=lambda: dict(MOD_MAP)
728
+ ) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
704
729
 
705
730
  # Alignment params
706
- mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
707
- align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
731
+ mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
732
+ align_from_bam: bool = (
733
+ False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
734
+ )
708
735
  aligner: str = "dorado"
709
736
  aligner_args: Optional[List[str]] = None
710
737
  make_bigwigs: bool = False
711
738
  make_beds: bool = False
712
739
 
713
740
  # Anndata structure
714
- reference_column: Optional[str] = 'Reference_strand'
715
- sample_column: Optional[str] = 'Barcode'
741
+ reference_column: Optional[str] = REF_COL
742
+ sample_column: Optional[str] = SAMPLE_COL
716
743
 
717
744
  # General Plotting
718
- sample_name_col_for_plotting: Optional[str] = 'Barcode'
745
+ sample_name_col_for_plotting: Optional[str] = "Barcode"
719
746
  rows_per_qc_histogram_grid: int = 12
720
747
 
721
748
  # Preprocessing - Read length and quality filter params
722
749
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
723
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
724
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
725
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
726
- read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
750
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(
751
+ default_factory=lambda: [100, None]
752
+ )
753
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
754
+ default_factory=lambda: [0.4, 1.5]
755
+ )
756
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(
757
+ default_factory=lambda: [15, None]
758
+ )
759
+ read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
760
+ default_factory=lambda: [None, None]
761
+ )
727
762
 
728
763
  # Preprocessing - Optional reindexing params
729
764
  reindexing_offsets: Dict[str, int] = field(default_factory=dict)
730
765
  reindexed_var_suffix: Optional[str] = "reindexed"
731
766
 
732
767
  # Preprocessing - Direct mod detection binarization params
733
- fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
734
- binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
735
- positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
736
- negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
737
- infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
738
- inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
739
- fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
768
+ fit_position_methylation_thresholds: Optional[bool] = (
769
+ False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
770
+ )
771
+ binarize_on_fixed_methlyation_threshold: Optional[float] = (
772
+ 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
773
+ )
774
+ positive_control_sample_methylation_fitting: Optional[str] = (
775
+ None # A positive control Sample_name to use for fully modified template data
776
+ )
777
+ negative_control_sample_methylation_fitting: Optional[str] = (
778
+ None # A negative control Sample_name to use for fully unmodified template data
779
+ )
780
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = (
781
+ 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
782
+ )
783
+ inference_variable_sample_methylation_fitting: Optional[str] = (
784
+ "Raw_modification_signal" # The obs column value used for the percentile metric above.
785
+ )
786
+ fit_j_threshold: Optional[float] = (
787
+ 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
788
+ )
740
789
  output_binary_layer_name: Optional[str] = "binarized_methylation"
741
790
 
742
791
  # Preprocessing - Read modification filter params
@@ -748,13 +797,25 @@ class ExperimentConfig:
748
797
  min_valid_fraction_positions_in_read_vs_ref: float = 0.2
749
798
 
750
799
  # Preprocessing - plotting params
751
- obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
800
+ obs_to_plot_pp_qc: List[str] = field(
801
+ default_factory=lambda: [
802
+ "read_length",
803
+ "mapped_length",
804
+ "read_quality",
805
+ "mapping_quality",
806
+ "mapped_length_to_reference_length_ratio",
807
+ "mapped_length_to_read_length_ratio",
808
+ "Raw_modification_signal",
809
+ ]
810
+ )
752
811
 
753
812
  # Preprocessing - Duplicate detection params
754
- duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
813
+ duplicate_detection_site_types: List[str] = field(
814
+ default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
815
+ )
755
816
  duplicate_detection_distance_threshold: float = 0.07
756
- hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
757
- duplicate_detection_keep_best_metric: str ='read_quality'
817
+ hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
818
+ duplicate_detection_keep_best_metric: str = "read_quality"
758
819
  duplicate_detection_window_size_for_hamming_neighbors: int = 50
759
820
  duplicate_detection_min_overlapping_positions: int = 20
760
821
  duplicate_detection_do_hierarchical: bool = True
@@ -765,32 +826,37 @@ class ExperimentConfig:
765
826
  position_max_nan_threshold: float = 0.1
766
827
 
767
828
  # Spatial Analysis - Clustermap params
768
- layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
769
- clustermap_cmap_c: Optional[str] = 'coolwarm'
770
- clustermap_cmap_gpc: Optional[str] = 'coolwarm'
771
- clustermap_cmap_cpg: Optional[str] = 'coolwarm'
772
- clustermap_cmap_a: Optional[str] = 'coolwarm'
773
- spatial_clustermap_sortby: Optional[str] = 'gpc'
829
+ layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
830
+ clustermap_cmap_c: Optional[str] = "coolwarm"
831
+ clustermap_cmap_gpc: Optional[str] = "coolwarm"
832
+ clustermap_cmap_cpg: Optional[str] = "coolwarm"
833
+ clustermap_cmap_a: Optional[str] = "coolwarm"
834
+ spatial_clustermap_sortby: Optional[str] = "gpc"
774
835
 
775
836
  # Spatial Analysis - UMAP/Leiden params
776
- layer_for_umap_plotting: Optional[str] = 'nan_half'
777
- umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
837
+ layer_for_umap_plotting: Optional[str] = "nan_half"
838
+ umap_layers_to_plot: List[str] = field(
839
+ default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
840
+ )
778
841
 
779
842
  # Spatial Analysis - Spatial Autocorrelation params
843
+ autocorr_normalization_method: str = "pearson"
780
844
  rows_per_qc_autocorr_grid: int = 12
781
845
  autocorr_rolling_window_size: int = 25
782
846
  autocorr_max_lag: int = 800
783
- autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
847
+ autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])
784
848
 
785
849
  # Spatial Analysis - Correlation Matrix params
786
- correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
787
- correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
788
- correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
850
+ correlation_matrix_types: List[str] = field(
851
+ default_factory=lambda: ["pearson", "binary_covariance"]
852
+ )
853
+ correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
854
+ correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
789
855
 
790
856
  # HMM params
791
857
  hmm_n_states: int = 2
792
- hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
793
- hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
858
+ hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
859
+ hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
794
860
  hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
795
861
  hmm_eps: float = 1e-8
796
862
  hmm_dtype: str = "float64"
@@ -798,15 +864,28 @@ class ExperimentConfig:
798
864
  hmm_batch_size: int = 1024
799
865
  hmm_use_viterbi: bool = False
800
866
  hmm_device: Optional[str] = None
801
- hmm_methbases: Optional[List[str]] = None # if None, HMM.annotate_adata will fall back to mod_target_bases
867
+ hmm_methbases: Optional[List[str]] = (
868
+ None # if None, HMM.annotate_adata will fall back to mod_target_bases
869
+ )
870
+ # HMM fitting/application strategy
871
+ hmm_fit_strategy: str = "per_group" # "per_group" | "shared_transitions"
872
+ hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
873
+ hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
874
+ # Shared-transitions adaptation behavior
875
+ hmm_adapt_emissions: bool = True
876
+ hmm_adapt_startprobs: bool = True
877
+ hmm_emission_adapt_iters: int = 5
878
+ hmm_emission_adapt_tol: float = 1e-4
802
879
  footprints: Optional[bool] = True
803
880
  accessible_patches: Optional[bool] = True
804
881
  cpg: Optional[bool] = False
805
882
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
806
- hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
807
- clustermap_cmap_hmm: Optional[str] = 'coolwarm'
808
- hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
809
- hmm_clustermap_sortby: Optional[str] = 'hmm'
883
+ hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
884
+ clustermap_cmap_hmm: Optional[str] = "coolwarm"
885
+ hmm_clustermap_feature_layers: List[str] = field(
886
+ default_factory=lambda: ["all_accessible_features"]
887
+ )
888
+ hmm_clustermap_sortby: Optional[str] = "hmm"
810
889
  hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
811
890
 
812
891
  # Pipeline control flow - load adata
@@ -830,7 +909,7 @@ class ExperimentConfig:
830
909
  force_redo_filter_reads_on_modification_thresholds: bool = False
831
910
  bypass_flag_duplicate_reads: bool = False
832
911
  force_redo_flag_duplicate_reads: bool = False
833
- bypass_complexity_analysis: bool = False
912
+ bypass_complexity_analysis: bool = False
834
913
  force_redo_complexity_analysis: bool = False
835
914
 
836
915
  # Pipeline control flow - Spatial Analyses
@@ -910,7 +989,9 @@ class ExperimentConfig:
910
989
  defaults_loaded = dict(defaults_map[modality] or {})
911
990
  defaults_source_chain = [f"defaults_map['{modality}']"]
912
991
  elif defaults_dir is not None:
913
- defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(defaults_dir, modality)
992
+ defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
993
+ defaults_dir, modality
994
+ )
914
995
 
915
996
  # If CSV asks to extend defaults, load those and merge
916
997
  merged = dict(defaults_loaded or {})
@@ -925,7 +1006,11 @@ class ExperimentConfig:
925
1006
  else:
926
1007
  ext_list = []
927
1008
  for ext in ext_list:
928
- ext_defaults, ext_sources = (load_defaults_with_inheritance(defaults_dir, ext) if defaults_dir else ({}, []))
1009
+ ext_defaults, ext_sources = (
1010
+ load_defaults_with_inheritance(defaults_dir, ext)
1011
+ if defaults_dir
1012
+ else ({}, [])
1013
+ )
929
1014
  merged = deep_merge(merged, ext_defaults)
930
1015
  for s in ext_sources:
931
1016
  if s not in defaults_source_chain:
@@ -955,34 +1040,40 @@ class ExperimentConfig:
955
1040
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
956
1041
 
957
1042
  # Input file types and path handling
958
- input_data_path = Path(merged['input_data_path'])
1043
+ input_data_path = Path(merged["input_data_path"])
959
1044
 
960
1045
  # Detect the input filetype
961
1046
  if input_data_path.is_file():
962
- suffix = input_data_path.suffix.lower()
963
- suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
964
-
965
- # recognize multi-suffix cases like .fastq.gz or .fq.gz
966
- if any(s in ['.pod5', '.p5'] for s in suffixes):
967
- input_type = "pod5"
968
- input_files = [Path(input_data_path)]
969
- elif any(s in ['.fast5', '.f5'] for s in suffixes):
970
- input_type = "fast5"
971
- input_files = [Path(input_data_path)]
972
- elif any(s in ['.fastq', '.fq'] for s in suffixes):
973
- input_type = "fastq"
974
- input_files = [Path(input_data_path)]
975
- elif any(s in ['.bam'] for s in suffixes):
976
- input_type = "bam"
977
- input_files = [Path(input_data_path)]
978
- elif any(s in ['.h5ad', ".h5"] for s in suffixes):
979
- input_type = "h5ad"
980
- input_files = [Path(input_data_path)]
981
- else:
982
- print("Error detecting input file type")
1047
+ suffix = input_data_path.suffix.lower()
1048
+ suffixes = [
1049
+ s.lower() for s in input_data_path.suffixes
1050
+ ] # handles multi-part extensions
1051
+
1052
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
1053
+ if any(s in [".pod5", ".p5"] for s in suffixes):
1054
+ input_type = "pod5"
1055
+ input_files = [Path(input_data_path)]
1056
+ elif any(s in [".fast5", ".f5"] for s in suffixes):
1057
+ input_type = "fast5"
1058
+ input_files = [Path(input_data_path)]
1059
+ elif any(s in [".fastq", ".fq"] for s in suffixes):
1060
+ input_type = "fastq"
1061
+ input_files = [Path(input_data_path)]
1062
+ elif any(s in [".bam"] for s in suffixes):
1063
+ input_type = "bam"
1064
+ input_files = [Path(input_data_path)]
1065
+ elif any(s in [".h5ad", ".h5"] for s in suffixes):
1066
+ input_type = "h5ad"
1067
+ input_files = [Path(input_data_path)]
1068
+ else:
1069
+ print("Error detecting input file type")
983
1070
 
984
1071
  elif input_data_path.is_dir():
985
- found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
1072
+ found = discover_input_files(
1073
+ input_data_path,
1074
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1075
+ recursive=merged["recursive_input_search"],
1076
+ )
986
1077
 
987
1078
  if found["input_is_pod5"]:
988
1079
  input_type = "pod5"
@@ -1010,12 +1101,12 @@ class ExperimentConfig:
1010
1101
  )
1011
1102
 
1012
1103
  # summary file output path
1013
- output_dir = Path(merged['output_directory'])
1014
- summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
1104
+ output_dir = Path(merged["output_directory"])
1105
+ summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
1015
1106
  summary_file = output_dir / summary_file_basename
1016
1107
 
1017
1108
  # Demultiplexing output path
1018
- split_dir = merged.get("split_dir", "demultiplexed_BAMs")
1109
+ split_dir = merged.get("split_dir", SPLIT_DIR)
1019
1110
  split_path = output_dir / split_dir
1020
1111
 
1021
1112
  # final normalization
@@ -1039,7 +1130,14 @@ class ExperimentConfig:
1039
1130
  merged["hm5C_threshold"],
1040
1131
  ]
1041
1132
 
1042
- for bkey in ("barcode_both_ends", "trim", "input_already_demuxed", "make_bigwigs", "skip_unclassified", "delete_batch_hdfs"):
1133
+ for bkey in (
1134
+ "barcode_both_ends",
1135
+ "trim",
1136
+ "input_already_demuxed",
1137
+ "make_bigwigs",
1138
+ "skip_unclassified",
1139
+ "delete_batch_hdfs",
1140
+ ):
1043
1141
  if bkey in merged:
1044
1142
  merged[bkey] = _parse_bool(merged[bkey])
1045
1143
 
@@ -1048,12 +1146,12 @@ class ExperimentConfig:
1048
1146
  if "threads" in merged:
1049
1147
  tval = _parse_numeric(merged.get("threads", None), None)
1050
1148
  merged["threads"] = None if tval is None else int(tval)
1051
-
1149
+
1052
1150
  if "aligner_args" in merged and merged.get("aligner_args") is None:
1053
1151
  merged.pop("aligner_args", None)
1054
1152
 
1055
1153
  # --- Resolve aligner_args into concrete list for the chosen aligner ---
1056
- merged['aligner_args'] = resolve_aligner_args(merged)
1154
+ merged["aligner_args"] = resolve_aligner_args(merged)
1057
1155
 
1058
1156
  if "mod_list" in merged:
1059
1157
  merged["mod_list"] = _parse_list(merged.get("mod_list"))
@@ -1068,11 +1166,22 @@ class ExperimentConfig:
1068
1166
  # allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
1069
1167
  maybe_fs = {}
1070
1168
  if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
1071
- maybe_fs["footprint"] = {"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")), "state": merged.get("hmm_footprint_state", "Non-Modified")}
1169
+ maybe_fs["footprint"] = {
1170
+ "features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
1171
+ "state": merged.get("hmm_footprint_state", "Non-Modified"),
1172
+ }
1072
1173
  if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
1073
- maybe_fs["accessible"] = {"features": merged.get("hmm_accessible_ranges", merged.get("accessible_ranges")), "state": merged.get("hmm_accessible_state", "Modified")}
1174
+ maybe_fs["accessible"] = {
1175
+ "features": merged.get(
1176
+ "hmm_accessible_ranges", merged.get("accessible_ranges")
1177
+ ),
1178
+ "state": merged.get("hmm_accessible_state", "Modified"),
1179
+ }
1074
1180
  if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
1075
- maybe_fs["cpg"] = {"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")), "state": merged.get("hmm_cpg_state", "Modified")}
1181
+ maybe_fs["cpg"] = {
1182
+ "features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
1183
+ "state": merged.get("hmm_cpg_state", "Modified"),
1184
+ }
1076
1185
  if maybe_fs:
1077
1186
  merged.setdefault("hmm_feature_sets", {})
1078
1187
  for k, v in maybe_fs.items():
@@ -1093,10 +1202,23 @@ class ExperimentConfig:
1093
1202
  if not hmm_methbases: # None or []
1094
1203
  hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
1095
1204
  if not hmm_methbases:
1096
- hmm_methbases = ['C']
1205
+ hmm_methbases = ["C"]
1097
1206
  hmm_methbases = list(hmm_methbases)
1098
1207
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
1099
- hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
1208
+ hmm_clustermap_feature_layers = _parse_list(
1209
+ merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
1210
+ )
1211
+
1212
+ hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
1213
+ hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
1214
+ hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
1215
+
1216
+ hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
1217
+ hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
1218
+ hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
1219
+ hmm_emission_adapt_tol = float(
1220
+ _parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
1221
+ )
1100
1222
 
1101
1223
  # HMM peak feature configs (for call_hmm_peaks)
1102
1224
  merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
@@ -1106,165 +1228,252 @@ class ExperimentConfig:
1106
1228
 
1107
1229
  # instantiate dataclass
1108
1230
  instance = cls(
1109
- smf_modality = merged.get("smf_modality"),
1110
- input_data_path = input_data_path,
1111
- recursive_input_search = merged.get("recursive_input_search"),
1112
- input_type = input_type,
1113
- input_files = input_files,
1114
- output_directory = output_dir,
1115
- summary_file = summary_file,
1116
- fasta = merged.get("fasta"),
1117
- sequencer = merged.get("sequencer"),
1118
- model_dir = merged.get("model_dir"),
1119
- barcode_kit = merged.get("barcode_kit"),
1120
- fastq_barcode_map = merged.get("fastq_barcode_map"),
1121
- fastq_auto_pairing = merged.get("fastq_auto_pairing"),
1122
- bam_suffix = merged.get("bam_suffix", ".bam"),
1123
- split_dir = split_dir,
1124
- split_path = split_path,
1125
- strands = merged.get("strands", ["bottom","top"]),
1126
- conversions = merged.get("conversions", ["unconverted"]),
1127
- fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
1128
- mapping_threshold = float(merged.get("mapping_threshold", 0.01)),
1129
- experiment_name = merged.get("experiment_name"),
1130
- model = merged.get("model", "hac"),
1131
- barcode_both_ends = merged.get("barcode_both_ends", False),
1132
- trim = merged.get("trim", False),
1133
- input_already_demuxed = merged.get("input_already_demuxed", False),
1134
- threads = merged.get("threads"),
1135
- sample_sheet_path = merged.get("sample_sheet_path"),
1136
- sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1137
- delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
1138
- delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
1139
- align_from_bam = merged.get("align_from_bam", False),
1140
- aligner = merged.get("aligner", "minimap2"),
1141
- aligner_args = merged.get("aligner_args", None),
1142
- device = merged.get("device", "auto"),
1143
- make_bigwigs = merged.get("make_bigwigs", False),
1144
- make_beds = merged.get("make_beds", False),
1145
- delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
1146
- mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
1147
- enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
1148
- conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
1149
- filter_threshold = merged.get("filter_threshold", 0.8),
1150
- m6A_threshold = merged.get("m6A_threshold", 0.7),
1151
- m5C_threshold = merged.get("m5C_threshold", 0.7),
1152
- hm5C_threshold = merged.get("hm5C_threshold", 0.7),
1153
- thresholds = merged.get("thresholds", []),
1154
- mod_list = merged.get("mod_list", ["5mC_5hmC","6mA"]),
1155
- batch_size = merged.get("batch_size", 4),
1156
- skip_unclassified = merged.get("skip_unclassified", True),
1157
- delete_batch_hdfs = merged.get("delete_batch_hdfs", True),
1158
- reference_column = merged.get("reference_column", 'Reference_strand'),
1159
- sample_column = merged.get("sample_column", 'Barcode'),
1160
- sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1161
- obs_to_plot_pp_qc = obs_to_plot_pp_qc,
1162
- fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1163
- binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1164
- positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1165
- negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1166
- infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1167
- inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1168
- fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1169
- output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
1170
- reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
1171
- reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
1172
- layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
1173
- clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
1174
- clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
1175
- clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
1176
- clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
1177
- spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
1178
- layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
1179
- umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
1180
- rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
1181
- rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
1182
- autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
1183
- autocorr_max_lag = merged.get("autocorr_max_lag", 800),
1184
- autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
1185
- hmm_n_states = merged.get("hmm_n_states", 2),
1186
- hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
1187
- hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
1188
- hmm_init_start_probs = merged.get("hmm_init_start_probs",[0.5, 0.5]),
1189
- hmm_eps = merged.get("hmm_eps", 1e-8),
1190
- hmm_dtype = merged.get("hmm_dtype", "float64"),
1191
- hmm_feature_sets = hmm_feature_sets,
1192
- hmm_annotation_threshold = hmm_annotation_threshold,
1193
- hmm_batch_size = hmm_batch_size,
1194
- hmm_use_viterbi = hmm_use_viterbi,
1195
- hmm_methbases = hmm_methbases,
1196
- hmm_device = hmm_device,
1197
- hmm_merge_layer_features = hmm_merge_layer_features,
1198
- clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
1199
- hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
1200
- hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
1201
- hmm_peak_feature_configs = hmm_peak_feature_configs,
1202
- footprints = merged.get("footprints", None),
1203
- accessible_patches = merged.get("accessible_patches", None),
1204
- cpg = merged.get("cpg", None),
1205
- read_coord_filter = merged.get("read_coord_filter", [None, None]),
1206
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1207
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1208
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1209
- read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1210
- read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1211
- read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
1212
- read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
1213
- read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
1214
- read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
1215
- min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
1216
- duplicate_detection_site_types = merged.get("duplicate_detection_site_types", ['GpC', 'CpG', 'ambiguous_GpC_CpG']),
1217
- duplicate_detection_distance_threshold = merged.get("duplicate_detection_distance_threshold", 0.07),
1218
- duplicate_detection_keep_best_metric = merged.get("duplicate_detection_keep_best_metric", "read_quality"),
1219
- duplicate_detection_window_size_for_hamming_neighbors = merged.get("duplicate_detection_window_size_for_hamming_neighbors", 50),
1220
- duplicate_detection_min_overlapping_positions = merged.get("duplicate_detection_min_overlapping_positions", 20),
1221
- duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1222
- duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1223
- duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1224
- position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1225
- correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1226
- correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1227
- correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1228
- hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
1229
- force_redo_load_adata = merged.get("force_redo_load_adata", False),
1230
- force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1231
- force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1232
- bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
1233
- force_redo_add_read_length_and_mapping_qc = merged.get("force_redo_add_read_length_and_mapping_qc", False),
1234
- bypass_clean_nan = merged.get("bypass_clean_nan", False),
1235
- force_redo_clean_nan = merged.get("force_redo_clean_nan", False),
1236
- bypass_append_base_context = merged.get("bypass_append_base_context", False),
1237
- force_redo_append_base_context = merged.get("force_redo_append_base_context", False),
1238
- invert_adata = merged.get("invert_adata", False),
1239
- bypass_append_binary_layer_by_base_context = merged.get("bypass_append_binary_layer_by_base_context", False),
1240
- force_redo_append_binary_layer_by_base_context = merged.get("force_redo_append_binary_layer_by_base_context", False),
1241
- bypass_calculate_read_modification_stats = merged.get("bypass_calculate_read_modification_stats", False),
1242
- force_redo_calculate_read_modification_stats = merged.get("force_redo_calculate_read_modification_stats", False),
1243
- bypass_filter_reads_on_modification_thresholds = merged.get("bypass_filter_reads_on_modification_thresholds", False),
1244
- force_redo_filter_reads_on_modification_thresholds = merged.get("force_redo_filter_reads_on_modification_thresholds", False),
1245
- bypass_flag_duplicate_reads = merged.get("bypass_flag_duplicate_reads", False),
1246
- force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1247
- bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1248
- force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1249
- force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
1250
- bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1251
- force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1252
- bypass_basic_umap = merged.get("bypass_basic_umap", False),
1253
- force_redo_basic_umap = merged.get("force_redo_basic_umap", False),
1254
- bypass_spatial_autocorr_calculations = merged.get("bypass_spatial_autocorr_calculations", False),
1255
- force_redo_spatial_autocorr_calculations = merged.get("force_redo_spatial_autocorr_calculations", False),
1256
- bypass_spatial_autocorr_plotting = merged.get("bypass_spatial_autocorr_plotting", False),
1257
- force_redo_spatial_autocorr_plotting = merged.get("force_redo_spatial_autocorr_plotting", False),
1258
- bypass_matrix_corr_calculations = merged.get("bypass_matrix_corr_calculations", False),
1259
- force_redo_matrix_corr_calculations = merged.get("force_redo_matrix_corr_calculations", False),
1260
- bypass_matrix_corr_plotting = merged.get("bypass_matrix_corr_plotting", False),
1261
- force_redo_matrix_corr_plotting = merged.get("force_redo_matrix_corr_plotting", False),
1262
- bypass_hmm_fit = merged.get("bypass_hmm_fit", False),
1263
- force_redo_hmm_fit = merged.get("force_redo_hmm_fit", False),
1264
- bypass_hmm_apply = merged.get("bypass_hmm_apply", False),
1265
- force_redo_hmm_apply = merged.get("force_redo_hmm_apply", False),
1266
-
1267
- config_source = config_source or "<var_dict>",
1231
+ smf_modality=merged.get("smf_modality"),
1232
+ input_data_path=input_data_path,
1233
+ recursive_input_search=merged.get("recursive_input_search"),
1234
+ input_type=input_type,
1235
+ input_files=input_files,
1236
+ output_directory=output_dir,
1237
+ summary_file=summary_file,
1238
+ fasta=merged.get("fasta"),
1239
+ sequencer=merged.get("sequencer"),
1240
+ model_dir=merged.get("model_dir"),
1241
+ barcode_kit=merged.get("barcode_kit"),
1242
+ fastq_barcode_map=merged.get("fastq_barcode_map"),
1243
+ fastq_auto_pairing=merged.get("fastq_auto_pairing"),
1244
+ bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
1245
+ split_dir=split_dir,
1246
+ split_path=split_path,
1247
+ strands=merged.get("strands", STRANDS),
1248
+ conversions=merged.get("conversions", CONVERSIONS),
1249
+ fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
1250
+ mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
1251
+ experiment_name=merged.get("experiment_name"),
1252
+ model=merged.get("model", "hac"),
1253
+ barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
1254
+ trim=merged.get("trim", TRIM),
1255
+ input_already_demuxed=merged.get("input_already_demuxed", False),
1256
+ threads=merged.get("threads"),
1257
+ sample_sheet_path=merged.get("sample_sheet_path"),
1258
+ sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
1259
+ delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
1260
+ delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
1261
+ align_from_bam=merged.get("align_from_bam", False),
1262
+ aligner=merged.get("aligner", "minimap2"),
1263
+ aligner_args=merged.get("aligner_args", None),
1264
+ device=merged.get("device", "auto"),
1265
+ make_bigwigs=merged.get("make_bigwigs", False),
1266
+ make_beds=merged.get("make_beds", False),
1267
+ delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
1268
+ mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
1269
+ enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
1270
+ conversion_types=merged.get("conversions", ["unconverted"])
1271
+ + merged.get("conversion_types", ["5mC"]),
1272
+ filter_threshold=merged.get("filter_threshold", 0.8),
1273
+ m6A_threshold=merged.get("m6A_threshold", 0.7),
1274
+ m5C_threshold=merged.get("m5C_threshold", 0.7),
1275
+ hm5C_threshold=merged.get("hm5C_threshold", 0.7),
1276
+ thresholds=merged.get("thresholds", []),
1277
+ mod_list=merged.get("mod_list", list(MOD_LIST)),
1278
+ mod_map=merged.get("mod_map", list(MOD_MAP)),
1279
+ batch_size=merged.get("batch_size", 4),
1280
+ skip_unclassified=merged.get("skip_unclassified", True),
1281
+ delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
1282
+ reference_column=merged.get("reference_column", REF_COL),
1283
+ sample_column=merged.get("sample_column", SAMPLE_COL),
1284
+ sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
1285
+ obs_to_plot_pp_qc=obs_to_plot_pp_qc,
1286
+ fit_position_methylation_thresholds=merged.get(
1287
+ "fit_position_methylation_thresholds", False
1288
+ ),
1289
+ binarize_on_fixed_methlyation_threshold=merged.get(
1290
+ "binarize_on_fixed_methlyation_threshold", 0.7
1291
+ ),
1292
+ positive_control_sample_methylation_fitting=merged.get(
1293
+ "positive_control_sample_methylation_fitting", None
1294
+ ),
1295
+ negative_control_sample_methylation_fitting=merged.get(
1296
+ "negative_control_sample_methylation_fitting", None
1297
+ ),
1298
+ infer_on_percentile_sample_methylation_fitting=merged.get(
1299
+ "infer_on_percentile_sample_methylation_fitting", 10
1300
+ ),
1301
+ inference_variable_sample_methylation_fitting=merged.get(
1302
+ "inference_variable_sample_methylation_fitting", "Raw_modification_signal"
1303
+ ),
1304
+ fit_j_threshold=merged.get("fit_j_threshold", 0.5),
1305
+ output_binary_layer_name=merged.get(
1306
+ "output_binary_layer_name", "binarized_methylation"
1307
+ ),
1308
+ reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
1309
+ reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
1310
+ layer_for_clustermap_plotting=merged.get(
1311
+ "layer_for_clustermap_plotting", "nan0_0minus1"
1312
+ ),
1313
+ clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
1314
+ clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
1315
+ clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
1316
+ clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
1317
+ spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
1318
+ layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
1319
+ umap_layers_to_plot=merged.get(
1320
+ "umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
1321
+ ),
1322
+ rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
1323
+ rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
1324
+ autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
1325
+ autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
1326
+ autocorr_max_lag=merged.get("autocorr_max_lag", 800),
1327
+ autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
1328
+ hmm_n_states=merged.get("hmm_n_states", 2),
1329
+ hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
1330
+ hmm_init_transition_probs=merged.get(
1331
+ "hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
1332
+ ),
1333
+ hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
1334
+ hmm_eps=merged.get("hmm_eps", 1e-8),
1335
+ hmm_fit_strategy=hmm_fit_strategy,
1336
+ hmm_shared_scope=hmm_shared_scope,
1337
+ hmm_groupby=hmm_groupby,
1338
+ hmm_adapt_emissions=hmm_adapt_emissions,
1339
+ hmm_adapt_startprobs=hmm_adapt_startprobs,
1340
+ hmm_emission_adapt_iters=hmm_emission_adapt_iters,
1341
+ hmm_emission_adapt_tol=hmm_emission_adapt_tol,
1342
+ hmm_dtype=merged.get("hmm_dtype", "float64"),
1343
+ hmm_feature_sets=hmm_feature_sets,
1344
+ hmm_annotation_threshold=hmm_annotation_threshold,
1345
+ hmm_batch_size=hmm_batch_size,
1346
+ hmm_use_viterbi=hmm_use_viterbi,
1347
+ hmm_methbases=hmm_methbases,
1348
+ hmm_device=hmm_device,
1349
+ hmm_merge_layer_features=hmm_merge_layer_features,
1350
+ clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
1351
+ hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
1352
+ hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
1353
+ hmm_peak_feature_configs=hmm_peak_feature_configs,
1354
+ footprints=merged.get("footprints", None),
1355
+ accessible_patches=merged.get("accessible_patches", None),
1356
+ cpg=merged.get("cpg", None),
1357
+ read_coord_filter=merged.get("read_coord_filter", [None, None]),
1358
+ read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
1359
+ read_len_to_ref_ratio_filter_thresholds=merged.get(
1360
+ "read_len_to_ref_ratio_filter_thresholds", [0.3, None]
1361
+ ),
1362
+ read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
1363
+ read_mapping_quality_filter_thresholds=merged.get(
1364
+ "read_mapping_quality_filter_thresholds", [None, None]
1365
+ ),
1366
+ read_mod_filtering_gpc_thresholds=merged.get(
1367
+ "read_mod_filtering_gpc_thresholds", [0.025, 0.975]
1368
+ ),
1369
+ read_mod_filtering_cpg_thresholds=merged.get(
1370
+ "read_mod_filtering_cpg_thresholds", [0.0, 1.0]
1371
+ ),
1372
+ read_mod_filtering_c_thresholds=merged.get(
1373
+ "read_mod_filtering_c_thresholds", [0.025, 0.975]
1374
+ ),
1375
+ read_mod_filtering_a_thresholds=merged.get(
1376
+ "read_mod_filtering_a_thresholds", [0.025, 0.975]
1377
+ ),
1378
+ read_mod_filtering_use_other_c_as_background=merged.get(
1379
+ "read_mod_filtering_use_other_c_as_background", True
1380
+ ),
1381
+ min_valid_fraction_positions_in_read_vs_ref=merged.get(
1382
+ "min_valid_fraction_positions_in_read_vs_ref", 0.2
1383
+ ),
1384
+ duplicate_detection_site_types=merged.get(
1385
+ "duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
1386
+ ),
1387
+ duplicate_detection_distance_threshold=merged.get(
1388
+ "duplicate_detection_distance_threshold", 0.07
1389
+ ),
1390
+ duplicate_detection_keep_best_metric=merged.get(
1391
+ "duplicate_detection_keep_best_metric", "read_quality"
1392
+ ),
1393
+ duplicate_detection_window_size_for_hamming_neighbors=merged.get(
1394
+ "duplicate_detection_window_size_for_hamming_neighbors", 50
1395
+ ),
1396
+ duplicate_detection_min_overlapping_positions=merged.get(
1397
+ "duplicate_detection_min_overlapping_positions", 20
1398
+ ),
1399
+ duplicate_detection_do_hierarchical=merged.get(
1400
+ "duplicate_detection_do_hierarchical", True
1401
+ ),
1402
+ duplicate_detection_hierarchical_linkage=merged.get(
1403
+ "duplicate_detection_hierarchical_linkage", "average"
1404
+ ),
1405
+ duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
1406
+ position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
1407
+ correlation_matrix_types=merged.get(
1408
+ "correlation_matrix_types", ["pearson", "binary_covariance"]
1409
+ ),
1410
+ correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1411
+ correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
1412
+ hamming_vs_metric_keys=merged.get(
1413
+ "hamming_vs_metric_keys", ["Fraction_C_site_modified"]
1414
+ ),
1415
+ force_redo_load_adata=merged.get("force_redo_load_adata", False),
1416
+ force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
1417
+ force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
1418
+ bypass_add_read_length_and_mapping_qc=merged.get(
1419
+ "bypass_add_read_length_and_mapping_qc", False
1420
+ ),
1421
+ force_redo_add_read_length_and_mapping_qc=merged.get(
1422
+ "force_redo_add_read_length_and_mapping_qc", False
1423
+ ),
1424
+ bypass_clean_nan=merged.get("bypass_clean_nan", False),
1425
+ force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
1426
+ bypass_append_base_context=merged.get("bypass_append_base_context", False),
1427
+ force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
1428
+ invert_adata=merged.get("invert_adata", False),
1429
+ bypass_append_binary_layer_by_base_context=merged.get(
1430
+ "bypass_append_binary_layer_by_base_context", False
1431
+ ),
1432
+ force_redo_append_binary_layer_by_base_context=merged.get(
1433
+ "force_redo_append_binary_layer_by_base_context", False
1434
+ ),
1435
+ bypass_calculate_read_modification_stats=merged.get(
1436
+ "bypass_calculate_read_modification_stats", False
1437
+ ),
1438
+ force_redo_calculate_read_modification_stats=merged.get(
1439
+ "force_redo_calculate_read_modification_stats", False
1440
+ ),
1441
+ bypass_filter_reads_on_modification_thresholds=merged.get(
1442
+ "bypass_filter_reads_on_modification_thresholds", False
1443
+ ),
1444
+ force_redo_filter_reads_on_modification_thresholds=merged.get(
1445
+ "force_redo_filter_reads_on_modification_thresholds", False
1446
+ ),
1447
+ bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
1448
+ force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
1449
+ bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
1450
+ force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
1451
+ force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
1452
+ bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
1453
+ force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
1454
+ bypass_basic_umap=merged.get("bypass_basic_umap", False),
1455
+ force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
1456
+ bypass_spatial_autocorr_calculations=merged.get(
1457
+ "bypass_spatial_autocorr_calculations", False
1458
+ ),
1459
+ force_redo_spatial_autocorr_calculations=merged.get(
1460
+ "force_redo_spatial_autocorr_calculations", False
1461
+ ),
1462
+ bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
1463
+ force_redo_spatial_autocorr_plotting=merged.get(
1464
+ "force_redo_spatial_autocorr_plotting", False
1465
+ ),
1466
+ bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
1467
+ force_redo_matrix_corr_calculations=merged.get(
1468
+ "force_redo_matrix_corr_calculations", False
1469
+ ),
1470
+ bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
1471
+ force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
1472
+ bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
1473
+ force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
1474
+ bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
1475
+ force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
1476
+ config_source=config_source or "<var_dict>",
1268
1477
  )
1269
1478
 
1270
1479
  report = {
@@ -1291,9 +1500,20 @@ class ExperimentConfig:
1291
1500
  Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
1292
1501
  Additional kwargs passed to from_var_dict().
1293
1502
  """
1294
- loader = LoadExperimentConfig(csv_input) if not isinstance(csv_input, pd.DataFrame) else LoadExperimentConfig(pd.DataFrame(csv_input))
1503
+ loader = (
1504
+ LoadExperimentConfig(csv_input)
1505
+ if not isinstance(csv_input, pd.DataFrame)
1506
+ else LoadExperimentConfig(pd.DataFrame(csv_input))
1507
+ )
1295
1508
  var_dict = loader.var_dict
1296
- return cls.from_var_dict(var_dict, date_str=date_str, config_source=config_source, defaults_dir=defaults_dir, defaults_map=defaults_map, **kwargs)
1509
+ return cls.from_var_dict(
1510
+ var_dict,
1511
+ date_str=date_str,
1512
+ config_source=config_source,
1513
+ defaults_dir=defaults_dir,
1514
+ defaults_map=defaults_map,
1515
+ **kwargs,
1516
+ )
1297
1517
 
1298
1518
  # -------------------------
1299
1519
  # validation & serialization
@@ -1306,7 +1526,9 @@ class ExperimentConfig:
1306
1526
  return errs
1307
1527
  for g, info in hfs.items():
1308
1528
  if not isinstance(info, dict):
1309
- errs.append(f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'.")
1529
+ errs.append(
1530
+ f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
1531
+ )
1310
1532
  continue
1311
1533
  feats = info.get("features")
1312
1534
  if not isinstance(feats, dict) or len(feats) == 0:
@@ -1316,7 +1538,9 @@ class ExperimentConfig:
1316
1538
  try:
1317
1539
  lo, hi = float(rng[0]), float(rng[1])
1318
1540
  if lo < 0 or hi <= lo:
1319
- errs.append(f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}.")
1541
+ errs.append(
1542
+ f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
1543
+ )
1320
1544
  except Exception:
1321
1545
  errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
1322
1546
  return errs
@@ -1349,13 +1573,18 @@ class ExperimentConfig:
1349
1573
 
1350
1574
  if not (0.0 <= float(self.mapping_threshold) <= 1.0):
1351
1575
  errors.append("mapping_threshold must be in [0,1].")
1352
- for t in (self.filter_threshold, self.m6A_threshold, self.m5C_threshold, self.hm5C_threshold):
1576
+ for t in (
1577
+ self.filter_threshold,
1578
+ self.m6A_threshold,
1579
+ self.m5C_threshold,
1580
+ self.hm5C_threshold,
1581
+ ):
1353
1582
  if not (0.0 <= float(t) <= 1.0):
1354
1583
  errors.append(f"threshold value {t} must be in [0,1].")
1355
1584
 
1356
1585
  if raise_on_error and errors:
1357
1586
  raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
1358
-
1587
+
1359
1588
  errs = _validate_hmm_features_structure(self.hmm_feature_sets)
1360
1589
  errors.extend(errs)
1361
1590