smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1288 @@
+ # experiment_config.py
+ from __future__ import annotations
+ import ast
+ import json
+ import warnings
+ from dataclasses import dataclass, field, asdict
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
+ from .discover_input_files import discover_input_files
+
+ # Optional dependency for YAML handling
+ try:
+     import yaml
+ except Exception:
+     yaml = None
+
+ import pandas as pd
+ import numpy as np
+
+
+ # -------------------------
+ # Utility parsing functions
+ # -------------------------
+ def _parse_bool(v: Any) -> bool:
+     if isinstance(v, bool):
+         return v
+     if v is None:
+         return False
+     s = str(v).strip().lower()
+     if s in ("1", "true", "t", "yes", "y", "on"):
+         return True
+     if s in ("0", "false", "f", "no", "n", "off", ""):
+         return False
+     try:
+         return float(s) != 0.0
+     except Exception:
+         return False
+
+
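A minimal sketch of how _parse_bool above resolves loosely-typed config values (hypothetical inputs, not part of the diff):

    assert _parse_bool("Yes") is True          # case-insensitive truthy token
    assert _parse_bool("off") is False         # falsy token
    assert _parse_bool("2.5") is True          # numeric fallback: float("2.5") != 0.0
    assert _parse_bool("not-a-bool") is False  # unparseable -> False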
+ def _parse_list(v: Any) -> List:
+     if v is None:
+         return []
+     if isinstance(v, (list, tuple)):
+         return list(v)
+     s = str(v).strip()
+     if s == "" or s.lower() == "none":
+         return []
+     # try JSON
+     try:
+         parsed = json.loads(s)
+         if isinstance(parsed, list):
+             return parsed
+     except Exception:
+         pass
+     # try python literal eval
+     try:
+         lit = ast.literal_eval(s)
+         if isinstance(lit, (list, tuple)):
+             return list(lit)
+     except Exception:
+         pass
+     # fallback comma separated
+     s2 = s.strip("[]() ")
+     parts = [p.strip() for p in s2.split(",") if p.strip() != ""]
+     return parts
+
+
+ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
+     if v is None:
+         return fallback
+     if isinstance(v, (int, float)):
+         return v
+     s = str(v).strip()
+     if s == "" or s.lower() == "none":
+         return fallback
+     try:
+         return int(s)
+     except Exception:
+         try:
+             return float(s)
+         except Exception:
+             return fallback
+
+ def _try_json_or_literal(s: Any) -> Any:
+     """Try to parse JSON or a Python literal; otherwise return the original string."""
+     if s is None:
+         return None
+     if not isinstance(s, str):
+         return s
+     s0 = s.strip()
+     if s0 == "":
+         return None
+     # try json
+     try:
+         return json.loads(s0)
+     except Exception:
+         pass
+     # try python literal
+     try:
+         return ast.literal_eval(s0)
+     except Exception:
+         pass
+     return s
+
+
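The three parsers above fall through the same ladder: JSON, then ast.literal_eval, then a plain-text fallback. A minimal sketch (hypothetical inputs):

    _parse_list('["a", "b"]')          # JSON            -> ['a', 'b']
    _parse_list("('x', 'y')")          # Python literal  -> ['x', 'y']
    _parse_list("top, bottom")         # comma fallback  -> ['top', 'bottom']
    _parse_numeric("7")                # int tried first -> 7
    _parse_numeric("0.7", fallback=0)  # then float      -> 0.7
    _try_json_or_literal("{'a': 1}")   # literal_eval rescues non-JSON dicts -> {'a': 1}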
+ def resolve_aligner_args(
+     merged: dict,
+     default_by_aligner: Optional[Dict[str, List[str]]] = None,
+     aligner_synonyms: Optional[Dict[str, str]] = None,
+ ) -> List[str]:
+     """
+     Resolve merged['aligner_args'] into a concrete list for the chosen aligner and sequencer.
+
+     Behavior (search order):
+       1. If aligner_args is a dict, try keys in this order (case-insensitive):
+          a) "<aligner>@<sequencer>" (top-level combined key)
+          b) aligner -> (if dict) sequencer (nested) -> 'default' fallback
+          c) aligner -> (if list) use that list
+          d) top-level 'default' key in aligner_args dict
+       2. If aligner_args is a list -> return it (applies to any aligner/sequencer).
+       3. If aligner_args is a string -> try to parse it as JSON/literal, else return a single-element list.
+       4. Otherwise fall back to builtin defaults per aligner.
+     """
+     # builtin defaults (aligner -> args)
+     builtin_defaults = {
+         "minimap2": ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
+         "dorado": ['--mm2-opts', '-N', '5'],
+     }
+     if default_by_aligner is None:
+         default_by_aligner = builtin_defaults
+
+     # synonyms mapping
+     synonyms = {"mm2": "minimap2", "minimap": "minimap2", "minimap-2": "minimap2"}
+     if aligner_synonyms:
+         synonyms.update(aligner_synonyms)
+
+     # canonicalize requested aligner and sequencer
+     raw_aligner = merged.get("aligner", "minimap2") or "minimap2"
+     raw_sequencer = merged.get("sequencer", None)  # e.g. 'ont', 'pacbio', 'illumina'
+     key_align = str(raw_aligner).strip().lower()
+     key_seq = None if raw_sequencer is None else str(raw_sequencer).strip().lower()
+     if key_align in synonyms:
+         key_align = synonyms[key_align]
+
+     raw = merged.get("aligner_args", None)
+
+     # helper to coerce a candidate to list[str]
+     def _coerce_to_list(val):
+         if isinstance(val, (list, tuple)):
+             return [str(x) for x in val]
+         if isinstance(val, str):
+             parsed = _try_json_or_literal(val)
+             if isinstance(parsed, (list, tuple)):
+                 return [str(x) for x in parsed]
+             return [str(parsed)]
+         if val is None:
+             return None
+         return [str(val)]
+
+     # If dict, do layered lookups
+     if isinstance(raw, dict):
+         # case-insensitive dict
+         top_map = {str(k).lower(): v for k, v in raw.items()}
+
+         # 1) try combined top-level key "aligner@sequencer"
+         if key_seq:
+             combined_key = f"{key_align}@{key_seq}"
+             if combined_key in top_map:
+                 res = _coerce_to_list(top_map[combined_key])
+                 if res:
+                     return res
+
+         # 2) try aligner key
+         if key_align in top_map:
+             val = top_map[key_align]
+             # if nested dict: try sequencer key then 'default'
+             if isinstance(val, dict):
+                 submap = {str(k).lower(): v for k, v in val.items()}
+                 if key_seq and key_seq in submap:
+                     res = _coerce_to_list(submap[key_seq])
+                     if res:
+                         return res
+                 if "default" in submap:
+                     res = _coerce_to_list(submap["default"])
+                     if res:
+                         return res
+                 # nothing matched inside aligner->dict; fall back to top-level aligner (no sequencer)
+             else:
+                 # aligner maps to list/str: use it
+                 res = _coerce_to_list(val)
+                 if res:
+                     return res
+
+         # 3) try top-level 'default' key inside aligner_args mapping
+         if "default" in top_map:
+             res = _coerce_to_list(top_map["default"])
+             if res:
+                 return res
+
+         # 4) last top-level attempt: any key equal to aligner synonyms etc (already handled)
+         # fallthrough to builtin
+     # If user provided a concrete list -> use it
+     if isinstance(raw, (list, tuple)):
+         return [str(x) for x in raw]
+
+     # If scalar string, attempt to parse
+     if isinstance(raw, str):
+         parsed = _try_json_or_literal(raw)
+         if isinstance(parsed, (list, tuple)):
+             return [str(x) for x in parsed]
+         return [str(parsed)]
+
+     # Nothing found -> fallback builtin default
+     return list(default_by_aligner.get(key_align, []))
+
+
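A minimal sketch of the layered lookup, assuming a nested aligner_args mapping (hypothetical config values):

    merged = {
        "aligner": "mm2",      # synonym, canonicalized to "minimap2"
        "sequencer": "ont",
        "aligner_args": {
            "minimap2": {"ont": ["-a", "-x", "map-ont"], "default": ["-a"]},
        },
    }
    resolve_aligner_args(merged)   # -> ['-a', '-x', 'map-ont']  (aligner -> sequencer)
    merged["sequencer"] = "pacbio"
    resolve_aligner_args(merged)   # -> ['-a']  (nested 'default' fallback)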
+ # HMM default params and helper functions
+ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
+     """
+     Normalize user-provided `hmm_feature_sets` into canonical structure:
+       { group_name: {"features": {label: (lo, hi), ...}, "state": "<Modified|Non-Modified>"} }
+     Accepts dict, JSON/string, None. Returns {} for empty input.
+     """
+     if raw is None:
+         return {}
+     parsed = raw
+     if isinstance(raw, str):
+         parsed = _try_json_or_literal(raw)
+     if not isinstance(parsed, dict):
+         return {}
+
+     def _coerce_bound(x):
+         if x is None:
+             return None
+         if isinstance(x, (int, float)):
+             return float(x)
+         s = str(x).strip().lower()
+         if s in ("inf", "infty", "infinite"):
+             return np.inf
+         if s in ("none", ""):
+             return None
+         try:
+             return float(x)
+         except Exception:
+             return None
+
+     def _coerce_feature_map(feats):
+         out = {}
+         if not isinstance(feats, dict):
+             return out
+         for fname, rng in feats.items():
+             if rng is None:
+                 out[fname] = (0.0, np.inf)
+                 continue
+             if isinstance(rng, (list, tuple)) and len(rng) >= 2:
+                 lo = _coerce_bound(rng[0]) or 0.0
+                 hi = _coerce_bound(rng[1])
+                 if hi is None:
+                     hi = np.inf
+                 out[fname] = (float(lo), float(hi) if not np.isinf(hi) else np.inf)
+             else:
+                 # scalar -> treat as upper bound
+                 val = _coerce_bound(rng)
+                 out[fname] = (0.0, float(val) if val is not None else np.inf)
+         return out
+
+     canonical = {}
+     for grp, info in parsed.items():
+         if not isinstance(info, dict):
+             feats = _coerce_feature_map(info)
+             canonical[grp] = {"features": feats, "state": "Modified"}
+             continue
+         feats = _coerce_feature_map(info.get("features", info.get("ranges", {})))
+         state = info.get("state", info.get("label", "Modified"))
+         canonical[grp] = {"features": feats, "state": state}
+     return canonical
+
+
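A minimal sketch of the canonicalization, covering the list, null-bound, and scalar shorthands (hypothetical input):

    raw = {
        "footprint": {"features": {"small": [10, 80], "large": [80, None]}, "state": "Non-Modified"},
        "accessible": {"features": {"patch": 120}},  # scalar -> upper bound
    }
    normalize_hmm_feature_sets(raw)
    # -> {'footprint':  {'features': {'small': (10.0, 80.0), 'large': (80.0, inf)}, 'state': 'Non-Modified'},
    #     'accessible': {'features': {'patch': (0.0, 120.0)}, 'state': 'Modified'}}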
+ # -------------------------
+ # LoadExperimentConfig
+ # -------------------------
+ class LoadExperimentConfig:
+     """
+     Load an experiment CSV (or DataFrame / file-like) into a typed var_dict.
+
+     CSV expected columns: 'variable', 'value', optional 'type'.
+     If 'type' is missing, the loader will infer the type.
+
+     Example
+     -------
+     loader = LoadExperimentConfig("experiment_config.csv")
+     var_dict = loader.var_dict
+     """
+
+     def __init__(self, experiment_config: Union[str, Path, IO, pd.DataFrame]):
+         self.source = experiment_config
+         self.df = self._load_df(experiment_config)
+         self.var_dict = self._parse_df(self.df)
+
+     @staticmethod
+     def _load_df(source: Union[str, Path, IO, pd.DataFrame]) -> pd.DataFrame:
+         """Load a pandas DataFrame from a path or file-like object, or accept an existing DataFrame."""
+         if isinstance(source, pd.DataFrame):
+             df = source.copy()
+         else:
+             if isinstance(source, (str, Path)):
+                 p = Path(source)
+                 if not p.exists():
+                     raise FileNotFoundError(f"Config file not found: {source}")
+                 df = pd.read_csv(p, dtype=str, keep_default_na=False, na_values=[""])
+             else:
+                 # file-like
+                 df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
+         # normalize column names
+         df.columns = [c.strip() for c in df.columns]
+         if 'variable' not in df.columns:
+             raise ValueError("Config CSV must contain a 'variable' column.")
+         if 'value' not in df.columns:
+             df['value'] = ''
+         if 'type' not in df.columns:
+             df['type'] = ''
+         return df
+
+     @staticmethod
+     def _parse_value_as_type(value_str: Optional[str], dtype_hint: Optional[str]) -> Any:
+         """
+         Parse a single value string into a Python object guided by dtype_hint (or infer).
+         Supports int, float, bool, list, JSON, Python literal, or string.
+         """
+         if value_str is None:
+             return None
+         v = str(value_str).strip()
+         if v == "" or v.lower() == "none":
+             return None
+
+         hint = "" if dtype_hint is None else str(dtype_hint).strip().lower()
+
+         def parse_bool(s: str):
+             s2 = s.strip().lower()
+             if s2 in ('1', 'true', 't', 'yes', 'y', 'on'):
+                 return True
+             if s2 in ('0', 'false', 'f', 'no', 'n', 'off'):
+                 return False
+             raise ValueError(f"Cannot parse boolean from '{s}'")
+
+         def parse_list_like(s: str):
+             # try JSON first
+             try:
+                 val = json.loads(s)
+                 if isinstance(val, list):
+                     return val
+             except Exception:
+                 pass
+             # try python literal
+             try:
+                 val = ast.literal_eval(s)
+                 if isinstance(val, (list, tuple)):
+                     return list(val)
+             except Exception:
+                 pass
+             # fallback split
+             parts = [p.strip() for p in s.strip("()[] ").split(',') if p.strip() != ""]
+             return parts
+
+         if hint in ('int', 'integer'):
+             return int(v)
+         if hint in ('float', 'double'):
+             return float(v)
+         if hint in ('bool', 'boolean'):
+             return parse_bool(v)
+         if hint in ('list', 'array'):
+             return parse_list_like(v)
+         if hint in ('string', 'str'):
+             return v
+
+         # infer
+         try:
+             return int(v)
+         except Exception:
+             pass
+         try:
+             return float(v)
+         except Exception:
+             pass
+         try:
+             return parse_bool(v)
+         except Exception:
+             pass
+         try:
+             j = json.loads(v)
+             return j
+         except Exception:
+             pass
+         try:
+             lit = ast.literal_eval(v)
+             return lit
+         except Exception:
+             pass
+         if (',' in v) and (not any(ch in v for ch in '{}[]()')):
+             return [p.strip() for p in v.split(',') if p.strip() != ""]
+         return v
+
+     def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
+         parsed: Dict[str, Any] = {}
+         for idx, row in df.iterrows():
+             name = str(row['variable']).strip()
+             if name == "":
+                 continue
+             raw_val = row.get('value', "")
+             raw_type = row.get('type', "")
+             if pd.isna(raw_val) or str(raw_val).strip() == "":
+                 raw_val = None
+             try:
+                 parsed_val = self._parse_value_as_type(raw_val, raw_type)
+             except Exception as e:
+                 warnings.warn(f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value.")
+                 parsed_val = None if raw_val is None else raw_val
+             if name in parsed:
+                 warnings.warn(f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value.")
+             parsed[name] = parsed_val
+         return parsed
+
+     def to_dataframe(self) -> pd.DataFrame:
+         """Return the parsed config as a pandas DataFrame (variable, value)."""
+         rows = []
+         for k, v in self.var_dict.items():
+             rows.append({'variable': k, 'value': v})
+         return pd.DataFrame(rows)
+
+
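A minimal usage sketch, feeding the loader an in-memory CSV instead of a file (hypothetical variables):

    import io
    csv = io.StringIO(
        "variable,value,type\n"
        "threads,8,int\n"
        "strands,\"['top', 'bottom']\",list\n"
        "trim,false,bool\n"
    )
    LoadExperimentConfig(csv).var_dict
    # -> {'threads': 8, 'strands': ['top', 'bottom'], 'trim': False}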
+ # -------------------------
+ # deep merge & defaults loader (with inheritance)
+ # -------------------------
+ def deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Recursively merge two dicts: returns new dict = a merged with b, where b overrides.
+     If both values are dicts -> merge recursively; else b replaces a.
+     """
+     out = dict(a or {})
+     for k, v in (b or {}).items():
+         if k in out and isinstance(out[k], dict) and isinstance(v, dict):
+             out[k] = deep_merge(out[k], v)
+         else:
+             out[k] = v
+     return out
+
+
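A minimal sketch of the recursive override (hypothetical dicts):

    a = {"hmm": {"n_states": 2, "eps": 1e-8}, "aligner": "minimap2"}
    b = {"hmm": {"n_states": 3}, "threads": 8}
    deep_merge(a, b)
    # -> {'hmm': {'n_states': 3, 'eps': 1e-8}, 'aligner': 'minimap2', 'threads': 8}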
+ def _load_defaults_file(path: Path) -> Dict[str, Any]:
+     if not path.exists():
+         return {}
+     text = path.read_text(encoding="utf8")
+     suffix = path.suffix.lower()
+     if suffix in (".yaml", ".yml"):
+         if yaml is None:
+             raise RuntimeError("PyYAML required to load YAML defaults (pip install pyyaml).")
+         return yaml.safe_load(text) or {}
+     elif suffix == ".json":
+         return json.loads(text or "{}")
+     else:
+         # try json then yaml if available
+         try:
+             return json.loads(text)
+         except Exception:
+             if yaml is not None:
+                 return yaml.safe_load(text) or {}
+             raise RuntimeError(f"Unknown defaults file type for {path}; provide JSON or YAML.")
+
+
+ def load_defaults_with_inheritance(
+     defaults_dir: Union[str, Path],
+     modality: Optional[str],
+     *,
+     default_basename: str = "default",
+     allowed_exts: Tuple[str, ...] = (".yaml", ".yml", ".json"),
+     debug: bool = False,
+ ) -> Tuple[Dict[str, Any], List[str]]:
+     """
+     Strict loader: only loads default + modality + any explicit 'extends' chain.
+
+     - defaults_dir: directory containing defaults files.
+     - modality: name of modality (e.g. "GpC"). We look for <modality>.<ext> in defaults_dir.
+     - default_basename: name of the fallback default file (without extension).
+     - allowed_exts: allowed extensions to try.
+     - debug: if True, prints what was loaded.
+
+     Returns (merged_defaults_dict, load_order_list), where load_order_list lists the resolved file paths that were read.
+     """
+     pdir = Path(defaults_dir) if defaults_dir is not None else None
+     if pdir is None or not pdir.exists():
+         return {}, []
+
+     # Resolve a "name" to a file in defaults_dir.
+     # Only treat `name` as an explicit path if it contains a path separator or is absolute.
+     def resolve_name_to_path(name: str) -> Optional[Path]:
+         n = str(name).strip()
+         if n == "":
+             return None
+         cand = Path(n)
+         # If user provided a path-like string (contains slash/backslash or absolute), allow it
+         if cand.is_absolute() or ("/" in n) or ("\\" in n):
+             if cand.exists() and cand.suffix.lower() in allowed_exts:
+                 return cand.resolve()
+             return None
+         # Otherwise only look inside defaults_dir for name + ext (do NOT treat bare name as arbitrary file)
+         for ext in allowed_exts:
+             p = pdir / f"{n}{ext}"
+             if p.exists():
+                 return p.resolve()
+         return None
+
+     visited = set()
+     load_order: List[str] = []
+
+     def _rec_load(name_or_path: Union[str, Path]) -> Dict[str, Any]:
+         # Resolve to a file path (strict)
+         if isinstance(name_or_path, Path):
+             p = name_or_path
+         else:
+             p = resolve_name_to_path(str(name_or_path))
+             if p is None:
+                 if debug:
+                     print(f"[defaults loader] resolve failed for '{name_or_path}'")
+                 return {}
+         p = Path(p).resolve()
+         p_str = str(p)
+         if p_str in visited:
+             if debug:
+                 print(f"[defaults loader] already visited {p_str} (skipping to avoid cycle)")
+             return {}
+         visited.add(p_str)
+
+         data = _load_defaults_file(p)  # reuse the existing helper
+         if not isinstance(data, dict):
+             if debug:
+                 print(f"[defaults loader] file {p_str} did not produce a dict -> ignoring")
+             data = {}
+
+         # Extract any extends/inherits keys (string or list). They reference other named default files.
+         bases = []
+         for key in ("extends", "inherits", "base"):
+             if key in data:
+                 b = data.pop(key)
+                 if isinstance(b, (list, tuple)):
+                     bases = list(b)
+                 elif isinstance(b, str):
+                     bases = [b]
+                 break
+
+         merged = {}
+         # Load bases first (in order); bases are resolved relative to defaults_dir unless given as a path
+         for base_name in bases:
+             base_defaults = _rec_load(base_name)
+             merged = deep_merge(merged, base_defaults)
+
+         # Then merge this file's data (this file overrides its bases)
+         merged = deep_merge(merged, data)
+         load_order.append(p_str)
+         if debug:
+             print(f"[defaults loader] loaded {p_str}")
+         return merged
+
+     merged_defaults = {}
+     # Load default.* first if present
+     def_path = resolve_name_to_path(default_basename)
+     if def_path is not None:
+         merged_defaults = deep_merge(merged_defaults, _rec_load(def_path))
+
+     # Load modality.* if present (modality overrides default)
+     if modality:
+         mod_path = resolve_name_to_path(modality)
+         if mod_path is not None:
+             merged_defaults = deep_merge(merged_defaults, _rec_load(mod_path))
+         else:
+             if debug:
+                 print(f"[defaults loader] no modality file found for '{modality}' in {pdir}")
+
+     if debug:
+         print("[defaults loader] final load order:", load_order)
+     return merged_defaults, load_order
+
+
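A minimal sketch of the strict lookup against a defaults directory shaped like the packaged smftools/config/ above (default.yaml plus per-modality files such as conversion.yaml; file contents hypothetical):

    # default.yaml:     threads: 4
    #                   hmm_n_states: 2
    # conversion.yaml:  extends: default
    #                   hmm_n_states: 3
    merged, order = load_defaults_with_inheritance("smftools/config", "conversion")
    # merged -> {'threads': 4, 'hmm_n_states': 3}   (modality overrides default)
    # order  -> resolved paths of default.yaml, then conversion.yaml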
+ # -------------------------
+ # ExperimentConfig dataclass
+ # -------------------------
+ @dataclass
+ class ExperimentConfig:
+     # Compute
+     threads: Optional[int] = None
+     device: str = "auto"
+
+     # General I/O
+     input_data_path: Optional[str] = None
+     output_directory: Optional[str] = None
+     fasta: Optional[str] = None
+     bam_suffix: str = ".bam"
+     recursive_input_search: bool = True
+     input_type: Optional[str] = None
+     input_files: Optional[List[Path]] = None
+     split_dir: str = "demultiplexed_BAMs"
+     split_path: Optional[str] = None
+     strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
+     conversions: List[str] = field(default_factory=lambda: ["unconverted"])
+     fasta_regions_of_interest: Optional[str] = None
+     sample_sheet_path: Optional[str] = None
+     sample_sheet_mapping_column: Optional[str] = 'Barcode'
+     experiment_name: Optional[str] = None
+     input_already_demuxed: bool = False
+     summary_file: Optional[Path] = None
+
+     # FASTQ input specific
+     fastq_barcode_map: Optional[Dict[str, str]] = None
+     fastq_auto_pairing: bool = True
+
+     # Remove intermediate file options
+     delete_intermediate_bams: bool = True
+     delete_intermediate_tsvs: bool = True
+
+     # Conversion/Deamination file handling
+     delete_intermediate_hdfs: bool = True
+
+     # Direct SMF specific params for initial AnnData loading
+     batch_size: int = 4
+     skip_unclassified: bool = True
+     delete_batch_hdfs: bool = True
+
+     # Sequencing modality and general experiment params
+     smf_modality: Optional[str] = None
+     sequencer: Optional[str] = None
+
+     # Enzyme / mod targets
+     mod_target_bases: List[str] = field(default_factory=lambda: ["GpC", "CpG"])
+     enzyme_target_bases: List[str] = field(default_factory=lambda: ["GpC"])
+
+     # Conversion/deamination
+     conversion_types: List[str] = field(default_factory=lambda: ["5mC"])
+
+     # Nanopore specific for basecalling and demultiplexing
+     model_dir: Optional[str] = None
+     barcode_kit: Optional[str] = None
+     model: str = "hac"
+     barcode_both_ends: bool = False
+     trim: bool = False
+     # General basecalling params
+     filter_threshold: float = 0.8
+     # Modified basecalling specific params
+     m6A_threshold: float = 0.7
+     m5C_threshold: float = 0.7
+     hm5C_threshold: float = 0.7
+     thresholds: List[float] = field(default_factory=list)
+     mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
+
+     # Alignment params
+     mapping_threshold: float = 0.01  # Min fraction of a sample's reads that must map to a reference for that reference to be included in the AnnData
+     aligner: str = "minimap2"
+     aligner_args: Optional[List[str]] = None
+     make_bigwigs: bool = False
+     make_beds: bool = False
+
+     # AnnData structure
+     reference_column: Optional[str] = 'Reference_strand'
+     sample_column: Optional[str] = 'Barcode'
+
+     # General Plotting
+     sample_name_col_for_plotting: Optional[str] = 'Barcode'
+     rows_per_qc_histogram_grid: int = 12
+
+     # Preprocessing - Read length and quality filter params
+     read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
+     read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
+     read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
+     read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
+     read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
+
+     # Preprocessing - Direct mod detection binarization params
+     fit_position_methylation_thresholds: Optional[bool] = False  # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
+     binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7  # The fixed threshold used to binarize the AnnData when the fitting parameter above is False.
+     positive_control_sample_methylation_fitting: Optional[str] = None  # A positive control Sample_name to use for fully modified template data
+     negative_control_sample_methylation_fitting: Optional[str] = None  # A negative control Sample_name to use for fully unmodified template data
+     infer_on_percentile_sample_methylation_fitting: Optional[int] = 10  # If positive/negative controls are not provided and fitting is requested, use the indicated percentile windows from the top and bottom of the dataset.
+     inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal"  # The obs column used for the percentile metric above.
+     fit_j_threshold: Optional[float] = 0.5  # The J-statistic threshold for determining which positions pass QC for mod detection thresholding
+     output_binary_layer_name: Optional[str] = "binarized_methylation"
+
+     # Preprocessing - Read modification filter params
+     read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
+     read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
+     read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
+     read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
+     read_mod_filtering_use_other_c_as_background: bool = True
+     min_valid_fraction_positions_in_read_vs_ref: float = 0.2
+
+     # Preprocessing - Duplicate detection params
+     duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
+     duplicate_detection_distance_threshold: float = 0.07
+     hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
+     duplicate_detection_keep_best_metric: str = 'read_quality'
+     duplicate_detection_window_size_for_hamming_neighbors: int = 50
+     duplicate_detection_min_overlapping_positions: int = 20
+     duplicate_detection_do_hierarchical: bool = True
+     duplicate_detection_hierarchical_linkage: str = "average"
+     duplicate_detection_do_pca: bool = False
+
+     # Preprocessing - Position QC
+     position_max_nan_threshold: float = 0.1
+
+     # Basic Analysis - Clustermap params
+     layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
+
+     # Basic Analysis - UMAP/Leiden params
+     layer_for_umap_plotting: Optional[str] = 'nan_half'
+     umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
+
+     # Basic Analysis - Spatial Autocorrelation params
+     rows_per_qc_autocorr_grid: int = 12
+     autocorr_rolling_window_size: int = 25
+     autocorr_max_lag: int = 800
+     autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
+
+     # Basic Analysis - Correlation Matrix params
+     correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
+     correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
+     correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
+
+     # HMM params
+     hmm_n_states: int = 2
+     hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
+     hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
+     hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
+     hmm_eps: float = 1e-8
+     hmm_dtype: str = "float64"
+     hmm_annotation_threshold: float = 0.5
+     hmm_batch_size: int = 1024
+     hmm_use_viterbi: bool = False
+     hmm_device: Optional[str] = None
+     hmm_methbases: Optional[List[str]] = None  # if None, HMM.annotate_adata will fall back to mod_target_bases
+     footprints: Optional[bool] = True
+     accessible_patches: Optional[bool] = True
+     cpg: Optional[bool] = False
+     hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
+     hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
+
+     # Pipeline control flow - load adata
+     force_redo_load_adata: bool = False
+
+     # Pipeline control flow - preprocessing and QC
+     force_redo_preprocessing: bool = False
+     force_reload_sample_sheet: bool = True
+     bypass_add_read_length_and_mapping_qc: bool = False
+     force_redo_add_read_length_and_mapping_qc: bool = False
+     bypass_clean_nan: bool = False
+     force_redo_clean_nan: bool = False
+     bypass_append_base_context: bool = False
+     force_redo_append_base_context: bool = False
+     invert_adata: bool = False
+     bypass_append_binary_layer_by_base_context: bool = False
+     force_redo_append_binary_layer_by_base_context: bool = False
+     bypass_calculate_read_modification_stats: bool = False
+     force_redo_calculate_read_modification_stats: bool = False
+     bypass_filter_reads_on_modification_thresholds: bool = False
+     force_redo_filter_reads_on_modification_thresholds: bool = False
+     bypass_flag_duplicate_reads: bool = False
+     force_redo_flag_duplicate_reads: bool = False
+     bypass_complexity_analysis: bool = False
+     force_redo_complexity_analysis: bool = False
+
+     # Pipeline control flow - Basic Analyses
+     force_redo_basic_analyses: bool = False
+     bypass_basic_clustermaps: bool = False
+     force_redo_basic_clustermaps: bool = False
+     bypass_basic_umap: bool = False
+     force_redo_basic_umap: bool = False
+     bypass_spatial_autocorr_calculations: bool = False
+     force_redo_spatial_autocorr_calculations: bool = False
+     bypass_spatial_autocorr_plotting: bool = False
+     force_redo_spatial_autocorr_plotting: bool = False
+     bypass_matrix_corr_calculations: bool = False
+     force_redo_matrix_corr_calculations: bool = False
+     bypass_matrix_corr_plotting: bool = False
+     force_redo_matrix_corr_plotting: bool = False
+
+     # Pipeline control flow - HMM Analyses
+     bypass_hmm_fit: bool = False
+     force_redo_hmm_fit: bool = False
+     bypass_hmm_apply: bool = False
+     force_redo_hmm_apply: bool = False
+
+     # metadata
+     config_source: Optional[str] = None
+
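Every field above carries a default, so the dataclass can also be constructed directly for quick experiments, bypassing the CSV/defaults machinery (values hypothetical):

    cfg = ExperimentConfig(smf_modality="conversion", threads=8, output_directory="out")
    cfg.aligner                  # -> 'minimap2'
    cfg.mod_target_bases         # -> ['GpC', 'CpG']
    asdict(cfg)["hmm_n_states"]  # plain dataclass, so dataclasses.asdict works -> 2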
+     # -------------------------
+     # Construction helpers
+     # -------------------------
+     @classmethod
+     def from_var_dict(
+         cls,
+         var_dict: Optional[Dict[str, Any]],
+         date_str: Optional[str] = None,
+         config_source: Optional[str] = None,
+         defaults_dir: Optional[Union[str, Path]] = None,
+         defaults_map: Optional[Dict[str, Dict[str, Any]]] = None,
+         merge_with_defaults: bool = True,
+         override_with_csv: bool = True,
+         allow_csv_extends: bool = True,
+         allow_null_override: bool = False,
+     ) -> Tuple["ExperimentConfig", Dict[str, Any]]:
+         """
+         Create an ExperimentConfig from a raw var_dict (as produced by LoadExperimentConfig).
+         Returns (instance, report) where report contains modality/defaults/merged info.
+
+         merge_with_defaults: load defaults from defaults_dir or defaults_map.
+         override_with_csv: CSV values override defaults; if False, defaults take precedence.
+         allow_csv_extends: allow the CSV to include 'extends' to pull in extra defaults files.
+         allow_null_override: if False, CSV keys with value None will NOT override defaults (keeps defaults).
+         """
+         var_dict = var_dict or {}
+
+         # 1) normalize incoming values
+         normalized: Dict[str, Any] = {}
+         for k, v in var_dict.items():
+             if v is None:
+                 normalized[k] = None
+                 continue
+             if isinstance(v, str):
+                 s = v.strip()
+                 if s == "" or s.lower() == "none":
+                     normalized[k] = None
+                 else:
+                     normalized[k] = _try_json_or_literal(s)
+             else:
+                 normalized[k] = v
+
+         modality = normalized.get("smf_modality")
+         if isinstance(modality, (list, tuple)) and len(modality) > 0:
+             modality = modality[0]
+
+         defaults_loaded = {}
+         defaults_source_chain: List[str] = []
+         if merge_with_defaults:
+             if defaults_map and modality in defaults_map:
+                 defaults_loaded = dict(defaults_map[modality] or {})
+                 defaults_source_chain = [f"defaults_map['{modality}']"]
+             elif defaults_dir is not None:
+                 defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(defaults_dir, modality)
+
+         # If the CSV asks to extend defaults, load those and merge
+         merged = dict(defaults_loaded or {})
+
+         if allow_csv_extends:
+             extends = normalized.get("extends") or normalized.get("inherits")
+             if extends:
+                 if isinstance(extends, str):
+                     ext_list = [extends]
+                 elif isinstance(extends, (list, tuple)):
+                     ext_list = list(extends)
+                 else:
+                     ext_list = []
+                 for ext in ext_list:
+                     ext_defaults, ext_sources = (load_defaults_with_inheritance(defaults_dir, ext) if defaults_dir else ({}, []))
+                     merged = deep_merge(merged, ext_defaults)
+                     for s in ext_sources:
+                         if s not in defaults_source_chain:
+                             defaults_source_chain.append(s)
+
+         # Now overlay CSV values
+         # Prepare csv_effective depending on allow_null_override
+         csv_effective = {}
+         for k, v in normalized.items():
+             if k in ("extends", "inherits"):
+                 continue
+             if v is None and not allow_null_override:
+                 # skip: keep default
+                 continue
+             csv_effective[k] = v
+
+         if override_with_csv:
+             merged = deep_merge(merged, csv_effective)
+         else:
+             # defaults take precedence: only set keys missing in merged
+             for k, v in csv_effective.items():
+                 if k not in merged:
+                     merged[k] = v
+
+         # experiment_name default
+         if merged.get("experiment_name") is None and date_str:
+             merged["experiment_name"] = f"{date_str}_SMF_experiment"
+
+         # Input file types and path handling
+         input_data_path = Path(merged['input_data_path'])
+
+         # Detect the input filetype
+         if input_data_path.is_file():
+             suffix = input_data_path.suffix.lower()
+             suffixes = [s.lower() for s in input_data_path.suffixes]  # handles multi-part extensions
+
+             # recognize multi-suffix cases like .fastq.gz or .fq.gz
+             if any(s in ['.pod5', '.p5'] for s in suffixes):
+                 input_type = "pod5"
+                 input_files = [Path(input_data_path)]
+             elif any(s in ['.fast5', '.f5'] for s in suffixes):
+                 input_type = "fast5"
+                 input_files = [Path(input_data_path)]
+             elif any(s in ['.fastq', '.fq'] for s in suffixes):
+                 input_type = "fastq"
+                 input_files = [Path(input_data_path)]
+             elif any(s in ['.bam'] for s in suffixes):
+                 input_type = "bam"
+                 input_files = [Path(input_data_path)]
+             elif any(s in ['.h5ad', '.h5'] for s in suffixes):
+                 input_type = "h5ad"
+                 input_files = [Path(input_data_path)]
+             else:
+                 print("Error detecting input file type")
+
+         elif input_data_path.is_dir():
+             found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
+
+             if found["input_is_pod5"]:
+                 input_type = "pod5"
+                 input_files = found["pod5_paths"]
+             elif found["input_is_fast5"]:
+                 input_type = "fast5"
+                 input_files = found["fast5_paths"]
+             elif found["input_is_fastq"]:
+                 input_type = "fastq"
+                 input_files = found["fastq_paths"]
+             elif found["input_is_bam"]:
+                 input_type = "bam"
+                 input_files = found["bam_paths"]
+             elif found["input_is_h5ad"]:
+                 input_type = "h5ad"
+                 input_files = found["h5ad_paths"]
+
+             print(f"Found {found['all_files_searched']} files; fastq={len(found['fastq_paths'])}, bam={len(found['bam_paths'])}, pod5={len(found['pod5_paths'])}, fast5={len(found['fast5_paths'])}, h5ad={len(found['h5ad_paths'])}")
+
+         # summary file output path
+         output_dir = Path(merged['output_directory'])
+         summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
+         summary_file = output_dir / summary_file_basename
+
+         # Demultiplexing output path
+         split_dir = merged.get("split_dir", "demultiplexed_BAMs")
+         split_path = output_dir / split_dir
+
+         # final normalization
+         if "strands" in merged:
+             merged["strands"] = _parse_list(merged["strands"])
+         if "conversions" in merged:
+             merged["conversions"] = _parse_list(merged["conversions"])
+         if "mod_target_bases" in merged:
+             merged["mod_target_bases"] = _parse_list(merged["mod_target_bases"])
+         if "conversion_types" in merged:
+             merged["conversion_types"] = _parse_list(merged["conversion_types"])
+
+         merged["filter_threshold"] = float(_parse_numeric(merged.get("filter_threshold", 0.8), 0.8))
+         merged["m6A_threshold"] = float(_parse_numeric(merged.get("m6A_threshold", 0.7), 0.7))
+         merged["m5C_threshold"] = float(_parse_numeric(merged.get("m5C_threshold", 0.7), 0.7))
+         merged["hm5C_threshold"] = float(_parse_numeric(merged.get("hm5C_threshold", 0.7), 0.7))
+         merged["thresholds"] = [
+             merged["filter_threshold"],
+             merged["m6A_threshold"],
+             merged["m5C_threshold"],
+             merged["hm5C_threshold"],
+         ]
+
+         for bkey in ("barcode_both_ends", "trim", "input_already_demuxed", "make_bigwigs", "skip_unclassified", "delete_batch_hdfs"):
+             if bkey in merged:
+                 merged[bkey] = _parse_bool(merged[bkey])
+
+         if "batch_size" in merged:
+             merged["batch_size"] = int(_parse_numeric(merged.get("batch_size", 4), 4))
+         if "threads" in merged:
+             tval = _parse_numeric(merged.get("threads", None), None)
+             merged["threads"] = None if tval is None else int(tval)
+
+         if "aligner_args" in merged and merged.get("aligner_args") is None:
+             merged.pop("aligner_args", None)
+
+         # --- Resolve aligner_args into a concrete list for the chosen aligner ---
+         merged['aligner_args'] = resolve_aligner_args(merged)
+
+         if "mod_list" in merged:
+             merged["mod_list"] = _parse_list(merged.get("mod_list"))
+
+         # HMM feature set handling
+         if "hmm_feature_sets" in merged:
+             merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
+         else:
+             # allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
+             maybe_fs = {}
+             if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
+                 maybe_fs["footprint"] = {"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")), "state": merged.get("hmm_footprint_state", "Non-Modified")}
+             if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
+                 maybe_fs["accessible"] = {"features": merged.get("hmm_accessible_ranges", merged.get("accessible_ranges")), "state": merged.get("hmm_accessible_state", "Modified")}
+             if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
+                 maybe_fs["cpg"] = {"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")), "state": merged.get("hmm_cpg_state", "Modified")}
+             if maybe_fs:
+                 merged.setdefault("hmm_feature_sets", {})
+                 for k, v in maybe_fs.items():
+                     merged["hmm_feature_sets"].setdefault(k, v)
+
+             # final normalization will be done below
+             # (do not set local hmm_feature_sets here — do it once below)
+             pass
+
+         # Final normalization of hmm_feature_sets and canonical local variables
+         merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged.get("hmm_feature_sets", {}))
+         hmm_feature_sets = merged.get("hmm_feature_sets", {})
+         hmm_annotation_threshold = merged.get("hmm_annotation_threshold", 0.5)
+         hmm_batch_size = int(merged.get("hmm_batch_size", 1024))
+         hmm_use_viterbi = bool(merged.get("hmm_use_viterbi", False))
+         hmm_device = merged.get("hmm_device", None)
+         hmm_methbases = _parse_list(merged.get("hmm_methbases", None))
+         if not hmm_methbases:  # None or []
+             hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
+         if not hmm_methbases:
+             hmm_methbases = ['C']
+         hmm_methbases = list(hmm_methbases)
+         hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
+
+         # instantiate dataclass
+         instance = cls(
+             smf_modality = merged.get("smf_modality"),
+             input_data_path = input_data_path,
+             recursive_input_search = merged.get("recursive_input_search"),
+             input_type = input_type,
+             input_files = input_files,
+             output_directory = output_dir,
+             summary_file = summary_file,
+             fasta = merged.get("fasta"),
+             sequencer = merged.get("sequencer"),
+             model_dir = merged.get("model_dir"),
+             barcode_kit = merged.get("barcode_kit"),
+             fastq_barcode_map = merged.get("fastq_barcode_map"),
+             fastq_auto_pairing = merged.get("fastq_auto_pairing"),
+             bam_suffix = merged.get("bam_suffix", ".bam"),
+             split_dir = split_dir,
+             split_path = split_path,
+             strands = merged.get("strands", ["bottom", "top"]),
+             conversions = merged.get("conversions", ["unconverted"]),
+             fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
+             mapping_threshold = float(merged.get("mapping_threshold", 0.01)),
+             experiment_name = merged.get("experiment_name"),
+             model = merged.get("model", "hac"),
+             barcode_both_ends = merged.get("barcode_both_ends", False),
+             trim = merged.get("trim", False),
+             input_already_demuxed = merged.get("input_already_demuxed", False),
+             threads = merged.get("threads"),
+             sample_sheet_path = merged.get("sample_sheet_path"),
+             sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
+             delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
+             delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
+             aligner = merged.get("aligner", "minimap2"),
+             aligner_args = merged.get("aligner_args", None),
+             device = merged.get("device", "auto"),
+             make_bigwigs = merged.get("make_bigwigs", False),
+             make_beds = merged.get("make_beds", False),
+             delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
+             mod_target_bases = merged.get("mod_target_bases", ["GpC", "CpG"]),
+             enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
+             conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
+             filter_threshold = merged.get("filter_threshold", 0.8),
+             m6A_threshold = merged.get("m6A_threshold", 0.7),
+             m5C_threshold = merged.get("m5C_threshold", 0.7),
+             hm5C_threshold = merged.get("hm5C_threshold", 0.7),
+             thresholds = merged.get("thresholds", []),
+             mod_list = merged.get("mod_list", ["5mC_5hmC", "6mA"]),
+             batch_size = merged.get("batch_size", 4),
+             skip_unclassified = merged.get("skip_unclassified", True),
+             delete_batch_hdfs = merged.get("delete_batch_hdfs", True),
+             reference_column = merged.get("reference_column", 'Reference_strand'),
+             sample_column = merged.get("sample_column", 'Barcode'),
+             sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
+             fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
+             binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
+             positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
+             negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
+             infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
+             inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
+             fit_j_threshold = merged.get("fit_j_threshold", 0.5),
+             output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
+             layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
+             layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
+             umap_layers_to_plot = merged.get("umap_layers_to_plot", ["mapped_length", 'Raw_modification_signal']),
+             rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
+             rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
+             autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
+             autocorr_max_lag = merged.get("autocorr_max_lag", 800),
+             autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'any_C']),
+             hmm_n_states = merged.get("hmm_n_states", 2),
+             hmm_init_emission_probs = merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
+             hmm_init_transition_probs = merged.get("hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]),
+             hmm_init_start_probs = merged.get("hmm_init_start_probs", [0.5, 0.5]),
+             hmm_eps = merged.get("hmm_eps", 1e-8),
+             hmm_dtype = merged.get("hmm_dtype", "float64"),
+             hmm_feature_sets = hmm_feature_sets,
+             hmm_annotation_threshold = hmm_annotation_threshold,
+             hmm_batch_size = hmm_batch_size,
+             hmm_use_viterbi = hmm_use_viterbi,
+             hmm_methbases = hmm_methbases,
+             hmm_device = hmm_device,
+             hmm_merge_layer_features = hmm_merge_layer_features,
+             footprints = merged.get("footprints", None),
+             accessible_patches = merged.get("accessible_patches", None),
+             cpg = merged.get("cpg", None),
+             read_coord_filter = merged.get("read_coord_filter", [None, None]),
+             read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
+             read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
+             read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
+             read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
+             read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
+             read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
+             read_mod_filtering_any_c_thresholds = merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
+             read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
+             read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
+             min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
+             duplicate_detection_site_types = merged.get("duplicate_detection_site_types", ['GpC', 'CpG', 'ambiguous_GpC_CpG']),
+             duplicate_detection_distance_threshold = merged.get("duplicate_detection_distance_threshold", 0.07),
+             duplicate_detection_keep_best_metric = merged.get("duplicate_detection_keep_best_metric", "read_quality"),
+             duplicate_detection_window_size_for_hamming_neighbors = merged.get("duplicate_detection_window_size_for_hamming_neighbors", 50),
+             duplicate_detection_min_overlapping_positions = merged.get("duplicate_detection_min_overlapping_positions", 20),
+             duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
+             duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
+             duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
+             position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
+             correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
+             correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
+             correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
+             hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
+             force_redo_load_adata = merged.get("force_redo_load_adata", False),
+             force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
+             force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
+             bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
+             force_redo_add_read_length_and_mapping_qc = merged.get("force_redo_add_read_length_and_mapping_qc", False),
+             bypass_clean_nan = merged.get("bypass_clean_nan", False),
+             force_redo_clean_nan = merged.get("force_redo_clean_nan", False),
+             bypass_append_base_context = merged.get("bypass_append_base_context", False),
+             force_redo_append_base_context = merged.get("force_redo_append_base_context", False),
+             invert_adata = merged.get("invert_adata", False),
+             bypass_append_binary_layer_by_base_context = merged.get("bypass_append_binary_layer_by_base_context", False),
+             force_redo_append_binary_layer_by_base_context = merged.get("force_redo_append_binary_layer_by_base_context", False),
+             bypass_calculate_read_modification_stats = merged.get("bypass_calculate_read_modification_stats", False),
1142
+ force_redo_calculate_read_modification_stats = merged.get("force_redo_calculate_read_modification_stats", False),
1143
+ bypass_filter_reads_on_modification_thresholds = merged.get("bypass_filter_reads_on_modification_thresholds", False),
1144
+ force_redo_filter_reads_on_modification_thresholds = merged.get("force_redo_filter_reads_on_modification_thresholds", False),
1145
+ bypass_flag_duplicate_reads = merged.get("bypass_flag_duplicate_reads", False),
1146
+ force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1147
+ bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1148
+ force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1149
+ force_redo_basic_analyses = merged.get("force_redo_basic_analyses", False),
1150
+ bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1151
+ force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1152
+ bypass_basic_umap = merged.get("bypass_basic_umap", False),
1153
+ force_redo_basic_umap = merged.get("force_redo_basic_umap", False),
1154
+ bypass_spatial_autocorr_calculations = merged.get("bypass_spatial_autocorr_calculations", False),
1155
+ force_redo_spatial_autocorr_calculations = merged.get("force_redo_spatial_autocorr_calculations", False),
1156
+ bypass_spatial_autocorr_plotting = merged.get("bypass_spatial_autocorr_plotting", False),
1157
+ force_redo_spatial_autocorr_plotting = merged.get("force_redo_spatial_autocorr_plotting", False),
1158
+ bypass_matrix_corr_calculations = merged.get("bypass_matrix_corr_calculations", False),
1159
+ force_redo_matrix_corr_calculations = merged.get("force_redo_matrix_corr_calculations", False),
1160
+ bypass_matrix_corr_plotting = merged.get("bypass_matrix_corr_plotting", False),
1161
+ force_redo_matrix_corr_plotting = merged.get("force_redo_matrix_corr_plotting", False),
1162
+ bypass_hmm_fit = merged.get("bypass_hmm_fit", False),
1163
+ force_redo_hmm_fit = merged.get("force_redo_hmm_fit", False),
1164
+ bypass_hmm_apply = merged.get("bypass_hmm_apply", False),
1165
+ force_redo_hmm_apply = merged.get("force_redo_hmm_apply", False),
1166
+
1167
+ config_source = config_source or "<var_dict>",
1168
+ )
1169
+
1170
+ report = {
1171
+ "modality": modality,
1172
+ "defaults_source_chain": defaults_source_chain,
1173
+ "defaults_loaded": defaults_loaded,
1174
+ "csv_normalized": normalized,
1175
+ "merged": merged,
1176
+ }
1177
+ return instance, report
1178
+
1179
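The long keyword block above is a defaults overlay: every field falls back to merged.get(key, default), so a caller only supplies the keys it wants to override. A minimal usage sketch, not part of the diff; the var_dict keys below are assumptions inferred from the fields referenced by validate() and __repr__ later in this file:

# Sketch only: build a config from a plain dict and inspect the merge report.
from smftools.config.experiment_config import ExperimentConfig

var_dict = {
    "smf_modality": "conversion",     # assumed key, mirrored by __repr__ below
    "input_data_path": "reads.pod5",  # required by validate()
    "output_directory": "results/",   # validate() creates it if missing
    "fasta": "reference.fa",          # required by validate()
    "filter_threshold": 0.9,          # overrides the 0.8 default above
}

cfg, report = ExperimentConfig.from_var_dict(var_dict)
print(report["modality"], report["merged"]["filter_threshold"])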
+    # convenience: load from CSV via LoadExperimentConfig
+    @classmethod
+    def from_csv(
+        cls,
+        csv_input: Union[str, Path, IO, pd.DataFrame],
+        date_str: Optional[str] = None,
+        config_source: Optional[str] = None,
+        defaults_dir: Optional[Union[str, Path]] = None,
+        defaults_map: Optional[Dict[str, Dict[str, Any]]] = None,
+        **kwargs,
+    ) -> Tuple["ExperimentConfig", Dict[str, Any]]:
+        """
+        Load a CSV via LoadExperimentConfig (or accept a DataFrame directly) and
+        build an ExperimentConfig. Any additional kwargs are passed through to
+        from_var_dict().
+        """
+        if isinstance(csv_input, pd.DataFrame):
+            loader = LoadExperimentConfig(pd.DataFrame(csv_input))
+        else:
+            loader = LoadExperimentConfig(csv_input)
+        var_dict = loader.var_dict
+        return cls.from_var_dict(var_dict, date_str=date_str, config_source=config_source, defaults_dir=defaults_dir, defaults_map=defaults_map, **kwargs)
+
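A companion sketch for the CSV entry point. The two-column sheet layout here is an assumption; the real schema is whatever LoadExperimentConfig parses, which is not shown in this hunk:

# Sketch only: drive the same constructor from a sample sheet.
import pandas as pd
from smftools.config.experiment_config import ExperimentConfig

sheet = pd.DataFrame({"variable": ["smf_modality", "fasta"],
                      "value": ["conversion", "reference.fa"]})  # assumed layout
cfg, report = ExperimentConfig.from_csv(sheet, config_source="sample_sheet.csv")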
+    # -------------------------
+    # validation & serialization
+    # -------------------------
+    @staticmethod
+    def _validate_hmm_features_structure(hfs: dict) -> List[str]:
+        errs = []
+        if not isinstance(hfs, dict):
+            errs.append("hmm_feature_sets must be a mapping if provided.")
+            return errs
+        for g, info in hfs.items():
+            if not isinstance(info, dict):
+                errs.append(f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'.")
+                continue
+            feats = info.get("features")
+            if not isinstance(feats, dict) or len(feats) == 0:
+                errs.append(f"hmm_feature_sets['{g}'] must include a non-empty 'features' mapping.")
+                continue
+            for fname, rng in feats.items():
+                try:
+                    lo, hi = float(rng[0]), float(rng[1])
+                    if lo < 0 or hi <= lo:
+                        errs.append(f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}.")
+                except Exception:
+                    errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
+        return errs
+
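For reference, a mapping that passes the structural checks above; the group name, feature names, and ranges are illustrative only. Each group needs a non-empty 'features' dict of (lo, hi) ranges with 0 <= lo < hi; 'state' appears in the error text but is not range-checked here:

hmm_feature_sets = {
    "accessibility": {
        "features": {
            "small_footprint": (10, 80),   # illustrative (lo, hi) range
            "nucleosome": (80, 200),
        },
        "state": 1,
    },
}
assert ExperimentConfig._validate_hmm_features_structure(hmm_feature_sets) == []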
+    def validate(self, require_paths: bool = True, raise_on_error: bool = True) -> List[str]:
+        """
+        Validate the config. If require_paths is True, check that input_data_path
+        and fasta exist, and attempt to create output_directory if it is missing.
+        Returns the list of error messages (empty if the config is valid); raises
+        ValueError instead if raise_on_error is True and any errors were found.
+        """
+        errors: List[str] = []
+        if not self.input_data_path:
+            errors.append("input_data_path is required but missing.")
+        if not self.output_directory:
+            errors.append("output_directory is required but missing.")
+        if not self.fasta:
+            errors.append("fasta (reference FASTA) is required but missing.")
+
+        if require_paths:
+            if self.input_data_path and not Path(self.input_data_path).exists():
+                errors.append(f"input_data_path does not exist: {self.input_data_path}")
+            if self.fasta and not Path(self.fasta).exists():
+                errors.append(f"fasta does not exist: {self.fasta}")
+            outp = Path(self.output_directory) if self.output_directory else None
+            if outp and not outp.exists():
+                try:
+                    outp.mkdir(parents=True, exist_ok=True)
+                except Exception as e:
+                    errors.append(f"Could not create output_directory {self.output_directory}: {e}")
+
+        if not (0.0 <= float(self.mapping_threshold) <= 1.0):
+            errors.append("mapping_threshold must be in [0,1].")
+        for t in (self.filter_threshold, self.m6A_threshold, self.m5C_threshold, self.hm5C_threshold):
+            if not (0.0 <= float(t) <= 1.0):
+                errors.append(f"threshold value {t} must be in [0,1].")
+
+        # Collect structural errors before deciding whether to raise, so that
+        # hmm_feature_sets problems are also surfaced by raise_on_error.
+        errors.extend(self._validate_hmm_features_structure(self.hmm_feature_sets))
+
+        if raise_on_error and errors:
+            raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
+
+        return errors
+
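A sketch of the non-raising mode, assuming cfg is an ExperimentConfig built as above:

# Sketch only: dry-run validation that reports problems instead of raising.
errors = cfg.validate(require_paths=False, raise_on_error=False)
if errors:
    print("config problems:\n " + "\n ".join(errors))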
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    def to_yaml(self, path: Optional[Union[str, Path]] = None) -> str:
+        """
+        Dump the config to YAML: return it as a string if path is None,
+        otherwise write it to path and return the path. If PyYAML is not
+        installed, fall back to JSON.
+        """
+        data = self.to_dict()
+        if path is None:
+            if yaml is None:
+                return json.dumps(data, indent=2)
+            return yaml.safe_dump(data, sort_keys=False)
+        else:
+            p = Path(path)
+            if yaml is None:
+                p.write_text(json.dumps(data, indent=2), encoding="utf8")
+            else:
+                p.write_text(yaml.safe_dump(data, sort_keys=False), encoding="utf8")
+            return str(p)
+
+    def save(self, path: Union[str, Path]) -> str:
+        return self.to_yaml(path)
+
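A round-trip sketch of the serialization helpers; the output path is illustrative:

# Sketch only: to_yaml() returns a string when no path is given; save() writes
# YAML (or JSON if PyYAML is absent) and returns the written path.
print(cfg.to_yaml()[:200])
written = cfg.save("results/experiment_config.yaml")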
+    def __repr__(self) -> str:
+        return f"<ExperimentConfig modality={self.smf_modality} experiment_name={self.experiment_name} source={self.config_source}>"