smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/config/experiment_config.py (new file)
@@ -0,0 +1,1288 @@
|
|
|
1
|
+
# experiment_config.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import ast
|
|
4
|
+
import json
|
|
5
|
+
import warnings
|
|
6
|
+
from dataclasses import dataclass, field, asdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
|
|
9
|
+
from .discover_input_files import discover_input_files
|
|
10
|
+
|
|
11
|
+
# Optional dependency for YAML handling
|
|
12
|
+
try:
|
|
13
|
+
import yaml
|
|
14
|
+
except Exception:
|
|
15
|
+
yaml = None
|
|
16
|
+
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import numpy as np
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# -------------------------
|
|
22
|
+
# Utility parsing functions
|
|
23
|
+
# -------------------------
|
|
24
|
+
def _parse_bool(v: Any) -> bool:
|
|
25
|
+
if isinstance(v, bool):
|
|
26
|
+
return v
|
|
27
|
+
if v is None:
|
|
28
|
+
return False
|
|
29
|
+
s = str(v).strip().lower()
|
|
30
|
+
if s in ("1", "true", "t", "yes", "y", "on"):
|
|
31
|
+
return True
|
|
32
|
+
if s in ("0", "false", "f", "no", "n", "off", ""):
|
|
33
|
+
return False
|
|
34
|
+
try:
|
|
35
|
+
return float(s) != 0.0
|
|
36
|
+
except Exception:
|
|
37
|
+
return False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _parse_list(v: Any) -> List:
|
|
41
|
+
if v is None:
|
|
42
|
+
return []
|
|
43
|
+
if isinstance(v, (list, tuple)):
|
|
44
|
+
return list(v)
|
|
45
|
+
s = str(v).strip()
|
|
46
|
+
if s == "" or s.lower() == "none":
|
|
47
|
+
return []
|
|
48
|
+
# try JSON
|
|
49
|
+
try:
|
|
50
|
+
parsed = json.loads(s)
|
|
51
|
+
if isinstance(parsed, list):
|
|
52
|
+
return parsed
|
|
53
|
+
except Exception:
|
|
54
|
+
pass
|
|
55
|
+
# try python literal eval
|
|
56
|
+
try:
|
|
57
|
+
lit = ast.literal_eval(s)
|
|
58
|
+
if isinstance(lit, (list, tuple)):
|
|
59
|
+
return list(lit)
|
|
60
|
+
except Exception:
|
|
61
|
+
pass
|
|
62
|
+
# fallback comma separated
|
|
63
|
+
s2 = s.strip("[]() ")
|
|
64
|
+
parts = [p.strip() for p in s2.split(",") if p.strip() != ""]
|
|
65
|
+
return parts
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _parse_numeric(v: Any, fallback: Any = None) -> Any:
|
|
69
|
+
if v is None:
|
|
70
|
+
return fallback
|
|
71
|
+
if isinstance(v, (int, float)):
|
|
72
|
+
return v
|
|
73
|
+
s = str(v).strip()
|
|
74
|
+
if s == "" or s.lower() == "none":
|
|
75
|
+
return fallback
|
|
76
|
+
try:
|
|
77
|
+
return int(s)
|
|
78
|
+
except Exception:
|
|
79
|
+
try:
|
|
80
|
+
return float(s)
|
|
81
|
+
except Exception:
|
|
82
|
+
return fallback
|
|
83
|
+
|
|
84
|
+
def _try_json_or_literal(s: Any) -> Any:
|
|
85
|
+
"""Try parse JSON or python literal; otherwise return original string."""
|
|
86
|
+
if s is None:
|
|
87
|
+
return None
|
|
88
|
+
if not isinstance(s, str):
|
|
89
|
+
return s
|
|
90
|
+
s0 = s.strip()
|
|
91
|
+
if s0 == "":
|
|
92
|
+
return None
|
|
93
|
+
# try json
|
|
94
|
+
try:
|
|
95
|
+
return json.loads(s0)
|
|
96
|
+
except Exception:
|
|
97
|
+
pass
|
|
98
|
+
# try python literal
|
|
99
|
+
try:
|
|
100
|
+
return ast.literal_eval(s0)
|
|
101
|
+
except Exception:
|
|
102
|
+
pass
|
|
103
|
+
return s
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def resolve_aligner_args(
|
|
107
|
+
merged: dict,
|
|
108
|
+
default_by_aligner: Optional[Dict[str, List[str]]] = None,
|
|
109
|
+
aligner_synonyms: Optional[Dict[str, str]] = None,
|
|
110
|
+
) -> List[str]:
|
|
111
|
+
"""
|
|
112
|
+
Resolve merged['aligner_args'] into a concrete list for the chosen aligner and sequencer.
|
|
113
|
+
|
|
114
|
+
Behavior (search order):
|
|
115
|
+
1. If aligner_args is a dict, try keys in this order (case-insensitive):
|
|
116
|
+
a) "<aligner>@<sequencer>" (top-level combined key)
|
|
117
|
+
b) aligner -> (if dict) sequencer (nested) -> 'default' fallback
|
|
118
|
+
c) aligner -> (if list) use that list
|
|
119
|
+
d) top-level 'default' key in aligner_args dict
|
|
120
|
+
2. If aligner_args is a list -> return it (applies to any aligner/sequencer).
|
|
121
|
+
3. If aligner_args is a string -> try parse JSON/literal or return single-element list.
|
|
122
|
+
4. Otherwise fall back to builtin defaults per aligner.
|
|
123
|
+
"""
|
|
124
|
+
# builtin defaults (aligner -> args)
|
|
125
|
+
builtin_defaults = {
|
|
126
|
+
"minimap2": ['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
|
|
127
|
+
"dorado": ['--mm2-opts', '-N', '5'],
|
|
128
|
+
}
|
|
129
|
+
if default_by_aligner is None:
|
|
130
|
+
default_by_aligner = builtin_defaults
|
|
131
|
+
|
|
132
|
+
# synonyms mapping
|
|
133
|
+
synonyms = {"mm2": "minimap2", "minimap": "minimap2", "minimap-2": "minimap2"}
|
|
134
|
+
if aligner_synonyms:
|
|
135
|
+
synonyms.update(aligner_synonyms)
|
|
136
|
+
|
|
137
|
+
# canonicalize requested aligner and sequencer
|
|
138
|
+
raw_aligner = merged.get("aligner", "minimap2") or "minimap2"
|
|
139
|
+
raw_sequencer = merged.get("sequencer", None) # e.g. 'ont', 'pacbio', 'illumina'
|
|
140
|
+
key_align = str(raw_aligner).strip().lower()
|
|
141
|
+
key_seq = None if raw_sequencer is None else str(raw_sequencer).strip().lower()
|
|
142
|
+
if key_align in synonyms:
|
|
143
|
+
key_align = synonyms[key_align]
|
|
144
|
+
|
|
145
|
+
raw = merged.get("aligner_args", None)
|
|
146
|
+
|
|
147
|
+
# helper to coerce a candidate to list[str]
|
|
148
|
+
def _coerce_to_list(val):
|
|
149
|
+
if isinstance(val, (list, tuple)):
|
|
150
|
+
return [str(x) for x in val]
|
|
151
|
+
if isinstance(val, str):
|
|
152
|
+
parsed = _try_json_or_literal(val)
|
|
153
|
+
if isinstance(parsed, (list, tuple)):
|
|
154
|
+
return [str(x) for x in parsed]
|
|
155
|
+
return [str(parsed)]
|
|
156
|
+
if val is None:
|
|
157
|
+
return None
|
|
158
|
+
return [str(val)]
|
|
159
|
+
|
|
160
|
+
# If dict, do layered lookups
|
|
161
|
+
if isinstance(raw, dict):
|
|
162
|
+
# case-insensitive dict
|
|
163
|
+
top_map = {str(k).lower(): v for k, v in raw.items()}
|
|
164
|
+
|
|
165
|
+
# 1) try combined top-level key "aligner@sequencer"
|
|
166
|
+
if key_seq:
|
|
167
|
+
combined_key = f"{key_align}@{key_seq}"
|
|
168
|
+
if combined_key in top_map:
|
|
169
|
+
res = _coerce_to_list(top_map[combined_key])
|
|
170
|
+
if res:
|
|
171
|
+
return res
|
|
172
|
+
|
|
173
|
+
# 2) try aligner key
|
|
174
|
+
if key_align in top_map:
|
|
175
|
+
val = top_map[key_align]
|
|
176
|
+
# if nested dict: try sequencer key then 'default'
|
|
177
|
+
if isinstance(val, dict):
|
|
178
|
+
submap = {str(k).lower(): v for k, v in val.items()}
|
|
179
|
+
if key_seq and key_seq in submap:
|
|
180
|
+
res = _coerce_to_list(submap[key_seq])
|
|
181
|
+
if res:
|
|
182
|
+
return res
|
|
183
|
+
if "default" in submap:
|
|
184
|
+
res = _coerce_to_list(submap["default"])
|
|
185
|
+
if res:
|
|
186
|
+
return res
|
|
187
|
+
# nothing matched inside aligner->dict; fall back to top-level aligner (no sequencer)
|
|
188
|
+
else:
|
|
189
|
+
# aligner maps to list/str: use it
|
|
190
|
+
res = _coerce_to_list(val)
|
|
191
|
+
if res:
|
|
192
|
+
return res
|
|
193
|
+
|
|
194
|
+
# 3) try top-level 'default' key inside aligner_args mapping
|
|
195
|
+
if "default" in top_map:
|
|
196
|
+
res = _coerce_to_list(top_map["default"])
|
|
197
|
+
if res:
|
|
198
|
+
return res
|
|
199
|
+
|
|
200
|
+
# 4) last top-level attempt: any key equal to aligner synonyms etc (already handled)
|
|
201
|
+
# fallthrough to builtin
|
|
202
|
+
# If user provided a concrete list -> use it
|
|
203
|
+
if isinstance(raw, (list, tuple)):
|
|
204
|
+
return [str(x) for x in raw]
|
|
205
|
+
|
|
206
|
+
# If scalar string, attempt to parse
|
|
207
|
+
if isinstance(raw, str):
|
|
208
|
+
parsed = _try_json_or_literal(raw)
|
|
209
|
+
if isinstance(parsed, (list, tuple)):
|
|
210
|
+
return [str(x) for x in parsed]
|
|
211
|
+
return [str(parsed)]
|
|
212
|
+
|
|
213
|
+
# Nothing found -> fallback builtin default
|
|
214
|
+
return list(default_by_aligner.get(key_align, []))
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# HMM default params and hepler functions
|
|
218
|
+
def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
219
|
+
"""
|
|
220
|
+
Normalize user-provided `hmm_feature_sets` into canonical structure:
|
|
221
|
+
{ group_name: {"features": {label: (lo, hi), ...}, "state": "<Modified|Non-Modified>"} }
|
|
222
|
+
Accepts dict, JSON/string, None. Returns {} for empty input.
|
|
223
|
+
"""
|
|
224
|
+
if raw is None:
|
|
225
|
+
return {}
|
|
226
|
+
parsed = raw
|
|
227
|
+
if isinstance(raw, str):
|
|
228
|
+
parsed = _try_json_or_literal(raw)
|
|
229
|
+
if not isinstance(parsed, dict):
|
|
230
|
+
return {}
|
|
231
|
+
|
|
232
|
+
def _coerce_bound(x):
|
|
233
|
+
if x is None:
|
|
234
|
+
return None
|
|
235
|
+
if isinstance(x, (int, float)):
|
|
236
|
+
return float(x)
|
|
237
|
+
s = str(x).strip().lower()
|
|
238
|
+
if s in ("inf", "infty", "infinite"):
|
|
239
|
+
return np.inf
|
|
240
|
+
if s in ("none", ""):
|
|
241
|
+
return None
|
|
242
|
+
try:
|
|
243
|
+
return float(x)
|
|
244
|
+
except Exception:
|
|
245
|
+
return None
|
|
246
|
+
|
|
247
|
+
def _coerce_feature_map(feats):
|
|
248
|
+
out = {}
|
|
249
|
+
if not isinstance(feats, dict):
|
|
250
|
+
return out
|
|
251
|
+
for fname, rng in feats.items():
|
|
252
|
+
if rng is None:
|
|
253
|
+
out[fname] = (0.0, np.inf)
|
|
254
|
+
continue
|
|
255
|
+
if isinstance(rng, (list, tuple)) and len(rng) >= 2:
|
|
256
|
+
lo = _coerce_bound(rng[0]) or 0.0
|
|
257
|
+
hi = _coerce_bound(rng[1])
|
|
258
|
+
if hi is None:
|
|
259
|
+
hi = np.inf
|
|
260
|
+
out[fname] = (float(lo), float(hi) if not np.isinf(hi) else np.inf)
|
|
261
|
+
else:
|
|
262
|
+
# scalar -> treat as upper bound
|
|
263
|
+
val = _coerce_bound(rng)
|
|
264
|
+
out[fname] = (0.0, float(val) if val is not None else np.inf)
|
|
265
|
+
return out
|
|
266
|
+
|
|
267
|
+
canonical = {}
|
|
268
|
+
for grp, info in parsed.items():
|
|
269
|
+
if not isinstance(info, dict):
|
|
270
|
+
feats = _coerce_feature_map(info)
|
|
271
|
+
canonical[grp] = {"features": feats, "state": "Modified"}
|
|
272
|
+
continue
|
|
273
|
+
feats = _coerce_feature_map(info.get("features", info.get("ranges", {})))
|
|
274
|
+
state = info.get("state", info.get("label", "Modified"))
|
|
275
|
+
canonical[grp] = {"features": feats, "state": state}
|
|
276
|
+
return canonical
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
# -------------------------
|
|
280
|
+
# LoadExperimentConfig
|
|
281
|
+
# -------------------------
|
|
282
|
+
class LoadExperimentConfig:
|
|
283
|
+
"""
|
|
284
|
+
Load an experiment CSV (or DataFrame / file-like) into a typed var_dict.
|
|
285
|
+
|
|
286
|
+
CSV expected columns: 'variable', 'value', optional 'type'.
|
|
287
|
+
If 'type' missing, the loader will infer type.
|
|
288
|
+
|
|
289
|
+
Example
|
|
290
|
+
-------
|
|
291
|
+
loader = LoadExperimentConfig("experiment_config.csv")
|
|
292
|
+
var_dict = loader.var_dict
|
|
293
|
+
"""
|
|
294
|
+
|
|
295
|
+
def __init__(self, experiment_config: Union[str, Path, IO, pd.DataFrame]):
|
|
296
|
+
self.source = experiment_config
|
|
297
|
+
self.df = self._load_df(experiment_config)
|
|
298
|
+
self.var_dict = self._parse_df(self.df)
|
|
299
|
+
|
|
300
|
+
@staticmethod
|
|
301
|
+
def _load_df(source: Union[str, Path, IO, pd.DataFrame]) -> pd.DataFrame:
|
|
302
|
+
"""Load a pandas DataFrame from path, file-like, or accept if already DataFrame."""
|
|
303
|
+
if isinstance(source, pd.DataFrame):
|
|
304
|
+
df = source.copy()
|
|
305
|
+
else:
|
|
306
|
+
if isinstance(source, (str, Path)):
|
|
307
|
+
p = Path(source)
|
|
308
|
+
if not p.exists():
|
|
309
|
+
raise FileNotFoundError(f"Config file not found: {source}")
|
|
310
|
+
df = pd.read_csv(p, dtype=str, keep_default_na=False, na_values=[""])
|
|
311
|
+
else:
|
|
312
|
+
# file-like
|
|
313
|
+
df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
|
|
314
|
+
# normalize column names
|
|
315
|
+
df.columns = [c.strip() for c in df.columns]
|
|
316
|
+
if 'variable' not in df.columns:
|
|
317
|
+
raise ValueError("Config CSV must contain a 'variable' column.")
|
|
318
|
+
if 'value' not in df.columns:
|
|
319
|
+
df['value'] = ''
|
|
320
|
+
if 'type' not in df.columns:
|
|
321
|
+
df['type'] = ''
|
|
322
|
+
return df
|
|
323
|
+
|
|
324
|
+
@staticmethod
|
|
325
|
+
def _parse_value_as_type(value_str: Optional[str], dtype_hint: Optional[str]) -> Any:
|
|
326
|
+
"""
|
|
327
|
+
Parse a single value string into a Python object guided by dtype_hint (or infer).
|
|
328
|
+
Supports int, float, bool, list, JSON, Python literal, or string.
|
|
329
|
+
"""
|
|
330
|
+
if value_str is None:
|
|
331
|
+
return None
|
|
332
|
+
v = str(value_str).strip()
|
|
333
|
+
if v == "" or v.lower() == "none":
|
|
334
|
+
return None
|
|
335
|
+
|
|
336
|
+
hint = "" if dtype_hint is None else str(dtype_hint).strip().lower()
|
|
337
|
+
|
|
338
|
+
def parse_bool(s: str):
|
|
339
|
+
s2 = s.strip().lower()
|
|
340
|
+
if s2 in ('1', 'true', 't', 'yes', 'y', 'on'):
|
|
341
|
+
return True
|
|
342
|
+
if s2 in ('0', 'false', 'f', 'no', 'n', 'off'):
|
|
343
|
+
return False
|
|
344
|
+
raise ValueError(f"Cannot parse boolean from '{s}'")
|
|
345
|
+
|
|
346
|
+
def parse_list_like(s: str):
|
|
347
|
+
# try JSON first
|
|
348
|
+
try:
|
|
349
|
+
val = json.loads(s)
|
|
350
|
+
if isinstance(val, list):
|
|
351
|
+
return val
|
|
352
|
+
except Exception:
|
|
353
|
+
pass
|
|
354
|
+
# try python literal
|
|
355
|
+
try:
|
|
356
|
+
val = ast.literal_eval(s)
|
|
357
|
+
if isinstance(val, (list, tuple)):
|
|
358
|
+
return list(val)
|
|
359
|
+
except Exception:
|
|
360
|
+
pass
|
|
361
|
+
# fallback split
|
|
362
|
+
parts = [p.strip() for p in s.strip("()[] ").split(',') if p.strip() != ""]
|
|
363
|
+
return parts
|
|
364
|
+
|
|
365
|
+
if hint in ('int', 'integer'):
|
|
366
|
+
return int(v)
|
|
367
|
+
if hint in ('float', 'double'):
|
|
368
|
+
return float(v)
|
|
369
|
+
if hint in ('bool', 'boolean'):
|
|
370
|
+
return parse_bool(v)
|
|
371
|
+
if hint in ('list', 'array'):
|
|
372
|
+
return parse_list_like(v)
|
|
373
|
+
if hint in ('string', 'str'):
|
|
374
|
+
return v
|
|
375
|
+
|
|
376
|
+
# infer
|
|
377
|
+
try:
|
|
378
|
+
return int(v)
|
|
379
|
+
except Exception:
|
|
380
|
+
pass
|
|
381
|
+
try:
|
|
382
|
+
return float(v)
|
|
383
|
+
except Exception:
|
|
384
|
+
pass
|
|
385
|
+
try:
|
|
386
|
+
return parse_bool(v)
|
|
387
|
+
except Exception:
|
|
388
|
+
pass
|
|
389
|
+
try:
|
|
390
|
+
j = json.loads(v)
|
|
391
|
+
return j
|
|
392
|
+
except Exception:
|
|
393
|
+
pass
|
|
394
|
+
try:
|
|
395
|
+
lit = ast.literal_eval(v)
|
|
396
|
+
return lit
|
|
397
|
+
except Exception:
|
|
398
|
+
pass
|
|
399
|
+
if (',' in v) and (not any(ch in v for ch in '{}[]()')):
|
|
400
|
+
return [p.strip() for p in v.split(',') if p.strip() != ""]
|
|
401
|
+
return v
|
|
402
|
+
|
|
403
|
+
def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
|
|
404
|
+
parsed: Dict[str, Any] = {}
|
|
405
|
+
for idx, row in df.iterrows():
|
|
406
|
+
name = str(row['variable']).strip()
|
|
407
|
+
if name == "":
|
|
408
|
+
continue
|
|
409
|
+
raw_val = row.get('value', "")
|
|
410
|
+
raw_type = row.get('type', "")
|
|
411
|
+
if pd.isna(raw_val) or str(raw_val).strip() == "":
|
|
412
|
+
raw_val = None
|
|
413
|
+
try:
|
|
414
|
+
parsed_val = self._parse_value_as_type(raw_val, raw_type)
|
|
415
|
+
except Exception as e:
|
|
416
|
+
warnings.warn(f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value.")
|
|
417
|
+
parsed_val = None if raw_val is None else raw_val
|
|
418
|
+
if name in parsed:
|
|
419
|
+
warnings.warn(f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value.")
|
|
420
|
+
parsed[name] = parsed_val
|
|
421
|
+
return parsed
|
|
422
|
+
|
|
423
|
+
def to_dataframe(self) -> pd.DataFrame:
|
|
424
|
+
"""Return parsed config as a pandas DataFrame (variable, value)."""
|
|
425
|
+
rows = []
|
|
426
|
+
for k, v in self.var_dict.items():
|
|
427
|
+
rows.append({'variable': k, 'value': v})
|
|
428
|
+
return pd.DataFrame(rows)
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
# -------------------------
|
|
432
|
+
# deep merge & defaults loader (with inheritance)
|
|
433
|
+
# -------------------------
|
|
434
|
+
def deep_merge(a: Dict[str, Any], b: Dict[str, Any]) -> Dict[str, Any]:
|
|
435
|
+
"""
|
|
436
|
+
Recursively merge two dicts: returns new dict = a merged with b, where b overrides.
|
|
437
|
+
If both values are dicts -> merge recursively; else b replaces a.
|
|
438
|
+
"""
|
|
439
|
+
out = dict(a or {})
|
|
440
|
+
for k, v in (b or {}).items():
|
|
441
|
+
if k in out and isinstance(out[k], dict) and isinstance(v, dict):
|
|
442
|
+
out[k] = deep_merge(out[k], v)
|
|
443
|
+
else:
|
|
444
|
+
out[k] = v
|
|
445
|
+
return out
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _load_defaults_file(path: Path) -> Dict[str, Any]:
|
|
449
|
+
if not path.exists():
|
|
450
|
+
return {}
|
|
451
|
+
text = path.read_text(encoding="utf8")
|
|
452
|
+
suffix = path.suffix.lower()
|
|
453
|
+
if suffix in (".yaml", ".yml"):
|
|
454
|
+
if yaml is None:
|
|
455
|
+
raise RuntimeError("PyYAML required to load YAML defaults (pip install pyyaml).")
|
|
456
|
+
return yaml.safe_load(text) or {}
|
|
457
|
+
elif suffix == ".json":
|
|
458
|
+
return json.loads(text or "{}")
|
|
459
|
+
else:
|
|
460
|
+
# try json then yaml if available
|
|
461
|
+
try:
|
|
462
|
+
return json.loads(text)
|
|
463
|
+
except Exception:
|
|
464
|
+
if yaml is not None:
|
|
465
|
+
return yaml.safe_load(text) or {}
|
|
466
|
+
raise RuntimeError(f"Unknown defaults file type for {path}; provide JSON or YAML.")
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def load_defaults_with_inheritance(
|
|
470
|
+
defaults_dir: Union[str, Path],
|
|
471
|
+
modality: Optional[str],
|
|
472
|
+
*,
|
|
473
|
+
default_basename: str = "default",
|
|
474
|
+
allowed_exts: Tuple[str, ...] = (".yaml", ".yml", ".json"),
|
|
475
|
+
debug: bool = False,
|
|
476
|
+
) -> Tuple[Dict[str, Any], List[str]]:
|
|
477
|
+
"""
|
|
478
|
+
Strict loader: only loads default + modality + any explicit 'extends' chain.
|
|
479
|
+
|
|
480
|
+
- defaults_dir: directory containing defaults files.
|
|
481
|
+
- modality: name of modality (e.g. "GpC"). We look for <modality>.<ext> in defaults_dir.
|
|
482
|
+
- default_basename: name of fallback default file (without extension).
|
|
483
|
+
- allowed_exts: allowed extensions to try.
|
|
484
|
+
- debug: if True, prints what was loaded.
|
|
485
|
+
|
|
486
|
+
Returns (merged_defaults_dict, load_order_list) where load_order_list are resolved file paths read.
|
|
487
|
+
"""
|
|
488
|
+
pdir = Path(defaults_dir) if defaults_dir is not None else None
|
|
489
|
+
if pdir is None or not pdir.exists():
|
|
490
|
+
return {}, []
|
|
491
|
+
|
|
492
|
+
# Resolve a "name" to a file in defaults_dir.
|
|
493
|
+
# Only treat `name` as an explicit path if it contains a path separator or is absolute.
|
|
494
|
+
def resolve_name_to_path(name: str) -> Optional[Path]:
|
|
495
|
+
n = str(name).strip()
|
|
496
|
+
if n == "":
|
|
497
|
+
return None
|
|
498
|
+
cand = Path(n)
|
|
499
|
+
# If user provided a path-like string (contains slash/backslash or absolute), allow it
|
|
500
|
+
if cand.is_absolute() or ("/" in n) or ("\\" in n):
|
|
501
|
+
if cand.exists() and cand.suffix.lower() in allowed_exts:
|
|
502
|
+
return cand.resolve()
|
|
503
|
+
return None
|
|
504
|
+
# Otherwise only look inside defaults_dir for name + ext (do NOT treat bare name as arbitrary file)
|
|
505
|
+
for ext in allowed_exts:
|
|
506
|
+
p = pdir / f"{n}{ext}"
|
|
507
|
+
if p.exists():
|
|
508
|
+
return p.resolve()
|
|
509
|
+
return None
|
|
510
|
+
|
|
511
|
+
visited = set()
|
|
512
|
+
load_order: List[str] = []
|
|
513
|
+
|
|
514
|
+
def _rec_load(name_or_path: Union[str, Path]) -> Dict[str, Any]:
|
|
515
|
+
# Resolve to a file path (strict)
|
|
516
|
+
if isinstance(name_or_path, Path):
|
|
517
|
+
p = name_or_path
|
|
518
|
+
else:
|
|
519
|
+
p = resolve_name_to_path(str(name_or_path))
|
|
520
|
+
if p is None:
|
|
521
|
+
if debug:
|
|
522
|
+
print(f"[defaults loader] resolve failed for '{name_or_path}'")
|
|
523
|
+
return {}
|
|
524
|
+
p = Path(p).resolve()
|
|
525
|
+
p_str = str(p)
|
|
526
|
+
if p_str in visited:
|
|
527
|
+
if debug:
|
|
528
|
+
print(f"[defaults loader] already visited {p_str} (skipping to avoid cycle)")
|
|
529
|
+
return {}
|
|
530
|
+
visited.add(p_str)
|
|
531
|
+
|
|
532
|
+
data = _load_defaults_file(p) # reuse your existing helper
|
|
533
|
+
if not isinstance(data, dict):
|
|
534
|
+
if debug:
|
|
535
|
+
print(f"[defaults loader] file {p_str} did not produce a dict -> ignoring")
|
|
536
|
+
data = {}
|
|
537
|
+
|
|
538
|
+
# Extract any extends/inherits keys (string or list). They reference other named default files.
|
|
539
|
+
bases = []
|
|
540
|
+
for key in ("extends", "inherits", "base"):
|
|
541
|
+
if key in data:
|
|
542
|
+
b = data.pop(key)
|
|
543
|
+
if isinstance(b, (list, tuple)):
|
|
544
|
+
bases = list(b)
|
|
545
|
+
elif isinstance(b, str):
|
|
546
|
+
bases = [b]
|
|
547
|
+
break
|
|
548
|
+
|
|
549
|
+
merged = {}
|
|
550
|
+
# Load bases first (in order); bases are resolved relative to defaults_dir unless given as path
|
|
551
|
+
for base_name in bases:
|
|
552
|
+
base_defaults = _rec_load(base_name)
|
|
553
|
+
merged = deep_merge(merged, base_defaults)
|
|
554
|
+
|
|
555
|
+
# Then merge this file's data (this file overrides its bases)
|
|
556
|
+
merged = deep_merge(merged, data)
|
|
557
|
+
load_order.append(p_str)
|
|
558
|
+
if debug:
|
|
559
|
+
print(f"[defaults loader] loaded {p_str}")
|
|
560
|
+
return merged
|
|
561
|
+
|
|
562
|
+
merged_defaults = {}
|
|
563
|
+
# Load default.* first if present
|
|
564
|
+
def_path = resolve_name_to_path(default_basename)
|
|
565
|
+
if def_path is not None:
|
|
566
|
+
merged_defaults = deep_merge(merged_defaults, _rec_load(def_path))
|
|
567
|
+
|
|
568
|
+
# Load modality.* if present (modality overrides default)
|
|
569
|
+
if modality:
|
|
570
|
+
mod_path = resolve_name_to_path(modality)
|
|
571
|
+
if mod_path is not None:
|
|
572
|
+
merged_defaults = deep_merge(merged_defaults, _rec_load(mod_path))
|
|
573
|
+
else:
|
|
574
|
+
if debug:
|
|
575
|
+
print(f"[defaults loader] no modality file found for '{modality}' in {pdir}")
|
|
576
|
+
|
|
577
|
+
if debug:
|
|
578
|
+
print("[defaults loader] final load order:", load_order)
|
|
579
|
+
return merged_defaults, load_order
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
# -------------------------
|
|
583
|
+
# ExperimentConfig dataclass
|
|
584
|
+
# -------------------------
|
|
585
|
+
@dataclass
|
|
586
|
+
class ExperimentConfig:
|
|
587
|
+
# Compute
|
|
588
|
+
threads: Optional[int] = None
|
|
589
|
+
device: str = "auto"
|
|
590
|
+
|
|
591
|
+
# General I/O
|
|
592
|
+
input_data_path: Optional[str] = None
|
|
593
|
+
output_directory: Optional[str] = None
|
|
594
|
+
fasta: Optional[str] = None
|
|
595
|
+
bam_suffix: str = ".bam"
|
|
596
|
+
recursive_input_search: bool = True
|
|
597
|
+
input_type: Optional[str] = None
|
|
598
|
+
input_files: Optional[List[Path]] = None
|
|
599
|
+
split_dir: str = "demultiplexed_BAMs"
|
|
600
|
+
split_path: Optional[str] = None
|
|
601
|
+
strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
|
|
602
|
+
conversions: List[str] = field(default_factory=lambda: ["unconverted"])
|
|
603
|
+
fasta_regions_of_interest: Optional[str] = None
|
|
604
|
+
sample_sheet_path: Optional[str] = None
|
|
605
|
+
sample_sheet_mapping_column: Optional[str] = 'Barcode'
|
|
606
|
+
experiment_name: Optional[str] = None
|
|
607
|
+
input_already_demuxed: bool = False
|
|
608
|
+
summary_file: Optional[Path] = None
|
|
609
|
+
|
|
610
|
+
# FASTQ input specific
|
|
611
|
+
fastq_barcode_map: Optional[Dict[str, str]] = None
|
|
612
|
+
fastq_auto_pairing: bool = True
|
|
613
|
+
|
|
614
|
+
# Remove intermediate file options
|
|
615
|
+
delete_intermediate_bams: bool = True
|
|
616
|
+
delete_intermediate_tsvs: bool = True
|
|
617
|
+
|
|
618
|
+
# Conversion/Deamination file handling
|
|
619
|
+
delete_intermediate_hdfs: bool = True
|
|
620
|
+
|
|
621
|
+
# Direct SMF specific params for initial AnnData loading
|
|
622
|
+
batch_size: int = 4
|
|
623
|
+
skip_unclassified: bool = True
|
|
624
|
+
delete_batch_hdfs: bool = True
|
|
625
|
+
|
|
626
|
+
# Sequencing modality and general experiment params
|
|
627
|
+
smf_modality: Optional[str] = None
|
|
628
|
+
sequencer: Optional[str] = None
|
|
629
|
+
|
|
630
|
+
# Enzyme / mod targets
|
|
631
|
+
mod_target_bases: List[str] = field(default_factory=lambda: ["GpC", "CpG"])
|
|
632
|
+
enzyme_target_bases: List[str] = field(default_factory=lambda: ["GpC"])
|
|
633
|
+
|
|
634
|
+
# Conversion/deamination
|
|
635
|
+
conversion_types: List[str] = field(default_factory=lambda: ["5mC"])
|
|
636
|
+
|
|
637
|
+
# Nanopore specific for basecalling and demultiplexing
|
|
638
|
+
model_dir: Optional[str] = None
|
|
639
|
+
barcode_kit: Optional[str] = None
|
|
640
|
+
model: str = "hac"
|
|
641
|
+
barcode_both_ends: bool = False
|
|
642
|
+
trim: bool = False
|
|
643
|
+
# General basecalling params
|
|
644
|
+
filter_threshold: float = 0.8
|
|
645
|
+
# Modified basecalling specific params
|
|
646
|
+
m6A_threshold: float = 0.7
|
|
647
|
+
m5C_threshold: float = 0.7
|
|
648
|
+
hm5C_threshold: float = 0.7
|
|
649
|
+
thresholds: List[float] = field(default_factory=list)
|
|
650
|
+
mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
|
|
651
|
+
|
|
652
|
+
# Alignment params
|
|
653
|
+
mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
|
|
654
|
+
aligner: str = "minimap2"
|
|
655
|
+
aligner_args: Optional[List[str]] = None
|
|
656
|
+
make_bigwigs: bool = False
|
|
657
|
+
make_beds: bool = False
|
|
658
|
+
|
|
659
|
+
# Anndata structure
|
|
660
|
+
reference_column: Optional[str] = 'Reference_strand'
|
|
661
|
+
sample_column: Optional[str] = 'Barcode'
|
|
662
|
+
|
|
663
|
+
# General Plotting
|
|
664
|
+
sample_name_col_for_plotting: Optional[str] = 'Barcode'
|
|
665
|
+
rows_per_qc_histogram_grid: int = 12
|
|
666
|
+
|
|
667
|
+
# Preprocessing - Read length and quality filter params
|
|
668
|
+
read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
669
|
+
read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
|
|
670
|
+
read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
|
|
671
|
+
read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
|
|
672
|
+
read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
673
|
+
|
|
674
|
+
# Preprocessing - Direct mod detection binarization params
|
|
675
|
+
fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
|
|
676
|
+
binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
677
|
+
positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
|
|
678
|
+
negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
|
|
679
|
+
infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
680
|
+
inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
681
|
+
fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
|
|
682
|
+
output_binary_layer_name: Optional[str] = "binarized_methylation"
|
|
683
|
+
|
|
684
|
+
# Preprocessing - Read modification filter params
|
|
685
|
+
read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
686
|
+
read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
|
|
687
|
+
read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
688
|
+
read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
689
|
+
read_mod_filtering_use_other_c_as_background: bool = True
|
|
690
|
+
min_valid_fraction_positions_in_read_vs_ref: float = 0.2
|
|
691
|
+
|
|
692
|
+
# Preprocessing - Duplicate detection params
|
|
693
|
+
duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
|
|
694
|
+
duplicate_detection_distance_threshold: float = 0.07
|
|
695
|
+
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
|
|
696
|
+
duplicate_detection_keep_best_metric: str ='read_quality'
|
|
697
|
+
duplicate_detection_window_size_for_hamming_neighbors: int = 50
|
|
698
|
+
duplicate_detection_min_overlapping_positions: int = 20
|
|
699
|
+
duplicate_detection_do_hierarchical: bool = True
|
|
700
|
+
duplicate_detection_hierarchical_linkage: str = "average"
|
|
701
|
+
duplicate_detection_do_pca: bool = False
|
|
702
|
+
|
|
703
|
+
# Preprocessing - Position QC
|
|
704
|
+
position_max_nan_threshold: float = 0.1
|
|
705
|
+
|
|
706
|
+
# Basic Analysis - Clustermap params
|
|
707
|
+
layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
|
|
708
|
+
|
|
709
|
+
# Basic Analysis - UMAP/Leiden params
|
|
710
|
+
layer_for_umap_plotting: Optional[str] = 'nan_half'
|
|
711
|
+
umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
|
|
712
|
+
|
|
713
|
+
# Basic Analysis - Spatial Autocorrelation params
|
|
714
|
+
rows_per_qc_autocorr_grid: int = 12
|
|
715
|
+
autocorr_rolling_window_size: int = 25
|
|
716
|
+
autocorr_max_lag: int = 800
|
|
717
|
+
autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
|
|
718
|
+
|
|
719
|
+
# Basic Analysis - Correlation Matrix params
|
|
720
|
+
correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
|
|
721
|
+
correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
|
|
722
|
+
correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
|
|
723
|
+
|
|
724
|
+
# HMM params
|
|
725
|
+
hmm_n_states: int = 2
|
|
726
|
+
hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
|
|
727
|
+
hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
|
|
728
|
+
hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
|
|
729
|
+
hmm_eps: float = 1e-8
|
|
730
|
+
hmm_dtype: str = "float64"
|
|
731
|
+
hmm_annotation_threshold: float = 0.5
|
|
732
|
+
hmm_batch_size: int = 1024
|
|
733
|
+
hmm_use_viterbi: bool = False
|
|
734
|
+
hmm_device: Optional[str] = None
|
|
735
|
+
hmm_methbases: Optional[List[str]] = None # if None, HMM.annotate_adata will fall back to mod_target_bases
|
|
736
|
+
footprints: Optional[bool] = True
|
|
737
|
+
accessible_patches: Optional[bool] = True
|
|
738
|
+
cpg: Optional[bool] = False
|
|
739
|
+
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
740
|
+
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
|
|
741
|
+
|
|
742
|
+
# Pipeline control flow - load adata
|
|
743
|
+
force_redo_load_adata: bool = False
|
|
744
|
+
|
|
745
|
+
# Pipeline control flow - preprocessing and QC
|
|
746
|
+
force_redo_preprocessing: bool = False
|
|
747
|
+
force_reload_sample_sheet: bool = True
|
|
748
|
+
bypass_add_read_length_and_mapping_qc: bool = False
|
|
749
|
+
force_redo_add_read_length_and_mapping_qc: bool = False
|
|
750
|
+
bypass_clean_nan: bool = False
|
|
751
|
+
force_redo_clean_nan: bool = False
|
|
752
|
+
bypass_append_base_context: bool = False
|
|
753
|
+
force_redo_append_base_context: bool = False
|
|
754
|
+
invert_adata: bool = False
|
|
755
|
+
bypass_append_binary_layer_by_base_context: bool = False
|
|
756
|
+
force_redo_append_binary_layer_by_base_context: bool = False
|
|
757
|
+
bypass_calculate_read_modification_stats: bool = False
|
|
758
|
+
force_redo_calculate_read_modification_stats: bool = False
|
|
759
|
+
bypass_filter_reads_on_modification_thresholds: bool = False
|
|
760
|
+
force_redo_filter_reads_on_modification_thresholds: bool = False
|
|
761
|
+
bypass_flag_duplicate_reads: bool = False
|
|
762
|
+
force_redo_flag_duplicate_reads: bool = False
|
|
763
|
+
bypass_complexity_analysis: bool = False
|
|
764
|
+
force_redo_complexity_analysis: bool = False
|
|
765
|
+
|
|
766
|
+
# Pipeline control flow - Basic Analyses
|
|
767
|
+
force_redo_basic_analyses: bool = False
|
|
768
|
+
bypass_basic_clustermaps: bool = False
|
|
769
|
+
force_redo_basic_clustermaps: bool = False
|
|
770
|
+
bypass_basic_umap: bool = False
|
|
771
|
+
force_redo_basic_umap: bool = False
|
|
772
|
+
bypass_spatial_autocorr_calculations: bool = False
|
|
773
|
+
force_redo_spatial_autocorr_calculations: bool = False
|
|
774
|
+
bypass_spatial_autocorr_plotting: bool = False
|
|
775
|
+
force_redo_spatial_autocorr_plotting: bool = False
|
|
776
|
+
bypass_matrix_corr_calculations: bool = False
|
|
777
|
+
force_redo_matrix_corr_calculations: bool = False
|
|
778
|
+
bypass_matrix_corr_plotting: bool = False
|
|
779
|
+
force_redo_matrix_corr_plotting: bool = False
|
|
780
|
+
|
|
781
|
+
# Pipeline control flow - HMM Analyses
|
|
782
|
+
bypass_hmm_fit: bool = False
|
|
783
|
+
force_redo_hmm_fit: bool = False
|
|
784
|
+
bypass_hmm_apply: bool = False
|
|
785
|
+
force_redo_hmm_apply: bool = False
|
|
786
|
+
|
|
787
|
+
# metadata
|
|
788
|
+
config_source: Optional[str] = None
|
|
789
|
+
|
|
790
|
+
# -------------------------
|
|
791
|
+
# Construction helpers
|
|
792
|
+
# -------------------------
|
|
793
|
+
@classmethod
|
|
794
|
+
def from_var_dict(
|
|
795
|
+
cls,
|
|
796
|
+
var_dict: Optional[Dict[str, Any]],
|
|
797
|
+
date_str: Optional[str] = None,
|
|
798
|
+
config_source: Optional[str] = None,
|
|
799
|
+
defaults_dir: Optional[Union[str, Path]] = None,
|
|
800
|
+
defaults_map: Optional[Dict[str, Dict[str, Any]]] = None,
|
|
801
|
+
merge_with_defaults: bool = True,
|
|
802
|
+
override_with_csv: bool = True,
|
|
803
|
+
allow_csv_extends: bool = True,
|
|
804
|
+
allow_null_override: bool = False,
|
|
805
|
+
) -> Tuple["ExperimentConfig", Dict[str, Any]]:
|
|
806
|
+
"""
|
|
807
|
+
Create ExperimentConfig from a raw var_dict (as produced by LoadExperimentConfig).
|
|
808
|
+
Returns (instance, report) where report contains modality/defaults/merged info.
|
|
809
|
+
|
|
810
|
+
merge_with_defaults: load defaults from defaults_dir or defaults_map.
|
|
811
|
+
override_with_csv: CSV values override defaults; if False defaults take precedence.
|
|
812
|
+
allow_csv_extends: allow the CSV to include 'extends' to pull in extra defaults files.
|
|
813
|
+
allow_null_override: if False, CSV keys with value None will NOT override defaults (keeps defaults).
|
|
814
|
+
"""
|
|
815
|
+
var_dict = var_dict or {}
|
|
816
|
+
|
|
817
|
+
# 1) normalize incoming values
|
|
818
|
+
normalized: Dict[str, Any] = {}
|
|
819
|
+
for k, v in var_dict.items():
|
|
820
|
+
if v is None:
|
|
821
|
+
normalized[k] = None
|
|
822
|
+
continue
|
|
823
|
+
if isinstance(v, str):
|
|
824
|
+
s = v.strip()
|
|
825
|
+
if s == "" or s.lower() == "none":
|
|
826
|
+
normalized[k] = None
|
|
827
|
+
else:
|
|
828
|
+
normalized[k] = _try_json_or_literal(s)
|
|
829
|
+
else:
|
|
830
|
+
normalized[k] = v
|
|
831
|
+
|
|
832
|
+
modality = normalized.get("smf_modality")
|
|
833
|
+
if isinstance(modality, (list, tuple)) and len(modality) > 0:
|
|
834
|
+
modality = modality[0]
|
|
835
|
+
|
|
836
|
+
defaults_loaded = {}
|
|
837
|
+
defaults_source_chain: List[str] = []
|
|
838
|
+
if merge_with_defaults:
|
|
839
|
+
if defaults_map and modality in defaults_map:
|
|
840
|
+
defaults_loaded = dict(defaults_map[modality] or {})
|
|
841
|
+
defaults_source_chain = [f"defaults_map['{modality}']"]
|
|
842
|
+
elif defaults_dir is not None:
|
|
843
|
+
defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(defaults_dir, modality)
|
|
844
|
+
|
|
845
|
+
# If CSV asks to extend defaults, load those and merge
|
|
846
|
+
merged = dict(defaults_loaded or {})
|
|
847
|
+
|
|
848
|
+
if allow_csv_extends:
|
|
849
|
+
extends = normalized.get("extends") or normalized.get("inherits")
|
|
850
|
+
if extends:
|
|
851
|
+
if isinstance(extends, str):
|
|
852
|
+
ext_list = [extends]
|
|
853
|
+
elif isinstance(extends, (list, tuple)):
|
|
854
|
+
ext_list = list(extends)
|
|
855
|
+
else:
|
|
856
|
+
ext_list = []
|
|
857
|
+
for ext in ext_list:
|
|
858
|
+
ext_defaults, ext_sources = (load_defaults_with_inheritance(defaults_dir, ext) if defaults_dir else ({}, []))
|
|
859
|
+
merged = deep_merge(merged, ext_defaults)
|
|
860
|
+
for s in ext_sources:
|
|
861
|
+
if s not in defaults_source_chain:
|
|
862
|
+
defaults_source_chain.append(s)
|
|
863
|
+
|
|
864
|
+
# Now overlay CSV values
|
|
865
|
+
# Prepare csv_effective depending on allow_null_override
|
|
866
|
+
csv_effective = {}
|
|
867
|
+
for k, v in normalized.items():
|
|
868
|
+
if k in ("extends", "inherits"):
|
|
869
|
+
continue
|
|
870
|
+
if v is None and not allow_null_override:
|
|
871
|
+
# skip: keep default
|
|
872
|
+
continue
|
|
873
|
+
csv_effective[k] = v
|
|
874
|
+
|
|
875
|
+
if override_with_csv:
|
|
876
|
+
merged = deep_merge(merged, csv_effective)
|
|
877
|
+
else:
|
|
878
|
+
# defaults take precedence: only set keys missing in merged
|
|
879
|
+
for k, v in csv_effective.items():
|
|
880
|
+
if k not in merged:
|
|
881
|
+
merged[k] = v
|
|
882
|
+
|
|
883
|
+
# experiment_name default
|
|
884
|
+
if merged.get("experiment_name") is None and date_str:
|
|
885
|
+
merged["experiment_name"] = f"{date_str}_SMF_experiment"
|
|
886
|
+
|
|
887
|
+
# Input file types and path handling
|
|
888
|
+
input_data_path = Path(merged['input_data_path'])
|
|
889
|
+
|
|
890
|
+
# Detect the input filetype
|
|
891
|
+
if input_data_path.is_file():
|
|
892
|
+
suffix = input_data_path.suffix.lower()
|
|
893
|
+
suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
|
|
894
|
+
|
|
895
|
+
# recognize multi-suffix cases like .fastq.gz or .fq.gz
|
|
896
|
+
if any(s in ['.pod5', '.p5'] for s in suffixes):
|
|
897
|
+
input_type = "pod5"
|
|
898
|
+
input_files = [Path(input_data_path)]
|
|
899
|
+
elif any(s in ['.fast5', '.f5'] for s in suffixes):
|
|
900
|
+
input_type = "fast5"
|
|
901
|
+
input_files = [Path(input_data_path)]
|
|
902
|
+
elif any(s in ['.fastq', '.fq'] for s in suffixes):
|
|
903
|
+
input_type = "fastq"
|
|
904
|
+
input_files = [Path(input_data_path)]
|
|
905
|
+
elif any(s in ['.bam'] for s in suffixes):
|
|
906
|
+
input_type = "bam"
|
|
907
|
+
input_files = [Path(input_data_path)]
|
|
908
|
+
elif any(s in ['.h5ad', ".h5"] for s in suffixes):
|
|
909
|
+
input_type = "h5ad"
|
|
910
|
+
input_files = [Path(input_data_path)]
|
|
911
|
+
else:
|
|
912
|
+
print("Error detecting input file type")
|
|
913
|
+
|
|
914
|
+
elif input_data_path.is_dir():
|
|
915
|
+
found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
|
|
916
|
+
|
|
917
|
+
if found["input_is_pod5"]:
|
|
918
|
+
input_type = "pod5"
|
|
919
|
+
input_files = found["pod5_paths"]
|
|
920
|
+
elif found["input_is_fast5"]:
|
|
921
|
+
input_type = "fast5"
|
|
922
|
+
input_files = found["fast5_paths"]
|
|
923
|
+
elif found["input_is_fastq"]:
|
|
924
|
+
input_type = "fastq"
|
|
925
|
+
input_files = found["fastq_paths"]
|
|
926
|
+
elif found["input_is_bam"]:
|
|
927
|
+
input_type = "bam"
|
|
928
|
+
input_files = found["bam_paths"]
|
|
929
|
+
elif found["input_is_h5ad"]:
|
|
930
|
+
input_type = "h5ad"
|
|
931
|
+
input_files = found["h5ad_paths"]
|
|
932
|
+
|
|
933
|
+
print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
|
|
934
|
+
|
|
935
|
+
# summary file output path
|
|
936
|
+
output_dir = Path(merged['output_directory'])
|
|
937
|
+
summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
|
|
938
|
+
summary_file = output_dir / summary_file_basename
|
|
939
|
+
|
|
940
|
+
# Demultiplexing output path
|
|
941
|
+
split_dir = merged.get("split_dir", "demultiplexed_BAMs")
|
|
942
|
+
split_path = output_dir / split_dir
|
|
943
|
+
|
|
944
|
+
# final normalization
|
|
945
|
+
if "strands" in merged:
|
|
946
|
+
merged["strands"] = _parse_list(merged["strands"])
|
|
947
|
+
if "conversions" in merged:
|
|
948
|
+
merged["conversions"] = _parse_list(merged["conversions"])
|
|
949
|
+
if "mod_target_bases" in merged:
|
|
950
|
+
merged["mod_target_bases"] = _parse_list(merged["mod_target_bases"])
|
|
951
|
+
if "conversion_types" in merged:
|
|
952
|
+
merged["conversion_types"] = _parse_list(merged["conversion_types"])
|
|
953
|
+
|
|
954
|
+
merged["filter_threshold"] = float(_parse_numeric(merged.get("filter_threshold", 0.8), 0.8))
|
|
955
|
+
merged["m6A_threshold"] = float(_parse_numeric(merged.get("m6A_threshold", 0.7), 0.7))
|
|
956
|
+
merged["m5C_threshold"] = float(_parse_numeric(merged.get("m5C_threshold", 0.7), 0.7))
|
|
957
|
+
merged["hm5C_threshold"] = float(_parse_numeric(merged.get("hm5C_threshold", 0.7), 0.7))
|
|
958
|
+
merged["thresholds"] = [
|
|
959
|
+
merged["filter_threshold"],
|
|
960
|
+
merged["m6A_threshold"],
|
|
961
|
+
merged["m5C_threshold"],
|
|
962
|
+
merged["hm5C_threshold"],
|
|
963
|
+
]
|
|
964
|
+
|
|
965
|
+
for bkey in ("barcode_both_ends", "trim", "input_already_demuxed", "make_bigwigs", "skip_unclassified", "delete_batch_hdfs"):
|
|
966
|
+
if bkey in merged:
|
|
967
|
+
merged[bkey] = _parse_bool(merged[bkey])
|
|
968
|
+
|
|
969
|
+
if "batch_size" in merged:
|
|
970
|
+
merged["batch_size"] = int(_parse_numeric(merged.get("batch_size", 4), 4))
|
|
971
|
+
if "threads" in merged:
|
|
972
|
+
tval = _parse_numeric(merged.get("threads", None), None)
|
|
973
|
+
merged["threads"] = None if tval is None else int(tval)
|
|
974
|
+
|
|
975
|
+
if "aligner_args" in merged and merged.get("aligner_args") is None:
|
|
976
|
+
merged.pop("aligner_args", None)
|
|
977
|
+
|
|
978
|
+
# --- Resolve aligner_args into concrete list for the chosen aligner ---
|
|
979
|
+
merged['aligner_args'] = resolve_aligner_args(merged)
|
|
980
|
+
|
|
981
|
+
if "mod_list" in merged:
|
|
982
|
+
merged["mod_list"] = _parse_list(merged.get("mod_list"))
|
|
983
|
+
|
|
984
|
+
# HMM feature set handling
|
|
985
|
+
if "hmm_feature_sets" in merged:
|
|
986
|
+
merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
|
|
987
|
+
else:
|
|
988
|
+
# allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
|
|
989
|
+
maybe_fs = {}
|
|
990
|
+
if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
|
|
991
|
+
maybe_fs["footprint"] = {"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")), "state": merged.get("hmm_footprint_state", "Non-Modified")}
|
|
992
|
+
if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
|
|
993
|
+
maybe_fs["accessible"] = {"features": merged.get("hmm_accessible_ranges", merged.get("accessible_ranges")), "state": merged.get("hmm_accessible_state", "Modified")}
|
|
994
|
+
if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
|
|
995
|
+
maybe_fs["cpg"] = {"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")), "state": merged.get("hmm_cpg_state", "Modified")}
|
|
996
|
+
if maybe_fs:
|
|
997
|
+
merged.setdefault("hmm_feature_sets", {})
|
|
998
|
+
for k, v in maybe_fs.items():
|
|
999
|
+
merged["hmm_feature_sets"].setdefault(k, v)
|
|
1000
|
+
|
|
1001
|
+
# final normalization will be done below
|
|
1002
|
+
# (do not set local hmm_feature_sets here — do it once below)
|
|
1003
|
+
pass
|
|
1004
|
+
|
|
1005
|
+
# Final normalization of hmm_feature_sets and canonical local variables
|
|
1006
|
+
merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged.get("hmm_feature_sets", {}))
|
|
1007
|
+
hmm_feature_sets = merged.get("hmm_feature_sets", {})
|
|
1008
|
+
hmm_annotation_threshold = merged.get("hmm_annotation_threshold", 0.5)
|
|
1009
|
+
hmm_batch_size = int(merged.get("hmm_batch_size", 1024))
|
|
1010
|
+
hmm_use_viterbi = bool(merged.get("hmm_use_viterbi", False))
|
|
1011
|
+
hmm_device = merged.get("hmm_device", None)
|
|
1012
|
+
hmm_methbases = _parse_list(merged.get("hmm_methbases", None))
|
|
1013
|
+
if not hmm_methbases: # None or []
|
|
1014
|
+
hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
|
|
1015
|
+
if not hmm_methbases:
|
|
1016
|
+
hmm_methbases = ['C']
|
|
1017
|
+
hmm_methbases = list(hmm_methbases)
|
|
1018
|
+
hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
# instantiate dataclass
instance = cls(
    smf_modality=merged.get("smf_modality"),
    input_data_path=input_data_path,
    recursive_input_search=merged.get("recursive_input_search"),
    input_type=input_type,
    input_files=input_files,
    output_directory=output_dir,
    summary_file=summary_file,
    fasta=merged.get("fasta"),
    sequencer=merged.get("sequencer"),
    model_dir=merged.get("model_dir"),
    barcode_kit=merged.get("barcode_kit"),
    fastq_barcode_map=merged.get("fastq_barcode_map"),
    fastq_auto_pairing=merged.get("fastq_auto_pairing"),
    bam_suffix=merged.get("bam_suffix", ".bam"),
    split_dir=split_dir,
    split_path=split_path,
    strands=merged.get("strands", ["bottom", "top"]),
    conversions=merged.get("conversions", ["unconverted"]),
    fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
    mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
    experiment_name=merged.get("experiment_name"),
    model=merged.get("model", "hac"),
    barcode_both_ends=merged.get("barcode_both_ends", False),
    trim=merged.get("trim", False),
    input_already_demuxed=merged.get("input_already_demuxed", False),
    threads=merged.get("threads"),
    sample_sheet_path=merged.get("sample_sheet_path"),
    sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
    delete_intermediate_bams=merged.get("delete_intermediate_bams", True),
    delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
    aligner=merged.get("aligner", "minimap2"),
    aligner_args=merged.get("aligner_args", None),
    device=merged.get("device", "auto"),
    make_bigwigs=merged.get("make_bigwigs", False),
    make_beds=merged.get("make_beds", False),
    delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
    mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
    enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
    conversion_types=merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
    filter_threshold=merged.get("filter_threshold", 0.8),
    m6A_threshold=merged.get("m6A_threshold", 0.7),
    m5C_threshold=merged.get("m5C_threshold", 0.7),
    hm5C_threshold=merged.get("hm5C_threshold", 0.7),
    thresholds=merged.get("thresholds", []),
    mod_list=merged.get("mod_list", ["5mC_5hmC", "6mA"]),
    batch_size=merged.get("batch_size", 4),
    skip_unclassified=merged.get("skip_unclassified", True),
    delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
    reference_column=merged.get("reference_column", "Reference_strand"),
    sample_column=merged.get("sample_column", "Barcode"),
    sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
    fit_position_methylation_thresholds=merged.get("fit_position_methylation_thresholds", False),
    binarize_on_fixed_methlyation_threshold=merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
    positive_control_sample_methylation_fitting=merged.get("positive_control_sample_methylation_fitting", None),
    negative_control_sample_methylation_fitting=merged.get("negative_control_sample_methylation_fitting", None),
    infer_on_percentile_sample_methylation_fitting=merged.get("infer_on_percentile_sample_methylation_fitting", 10),
    inference_variable_sample_methylation_fitting=merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
    fit_j_threshold=merged.get("fit_j_threshold", 0.5),
    output_binary_layer_name=merged.get("output_binary_layer_name", "binarized_methylation"),
    layer_for_clustermap_plotting=merged.get("layer_for_clustermap_plotting", "nan0_0minus1"),
    layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
    umap_layers_to_plot=merged.get("umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]),
    rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
    rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
    autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
    autocorr_max_lag=merged.get("autocorr_max_lag", 800),
    autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "any_C"]),
    hmm_n_states=merged.get("hmm_n_states", 2),
    hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
    hmm_init_transition_probs=merged.get("hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]),
    hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
    hmm_eps=merged.get("hmm_eps", 1e-8),
    hmm_dtype=merged.get("hmm_dtype", "float64"),
    hmm_feature_sets=hmm_feature_sets,
    hmm_annotation_threshold=hmm_annotation_threshold,
    hmm_batch_size=hmm_batch_size,
    hmm_use_viterbi=hmm_use_viterbi,
    hmm_methbases=hmm_methbases,
    hmm_device=hmm_device,
    hmm_merge_layer_features=hmm_merge_layer_features,
    footprints=merged.get("footprints", None),
    accessible_patches=merged.get("accessible_patches", None),
    cpg=merged.get("cpg", None),
    read_coord_filter=merged.get("read_coord_filter", [None, None]),
    read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
    read_len_to_ref_ratio_filter_thresholds=merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
    read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
    read_mapping_quality_filter_thresholds=merged.get("read_mapping_quality_filter_thresholds", [None, None]),
    read_mod_filtering_gpc_thresholds=merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
    read_mod_filtering_cpg_thresholds=merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
    read_mod_filtering_any_c_thresholds=merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
    read_mod_filtering_a_thresholds=merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
    read_mod_filtering_use_other_c_as_background=merged.get("read_mod_filtering_use_other_c_as_background", True),
    min_valid_fraction_positions_in_read_vs_ref=merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
    duplicate_detection_site_types=merged.get("duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]),
    duplicate_detection_distance_threshold=merged.get("duplicate_detection_distance_threshold", 0.07),
    duplicate_detection_keep_best_metric=merged.get("duplicate_detection_keep_best_metric", "read_quality"),
    duplicate_detection_window_size_for_hamming_neighbors=merged.get("duplicate_detection_window_size_for_hamming_neighbors", 50),
    duplicate_detection_min_overlapping_positions=merged.get("duplicate_detection_min_overlapping_positions", 20),
    duplicate_detection_do_hierarchical=merged.get("duplicate_detection_do_hierarchical", True),
    duplicate_detection_hierarchical_linkage=merged.get("duplicate_detection_hierarchical_linkage", "average"),
    duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
    position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
    correlation_matrix_types=merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
    correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
    correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
    hamming_vs_metric_keys=merged.get("hamming_vs_metric_keys", ["Fraction_any_C_site_modified"]),
    force_redo_load_adata=merged.get("force_redo_load_adata", False),
    force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
    force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
    bypass_add_read_length_and_mapping_qc=merged.get("bypass_add_read_length_and_mapping_qc", False),
    force_redo_add_read_length_and_mapping_qc=merged.get("force_redo_add_read_length_and_mapping_qc", False),
    bypass_clean_nan=merged.get("bypass_clean_nan", False),
    force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
    bypass_append_base_context=merged.get("bypass_append_base_context", False),
    force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
    invert_adata=merged.get("invert_adata", False),
    bypass_append_binary_layer_by_base_context=merged.get("bypass_append_binary_layer_by_base_context", False),
    force_redo_append_binary_layer_by_base_context=merged.get("force_redo_append_binary_layer_by_base_context", False),
    bypass_calculate_read_modification_stats=merged.get("bypass_calculate_read_modification_stats", False),
    force_redo_calculate_read_modification_stats=merged.get("force_redo_calculate_read_modification_stats", False),
    bypass_filter_reads_on_modification_thresholds=merged.get("bypass_filter_reads_on_modification_thresholds", False),
    force_redo_filter_reads_on_modification_thresholds=merged.get("force_redo_filter_reads_on_modification_thresholds", False),
    bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
    force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
    bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
    force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
    force_redo_basic_analyses=merged.get("force_redo_basic_analyses", False),
    bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
    force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
    bypass_basic_umap=merged.get("bypass_basic_umap", False),
    force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
    bypass_spatial_autocorr_calculations=merged.get("bypass_spatial_autocorr_calculations", False),
    force_redo_spatial_autocorr_calculations=merged.get("force_redo_spatial_autocorr_calculations", False),
    bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
    force_redo_spatial_autocorr_plotting=merged.get("force_redo_spatial_autocorr_plotting", False),
    bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
    force_redo_matrix_corr_calculations=merged.get("force_redo_matrix_corr_calculations", False),
    bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
    force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
    bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
    force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
    bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
    force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),

    config_source=config_source or "<var_dict>",
)
report = {
    "modality": modality,
    "defaults_source_chain": defaults_source_chain,
    "defaults_loaded": defaults_loaded,
    "csv_normalized": normalized,
    "merged": merged,
}
return instance, report
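# Usage sketch (editor's illustration; the var_dict keys and paths shown are
# hypothetical, and from_var_dict is the classmethod defined above):
#
#     cfg, report = ExperimentConfig.from_var_dict(
#         {"smf_modality": "conversion", "input_data_path": "reads.bam",
#          "output_directory": "out/", "fasta": "ref.fa"},
#     )
#     print(report["defaults_source_chain"])  # which layered defaults were applied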
# convenience: load from CSV via LoadExperimentConfig
@classmethod
def from_csv(
    cls,
    csv_input: Union[str, Path, IO, pd.DataFrame],
    date_str: Optional[str] = None,
    config_source: Optional[str] = None,
    defaults_dir: Optional[Union[str, Path]] = None,
    defaults_map: Optional[Dict[str, Dict[str, Any]]] = None,
    **kwargs,
) -> Tuple["ExperimentConfig", Dict[str, Any]]:
    """
    Load the CSV with LoadExperimentConfig (or accept a DataFrame directly) and build an
    ExperimentConfig. Any additional kwargs are passed through to from_var_dict().
    """
    if isinstance(csv_input, pd.DataFrame):
        # pd.DataFrame(...) copies the frame before handing it to the loader
        loader = LoadExperimentConfig(pd.DataFrame(csv_input))
    else:
        loader = LoadExperimentConfig(csv_input)
    var_dict = loader.var_dict
    return cls.from_var_dict(
        var_dict,
        date_str=date_str,
        config_source=config_source,
        defaults_dir=defaults_dir,
        defaults_map=defaults_map,
        **kwargs,
    )
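# Usage sketch (editor's illustration; the CSV file name is hypothetical):
#
#     cfg, report = ExperimentConfig.from_csv("experiment_sheet.csv")
#     cfg.validate(require_paths=False, raise_on_error=False)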
# -------------------------
# validation & serialization
# -------------------------
def _validate_hmm_features_structure(hfs: dict) -> List[str]:
    errs = []
    if not isinstance(hfs, dict):
        errs.append("hmm_feature_sets must be a mapping if provided.")
        return errs
    for g, info in hfs.items():
        if not isinstance(info, dict):
            errs.append(f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'.")
            continue
        feats = info.get("features")
        if not isinstance(feats, dict) or len(feats) == 0:
            errs.append(f"hmm_feature_sets['{g}'] must include a non-empty 'features' mapping.")
            continue
        for fname, rng in feats.items():
            try:
                lo, hi = float(rng[0]), float(rng[1])
                if lo < 0 or hi <= lo:
                    errs.append(f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}.")
            except Exception:
                errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
    return errs
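# Editor's sketch of the contract (hypothetical inputs):
#
#     _validate_hmm_features_structure(
#         {"footprint": {"features": {"nuc": [80, 200]}, "state": "Non-Modified"}})
#     # -> []  (valid)
#     _validate_hmm_features_structure({"footprint": {"features": {}}})
#     # -> ["hmm_feature_sets['footprint'] must include a non-empty 'features' mapping."]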
def validate(self, require_paths: bool = True, raise_on_error: bool = True) -> List[str]:
    """
    Validate the config. If require_paths is True, check that input_data_path and fasta
    exist on disk, and attempt to create output_directory if it is missing.
    Returns a list of error messages (empty if none); raises ValueError if
    raise_on_error is True and any errors were found.
    """
    errors: List[str] = []
    if not self.input_data_path:
        errors.append("input_data_path is required but missing.")
    if not self.output_directory:
        errors.append("output_directory is required but missing.")
    if not self.fasta:
        errors.append("fasta (reference FASTA) is required but missing.")

    if require_paths:
        if self.input_data_path and not Path(self.input_data_path).exists():
            errors.append(f"input_data_path does not exist: {self.input_data_path}")
        if self.fasta and not Path(self.fasta).exists():
            errors.append(f"fasta does not exist: {self.fasta}")
        outp = Path(self.output_directory) if self.output_directory else None
        if outp and not outp.exists():
            try:
                outp.mkdir(parents=True, exist_ok=True)
            except Exception as e:
                errors.append(f"Could not create output_directory {self.output_directory}: {e}")

    if not (0.0 <= float(self.mapping_threshold) <= 1.0):
        errors.append("mapping_threshold must be in [0,1].")
    for t in (self.filter_threshold, self.m6A_threshold, self.m5C_threshold, self.hm5C_threshold):
        if not (0.0 <= float(t) <= 1.0):
            errors.append(f"threshold value {t} must be in [0,1].")

    # Collect structural errors for hmm_feature_sets before deciding whether to raise,
    # so they are included in the exception and can trigger it.
    errors.extend(_validate_hmm_features_structure(self.hmm_feature_sets))

    if raise_on_error and errors:
        raise ValueError("ExperimentConfig validation failed:\n  " + "\n  ".join(errors))

    return errors
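# Usage sketch (editor's illustration):
#
#     problems = cfg.validate(require_paths=False, raise_on_error=False)
#     for msg in problems:
#         print("config issue:", msg)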
def to_dict(self) -> Dict[str, Any]:
    return asdict(self)

def to_yaml(self, path: Optional[Union[str, Path]] = None) -> str:
    """
    Dump the config to YAML. Returns the YAML string if path is None; otherwise writes
    to path and returns the path. If pyyaml is not installed, falls back to JSON.
    """
    data = self.to_dict()
    if path is None:
        if yaml is None:
            return json.dumps(data, indent=2)
        return yaml.safe_dump(data, sort_keys=False)
    p = Path(path)
    if yaml is None:
        p.write_text(json.dumps(data, indent=2), encoding="utf8")
    else:
        p.write_text(yaml.safe_dump(data, sort_keys=False), encoding="utf8")
    return str(p)

def save(self, path: Union[str, Path]) -> str:
    return self.to_yaml(path)
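# Round-trip sketch (editor's illustration; the output path is hypothetical):
#
#     cfg.save("out/experiment_config.yaml")  # YAML if pyyaml is available, else JSON text
#     as_dict = cfg.to_dict()                 # plain dict via dataclasses.asdict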
def __repr__(self) -> str:
    return f"<ExperimentConfig modality={self.smf_modality} experiment_name={self.experiment_name} source={self.config_source}>"