smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
# experiment_config.py
|
|
2
2
|
from __future__ import annotations
|
|
3
|
+
|
|
3
4
|
import ast
|
|
4
5
|
import json
|
|
5
6
|
import warnings
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
+
from dataclasses import asdict, dataclass, field
|
|
7
8
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
9
|
+
from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
10
|
+
|
|
11
|
+
from smftools.constants import (
|
|
12
|
+
BAM_SUFFIX,
|
|
13
|
+
BARCODE_BOTH_ENDS,
|
|
14
|
+
CONVERSIONS,
|
|
15
|
+
MOD_LIST,
|
|
16
|
+
MOD_MAP,
|
|
17
|
+
REF_COL,
|
|
18
|
+
SAMPLE_COL,
|
|
19
|
+
SPLIT_DIR,
|
|
20
|
+
STRANDS,
|
|
21
|
+
TRIM,
|
|
22
|
+
)
|
|
23
|
+
|
|
9
24
|
from .discover_input_files import discover_input_files
|
|
10
25
|
|
|
11
26
|
# Optional dependency for YAML handling
|
|
@@ -14,8 +29,8 @@ try:
|
|
|
14
29
|
except Exception:
|
|
15
30
|
yaml = None
|
|
16
31
|
|
|
17
|
-
import pandas as pd
|
|
18
32
|
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
19
34
|
|
|
20
35
|
|
|
21
36
|
# -------------------------
|
|
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
|
|
|
81
96
|
except Exception:
|
|
82
97
|
return fallback
|
|
83
98
|
|
|
99
|
+
|
|
84
100
|
def _try_json_or_literal(s: Any) -> Any:
|
|
85
101
|
"""Try parse JSON or python literal; otherwise return original string."""
|
|
86
102
|
if s is None:
|
|
@@ -123,8 +139,8 @@ def resolve_aligner_args(
|
|
|
123
139
|
"""
|
|
124
140
|
# builtin defaults (aligner -> args)
|
|
125
141
|
builtin_defaults = {
|
|
126
|
-
"minimap2": [
|
|
127
|
-
"dorado": [
|
|
142
|
+
"minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
|
|
143
|
+
"dorado": ["--mm2-opts", "-N", "5"],
|
|
128
144
|
}
|
|
129
145
|
if default_by_aligner is None:
|
|
130
146
|
default_by_aligner = builtin_defaults
|
|
@@ -214,7 +230,7 @@ def resolve_aligner_args(
|
|
|
214
230
|
return list(default_by_aligner.get(key_align, []))
|
|
215
231
|
|
|
216
232
|
|
|
217
|
-
# HMM default params and
|
|
233
|
+
# HMM default params and helper functions
|
|
218
234
|
def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
219
235
|
"""
|
|
220
236
|
Normalize user-provided `hmm_feature_sets` into canonical structure:
|
|
@@ -276,6 +292,59 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
|
276
292
|
return canonical
|
|
277
293
|
|
|
278
294
|
|
|
295
|
+
def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
    """
    Normalize user-provided `hmm_peak_feature_configs` into a canonical mapping.

    The result has one entry per layer, each carrying exactly these keys:

        {
            layer_name: {
                "min_distance": int,
                "peak_width": int,
                "peak_prominence": float,
                "peak_threshold": float,
                "rolling_window": int,
            },
            ...
        }

    Parameters
    ----------
    raw
        A dict, a string holding a JSON / Python-literal dict (parsed with
        `_try_json_or_literal`), or None.

    Returns
    -------
    Dict[str, dict]
        Layer name (coerced to ``str``) -> fully populated, type-cast config.
        ``{}`` when `raw` is None or does not resolve to a dict.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int()`` / ``float()`` when a supplied value cannot
        be converted.

    Notes
    -----
    * A per-layer value of None means "use all defaults".
    * A per-layer value that is not a dict is shorthand for ``peak_width``
      (e.g. ``{"layer": 300}``).
    * A field explicitly set to None (e.g. YAML ``peak_width: null``) falls
      back to its default instead of failing in ``int(None)``.
    * Keys other than the five listed above are ignored.
    """
    if raw is None:
        return {}

    parsed = _try_json_or_literal(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, dict):
        return {}

    # (field name, caster, default). Order fixes both the key order of each
    # output entry and the order in which conversion errors would surface.
    schema = (
        ("min_distance", int, 200),
        ("peak_width", int, 200),
        ("peak_prominence", float, 0.2),
        ("peak_threshold", float, 0.8),
        ("rolling_window", int, 1),
    )

    out: Dict[str, dict] = {}
    for layer, conf in parsed.items():
        if conf is None:
            conf = {}
        elif not isinstance(conf, dict):
            # allow shorthand like 300 -> interpreted as peak_width
            conf = {"peak_width": conf}

        entry = {}
        for key, cast, default in schema:
            value = conf.get(key)
            # None means "not provided": use the default rather than cast None.
            entry[key] = cast(default if value is None else value)
        out[str(layer)] = entry
    return out
|
|
346
|
+
|
|
347
|
+
|
|
279
348
|
# -------------------------
|
|
280
349
|
# LoadExperimentConfig
|
|
281
350
|
# -------------------------
|
|
@@ -313,12 +382,12 @@ class LoadExperimentConfig:
|
|
|
313
382
|
df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
|
|
314
383
|
# normalize column names
|
|
315
384
|
df.columns = [c.strip() for c in df.columns]
|
|
316
|
-
if
|
|
385
|
+
if "variable" not in df.columns:
|
|
317
386
|
raise ValueError("Config CSV must contain a 'variable' column.")
|
|
318
|
-
if
|
|
319
|
-
df[
|
|
320
|
-
if
|
|
321
|
-
df[
|
|
387
|
+
if "value" not in df.columns:
|
|
388
|
+
df["value"] = ""
|
|
389
|
+
if "type" not in df.columns:
|
|
390
|
+
df["type"] = ""
|
|
322
391
|
return df
|
|
323
392
|
|
|
324
393
|
@staticmethod
|
|
@@ -337,9 +406,9 @@ class LoadExperimentConfig:
|
|
|
337
406
|
|
|
338
407
|
def parse_bool(s: str):
|
|
339
408
|
s2 = s.strip().lower()
|
|
340
|
-
if s2 in (
|
|
409
|
+
if s2 in ("1", "true", "t", "yes", "y", "on"):
|
|
341
410
|
return True
|
|
342
|
-
if s2 in (
|
|
411
|
+
if s2 in ("0", "false", "f", "no", "n", "off"):
|
|
343
412
|
return False
|
|
344
413
|
raise ValueError(f"Cannot parse boolean from '{s}'")
|
|
345
414
|
|
|
@@ -359,18 +428,18 @@ class LoadExperimentConfig:
|
|
|
359
428
|
except Exception:
|
|
360
429
|
pass
|
|
361
430
|
# fallback split
|
|
362
|
-
parts = [p.strip() for p in s.strip("()[] ").split(
|
|
431
|
+
parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
|
|
363
432
|
return parts
|
|
364
433
|
|
|
365
|
-
if hint in (
|
|
434
|
+
if hint in ("int", "integer"):
|
|
366
435
|
return int(v)
|
|
367
|
-
if hint in (
|
|
436
|
+
if hint in ("float", "double"):
|
|
368
437
|
return float(v)
|
|
369
|
-
if hint in (
|
|
438
|
+
if hint in ("bool", "boolean"):
|
|
370
439
|
return parse_bool(v)
|
|
371
|
-
if hint in (
|
|
440
|
+
if hint in ("list", "array"):
|
|
372
441
|
return parse_list_like(v)
|
|
373
|
-
if hint in (
|
|
442
|
+
if hint in ("string", "str"):
|
|
374
443
|
return v
|
|
375
444
|
|
|
376
445
|
# infer
|
|
@@ -396,27 +465,31 @@ class LoadExperimentConfig:
|
|
|
396
465
|
return lit
|
|
397
466
|
except Exception:
|
|
398
467
|
pass
|
|
399
|
-
if (
|
|
400
|
-
return [p.strip() for p in v.split(
|
|
468
|
+
if ("," in v) and (not any(ch in v for ch in "{}[]()")):
|
|
469
|
+
return [p.strip() for p in v.split(",") if p.strip() != ""]
|
|
401
470
|
return v
|
|
402
471
|
|
|
403
472
|
def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
|
|
404
473
|
parsed: Dict[str, Any] = {}
|
|
405
474
|
for idx, row in df.iterrows():
|
|
406
|
-
name = str(row[
|
|
475
|
+
name = str(row["variable"]).strip()
|
|
407
476
|
if name == "":
|
|
408
477
|
continue
|
|
409
|
-
raw_val = row.get(
|
|
410
|
-
raw_type = row.get(
|
|
478
|
+
raw_val = row.get("value", "")
|
|
479
|
+
raw_type = row.get("type", "")
|
|
411
480
|
if pd.isna(raw_val) or str(raw_val).strip() == "":
|
|
412
481
|
raw_val = None
|
|
413
482
|
try:
|
|
414
483
|
parsed_val = self._parse_value_as_type(raw_val, raw_type)
|
|
415
484
|
except Exception as e:
|
|
416
|
-
warnings.warn(
|
|
485
|
+
warnings.warn(
|
|
486
|
+
f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
|
|
487
|
+
)
|
|
417
488
|
parsed_val = None if raw_val is None else raw_val
|
|
418
489
|
if name in parsed:
|
|
419
|
-
warnings.warn(
|
|
490
|
+
warnings.warn(
|
|
491
|
+
f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
|
|
492
|
+
)
|
|
420
493
|
parsed[name] = parsed_val
|
|
421
494
|
return parsed
|
|
422
495
|
|
|
@@ -424,7 +497,7 @@ class LoadExperimentConfig:
|
|
|
424
497
|
"""Return parsed config as a pandas DataFrame (variable, value)."""
|
|
425
498
|
rows = []
|
|
426
499
|
for k, v in self.var_dict.items():
|
|
427
|
-
rows.append({
|
|
500
|
+
rows.append({"variable": k, "value": v})
|
|
428
501
|
return pd.DataFrame(rows)
|
|
429
502
|
|
|
430
503
|
|
|
@@ -592,17 +665,17 @@ class ExperimentConfig:
|
|
|
592
665
|
input_data_path: Optional[str] = None
|
|
593
666
|
output_directory: Optional[str] = None
|
|
594
667
|
fasta: Optional[str] = None
|
|
595
|
-
bam_suffix: str =
|
|
668
|
+
bam_suffix: str = BAM_SUFFIX
|
|
596
669
|
recursive_input_search: bool = True
|
|
597
670
|
input_type: Optional[str] = None
|
|
598
671
|
input_files: Optional[List[Path]] = None
|
|
599
|
-
split_dir: str =
|
|
672
|
+
split_dir: str = SPLIT_DIR
|
|
600
673
|
split_path: Optional[str] = None
|
|
601
|
-
strands: List[str] = field(default_factory=lambda:
|
|
602
|
-
conversions: List[str] = field(default_factory=lambda:
|
|
674
|
+
strands: List[str] = field(default_factory=lambda: STRANDS)
|
|
675
|
+
conversions: List[str] = field(default_factory=lambda: CONVERSIONS)
|
|
603
676
|
fasta_regions_of_interest: Optional[str] = None
|
|
604
677
|
sample_sheet_path: Optional[str] = None
|
|
605
|
-
sample_sheet_mapping_column: Optional[str] =
|
|
678
|
+
sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
|
|
606
679
|
experiment_name: Optional[str] = None
|
|
607
680
|
input_already_demuxed: bool = False
|
|
608
681
|
summary_file: Optional[Path] = None
|
|
@@ -612,7 +685,7 @@ class ExperimentConfig:
|
|
|
612
685
|
fastq_auto_pairing: bool = True
|
|
613
686
|
|
|
614
687
|
# Remove intermediate file options
|
|
615
|
-
delete_intermediate_bams: bool =
|
|
688
|
+
delete_intermediate_bams: bool = False
|
|
616
689
|
delete_intermediate_tsvs: bool = True
|
|
617
690
|
|
|
618
691
|
# Conversion/Deamination file handling
|
|
@@ -638,8 +711,8 @@ class ExperimentConfig:
|
|
|
638
711
|
model_dir: Optional[str] = None
|
|
639
712
|
barcode_kit: Optional[str] = None
|
|
640
713
|
model: str = "hac"
|
|
641
|
-
barcode_both_ends: bool =
|
|
642
|
-
trim: bool =
|
|
714
|
+
barcode_both_ends: bool = BARCODE_BOTH_ENDS
|
|
715
|
+
trim: bool = TRIM
|
|
643
716
|
# General basecalling params
|
|
644
717
|
filter_threshold: float = 0.8
|
|
645
718
|
# Modified basecalling specific params
|
|
@@ -647,53 +720,102 @@ class ExperimentConfig:
|
|
|
647
720
|
m5C_threshold: float = 0.7
|
|
648
721
|
hm5C_threshold: float = 0.7
|
|
649
722
|
thresholds: List[float] = field(default_factory=list)
|
|
650
|
-
mod_list: List[str] = field(
|
|
723
|
+
mod_list: List[str] = field(
|
|
724
|
+
default_factory=lambda: list(MOD_LIST)
|
|
725
|
+
) # Dorado modified basecalling codes
|
|
726
|
+
mod_map: Dict[str, str] = field(
|
|
727
|
+
default_factory=lambda: dict(MOD_MAP)
|
|
728
|
+
) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
|
|
651
729
|
|
|
652
730
|
# Alignment params
|
|
653
|
-
mapping_threshold: float = 0.01
|
|
654
|
-
|
|
731
|
+
mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
|
|
732
|
+
align_from_bam: bool = (
|
|
733
|
+
False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
|
|
734
|
+
)
|
|
735
|
+
aligner: str = "dorado"
|
|
655
736
|
aligner_args: Optional[List[str]] = None
|
|
656
737
|
make_bigwigs: bool = False
|
|
657
738
|
make_beds: bool = False
|
|
658
739
|
|
|
659
740
|
# Anndata structure
|
|
660
|
-
reference_column: Optional[str] =
|
|
661
|
-
sample_column: Optional[str] =
|
|
741
|
+
reference_column: Optional[str] = REF_COL
|
|
742
|
+
sample_column: Optional[str] = SAMPLE_COL
|
|
662
743
|
|
|
663
744
|
# General Plotting
|
|
664
|
-
sample_name_col_for_plotting: Optional[str] =
|
|
745
|
+
sample_name_col_for_plotting: Optional[str] = "Barcode"
|
|
665
746
|
rows_per_qc_histogram_grid: int = 12
|
|
666
747
|
|
|
667
748
|
# Preprocessing - Read length and quality filter params
|
|
668
749
|
read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
669
|
-
read_len_filter_thresholds: Optional[Sequence[float]] = field(
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
750
|
+
read_len_filter_thresholds: Optional[Sequence[float]] = field(
|
|
751
|
+
default_factory=lambda: [100, None]
|
|
752
|
+
)
|
|
753
|
+
read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
|
|
754
|
+
default_factory=lambda: [0.4, 1.5]
|
|
755
|
+
)
|
|
756
|
+
read_quality_filter_thresholds: Optional[Sequence[float]] = field(
|
|
757
|
+
default_factory=lambda: [15, None]
|
|
758
|
+
)
|
|
759
|
+
read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
|
|
760
|
+
default_factory=lambda: [None, None]
|
|
761
|
+
)
|
|
762
|
+
|
|
763
|
+
# Preprocessing - Optional reindexing params
|
|
764
|
+
reindexing_offsets: Dict[str, int] = field(default_factory=dict)
|
|
765
|
+
reindexed_var_suffix: Optional[str] = "reindexed"
|
|
673
766
|
|
|
674
767
|
# Preprocessing - Direct mod detection binarization params
|
|
675
|
-
fit_position_methylation_thresholds: Optional[bool] =
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
768
|
+
fit_position_methylation_thresholds: Optional[bool] = (
|
|
769
|
+
False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
|
|
770
|
+
)
|
|
771
|
+
binarize_on_fixed_methlyation_threshold: Optional[float] = (
|
|
772
|
+
0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
773
|
+
)
|
|
774
|
+
positive_control_sample_methylation_fitting: Optional[str] = (
|
|
775
|
+
None # A positive control Sample_name to use for fully modified template data
|
|
776
|
+
)
|
|
777
|
+
negative_control_sample_methylation_fitting: Optional[str] = (
|
|
778
|
+
None # A negative control Sample_name to use for fully unmodified template data
|
|
779
|
+
)
|
|
780
|
+
infer_on_percentile_sample_methylation_fitting: Optional[int] = (
|
|
781
|
+
10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
782
|
+
)
|
|
783
|
+
inference_variable_sample_methylation_fitting: Optional[str] = (
|
|
784
|
+
"Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
785
|
+
)
|
|
786
|
+
fit_j_threshold: Optional[float] = (
|
|
787
|
+
0.5 # The J-statistic threshold to use for determining which positions pass QC for mod detection thresholding
|
|
788
|
+
)
|
|
682
789
|
output_binary_layer_name: Optional[str] = "binarized_methylation"
|
|
683
790
|
|
|
684
791
|
# Preprocessing - Read modification filter params
|
|
685
792
|
read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
686
793
|
read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
|
|
687
|
-
|
|
794
|
+
read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
688
795
|
read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
689
796
|
read_mod_filtering_use_other_c_as_background: bool = True
|
|
690
797
|
min_valid_fraction_positions_in_read_vs_ref: float = 0.2
|
|
691
798
|
|
|
799
|
+
# Preprocessing - plotting params
|
|
800
|
+
obs_to_plot_pp_qc: List[str] = field(
|
|
801
|
+
default_factory=lambda: [
|
|
802
|
+
"read_length",
|
|
803
|
+
"mapped_length",
|
|
804
|
+
"read_quality",
|
|
805
|
+
"mapping_quality",
|
|
806
|
+
"mapped_length_to_reference_length_ratio",
|
|
807
|
+
"mapped_length_to_read_length_ratio",
|
|
808
|
+
"Raw_modification_signal",
|
|
809
|
+
]
|
|
810
|
+
)
|
|
811
|
+
|
|
692
812
|
# Preprocessing - Duplicate detection params
|
|
693
|
-
duplicate_detection_site_types: List[str] = field(
|
|
813
|
+
duplicate_detection_site_types: List[str] = field(
|
|
814
|
+
default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
815
|
+
)
|
|
694
816
|
duplicate_detection_distance_threshold: float = 0.07
|
|
695
|
-
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: [
|
|
696
|
-
duplicate_detection_keep_best_metric: str =
|
|
817
|
+
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
|
|
818
|
+
duplicate_detection_keep_best_metric: str = "read_quality"
|
|
697
819
|
duplicate_detection_window_size_for_hamming_neighbors: int = 50
|
|
698
820
|
duplicate_detection_min_overlapping_positions: int = 20
|
|
699
821
|
duplicate_detection_do_hierarchical: bool = True
|
|
@@ -703,28 +825,38 @@ class ExperimentConfig:
|
|
|
703
825
|
# Preprocessing - Position QC
|
|
704
826
|
position_max_nan_threshold: float = 0.1
|
|
705
827
|
|
|
706
|
-
#
|
|
707
|
-
layer_for_clustermap_plotting: Optional[str] =
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
828
|
+
# Spatial Analysis - Clustermap params
|
|
829
|
+
layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
|
|
830
|
+
clustermap_cmap_c: Optional[str] = "coolwarm"
|
|
831
|
+
clustermap_cmap_gpc: Optional[str] = "coolwarm"
|
|
832
|
+
clustermap_cmap_cpg: Optional[str] = "coolwarm"
|
|
833
|
+
clustermap_cmap_a: Optional[str] = "coolwarm"
|
|
834
|
+
spatial_clustermap_sortby: Optional[str] = "gpc"
|
|
835
|
+
|
|
836
|
+
# Spatial Analysis - UMAP/Leiden params
|
|
837
|
+
layer_for_umap_plotting: Optional[str] = "nan_half"
|
|
838
|
+
umap_layers_to_plot: List[str] = field(
|
|
839
|
+
default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
|
|
840
|
+
)
|
|
841
|
+
|
|
842
|
+
# Spatial Analysis - Spatial Autocorrelation params
|
|
843
|
+
autocorr_normalization_method: str = "pearson"
|
|
714
844
|
rows_per_qc_autocorr_grid: int = 12
|
|
715
845
|
autocorr_rolling_window_size: int = 25
|
|
716
846
|
autocorr_max_lag: int = 800
|
|
717
|
-
autocorr_site_types: List[str] = field(default_factory=lambda: [
|
|
847
|
+
autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])
|
|
718
848
|
|
|
719
|
-
#
|
|
720
|
-
correlation_matrix_types: List[str] = field(
|
|
721
|
-
|
|
722
|
-
|
|
849
|
+
# Spatial Analysis - Correlation Matrix params
|
|
850
|
+
correlation_matrix_types: List[str] = field(
|
|
851
|
+
default_factory=lambda: ["pearson", "binary_covariance"]
|
|
852
|
+
)
|
|
853
|
+
correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
|
|
854
|
+
correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
|
|
723
855
|
|
|
724
856
|
# HMM params
|
|
725
857
|
hmm_n_states: int = 2
|
|
726
|
-
hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
|
|
727
|
-
hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
|
|
858
|
+
hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
|
|
859
|
+
hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
|
|
728
860
|
hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
|
|
729
861
|
hmm_eps: float = 1e-8
|
|
730
862
|
hmm_dtype: str = "float64"
|
|
@@ -732,12 +864,29 @@ class ExperimentConfig:
|
|
|
732
864
|
hmm_batch_size: int = 1024
|
|
733
865
|
hmm_use_viterbi: bool = False
|
|
734
866
|
hmm_device: Optional[str] = None
|
|
735
|
-
hmm_methbases: Optional[List[str]] =
|
|
867
|
+
hmm_methbases: Optional[List[str]] = (
|
|
868
|
+
None # if None, HMM.annotate_adata will fall back to mod_target_bases
|
|
869
|
+
)
|
|
870
|
+
# HMM fitting/application strategy
|
|
871
|
+
hmm_fit_strategy: str = "per_group" # "per_group" | "shared_transitions"
|
|
872
|
+
hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
|
|
873
|
+
hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
|
|
874
|
+
# Shared-transitions adaptation behavior
|
|
875
|
+
hmm_adapt_emissions: bool = True
|
|
876
|
+
hmm_adapt_startprobs: bool = True
|
|
877
|
+
hmm_emission_adapt_iters: int = 5
|
|
878
|
+
hmm_emission_adapt_tol: float = 1e-4
|
|
736
879
|
footprints: Optional[bool] = True
|
|
737
880
|
accessible_patches: Optional[bool] = True
|
|
738
881
|
cpg: Optional[bool] = False
|
|
739
882
|
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
740
|
-
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None,
|
|
883
|
+
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
|
|
884
|
+
clustermap_cmap_hmm: Optional[str] = "coolwarm"
|
|
885
|
+
hmm_clustermap_feature_layers: List[str] = field(
|
|
886
|
+
default_factory=lambda: ["all_accessible_features"]
|
|
887
|
+
)
|
|
888
|
+
hmm_clustermap_sortby: Optional[str] = "hmm"
|
|
889
|
+
hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
|
|
741
890
|
|
|
742
891
|
# Pipeline control flow - load adata
|
|
743
892
|
force_redo_load_adata: bool = False
|
|
@@ -760,11 +909,11 @@ class ExperimentConfig:
|
|
|
760
909
|
force_redo_filter_reads_on_modification_thresholds: bool = False
|
|
761
910
|
bypass_flag_duplicate_reads: bool = False
|
|
762
911
|
force_redo_flag_duplicate_reads: bool = False
|
|
763
|
-
bypass_complexity_analysis: bool = False
|
|
912
|
+
bypass_complexity_analysis: bool = False
|
|
764
913
|
force_redo_complexity_analysis: bool = False
|
|
765
914
|
|
|
766
|
-
# Pipeline control flow -
|
|
767
|
-
|
|
915
|
+
# Pipeline control flow - Spatial Analyses
|
|
916
|
+
force_redo_spatial_analyses: bool = False
|
|
768
917
|
bypass_basic_clustermaps: bool = False
|
|
769
918
|
force_redo_basic_clustermaps: bool = False
|
|
770
919
|
bypass_basic_umap: bool = False
|
|
@@ -840,7 +989,9 @@ class ExperimentConfig:
|
|
|
840
989
|
defaults_loaded = dict(defaults_map[modality] or {})
|
|
841
990
|
defaults_source_chain = [f"defaults_map['{modality}']"]
|
|
842
991
|
elif defaults_dir is not None:
|
|
843
|
-
defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
|
|
992
|
+
defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
|
|
993
|
+
defaults_dir, modality
|
|
994
|
+
)
|
|
844
995
|
|
|
845
996
|
# If CSV asks to extend defaults, load those and merge
|
|
846
997
|
merged = dict(defaults_loaded or {})
|
|
@@ -855,7 +1006,11 @@ class ExperimentConfig:
|
|
|
855
1006
|
else:
|
|
856
1007
|
ext_list = []
|
|
857
1008
|
for ext in ext_list:
|
|
858
|
-
ext_defaults, ext_sources = (
|
|
1009
|
+
ext_defaults, ext_sources = (
|
|
1010
|
+
load_defaults_with_inheritance(defaults_dir, ext)
|
|
1011
|
+
if defaults_dir
|
|
1012
|
+
else ({}, [])
|
|
1013
|
+
)
|
|
859
1014
|
merged = deep_merge(merged, ext_defaults)
|
|
860
1015
|
for s in ext_sources:
|
|
861
1016
|
if s not in defaults_source_chain:
|
|
@@ -885,34 +1040,40 @@ class ExperimentConfig:
|
|
|
885
1040
|
merged["experiment_name"] = f"{date_str}_SMF_experiment"
|
|
886
1041
|
|
|
887
1042
|
# Input file types and path handling
|
|
888
|
-
input_data_path = Path(merged[
|
|
1043
|
+
input_data_path = Path(merged["input_data_path"])
|
|
889
1044
|
|
|
890
1045
|
# Detect the input filetype
|
|
891
1046
|
if input_data_path.is_file():
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
1047
|
+
suffix = input_data_path.suffix.lower()
|
|
1048
|
+
suffixes = [
|
|
1049
|
+
s.lower() for s in input_data_path.suffixes
|
|
1050
|
+
] # handles multi-part extensions
|
|
1051
|
+
|
|
1052
|
+
# recognize multi-suffix cases like .fastq.gz or .fq.gz
|
|
1053
|
+
if any(s in [".pod5", ".p5"] for s in suffixes):
|
|
1054
|
+
input_type = "pod5"
|
|
1055
|
+
input_files = [Path(input_data_path)]
|
|
1056
|
+
elif any(s in [".fast5", ".f5"] for s in suffixes):
|
|
1057
|
+
input_type = "fast5"
|
|
1058
|
+
input_files = [Path(input_data_path)]
|
|
1059
|
+
elif any(s in [".fastq", ".fq"] for s in suffixes):
|
|
1060
|
+
input_type = "fastq"
|
|
1061
|
+
input_files = [Path(input_data_path)]
|
|
1062
|
+
elif any(s in [".bam"] for s in suffixes):
|
|
1063
|
+
input_type = "bam"
|
|
1064
|
+
input_files = [Path(input_data_path)]
|
|
1065
|
+
elif any(s in [".h5ad", ".h5"] for s in suffixes):
|
|
1066
|
+
input_type = "h5ad"
|
|
1067
|
+
input_files = [Path(input_data_path)]
|
|
1068
|
+
else:
|
|
1069
|
+
print("Error detecting input file type")
|
|
913
1070
|
|
|
914
1071
|
elif input_data_path.is_dir():
|
|
915
|
-
found = discover_input_files(
|
|
1072
|
+
found = discover_input_files(
|
|
1073
|
+
input_data_path,
|
|
1074
|
+
bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
|
|
1075
|
+
recursive=merged["recursive_input_search"],
|
|
1076
|
+
)
|
|
916
1077
|
|
|
917
1078
|
if found["input_is_pod5"]:
|
|
918
1079
|
input_type = "pod5"
|
|
@@ -930,15 +1091,22 @@ class ExperimentConfig:
|
|
|
930
1091
|
input_type = "h5ad"
|
|
931
1092
|
input_files = found["h5ad_paths"]
|
|
932
1093
|
|
|
933
|
-
print(
|
|
1094
|
+
print(
|
|
1095
|
+
f"Found {found['all_files_searched']} files; "
|
|
1096
|
+
f"fastq={len(found['fastq_paths'])}, "
|
|
1097
|
+
f"bam={len(found['bam_paths'])}, "
|
|
1098
|
+
f"pod5={len(found['pod5_paths'])}, "
|
|
1099
|
+
f"fast5={len(found['fast5_paths'])}, "
|
|
1100
|
+
f"h5ad={len(found['h5ad_paths'])}"
|
|
1101
|
+
)
|
|
934
1102
|
|
|
935
1103
|
# summary file output path
|
|
936
|
-
output_dir = Path(merged[
|
|
937
|
-
summary_file_basename = merged["experiment_name"] +
|
|
1104
|
+
output_dir = Path(merged["output_directory"])
|
|
1105
|
+
summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
|
|
938
1106
|
summary_file = output_dir / summary_file_basename
|
|
939
1107
|
|
|
940
1108
|
# Demultiplexing output path
|
|
941
|
-
split_dir = merged.get("split_dir",
|
|
1109
|
+
split_dir = merged.get("split_dir", SPLIT_DIR)
|
|
942
1110
|
split_path = output_dir / split_dir
|
|
943
1111
|
|
|
944
1112
|
# final normalization
|
|
@@ -962,7 +1130,14 @@ class ExperimentConfig:
|
|
|
962
1130
|
merged["hm5C_threshold"],
|
|
963
1131
|
]
|
|
964
1132
|
|
|
965
|
-
for bkey in (
|
|
1133
|
+
for bkey in (
|
|
1134
|
+
"barcode_both_ends",
|
|
1135
|
+
"trim",
|
|
1136
|
+
"input_already_demuxed",
|
|
1137
|
+
"make_bigwigs",
|
|
1138
|
+
"skip_unclassified",
|
|
1139
|
+
"delete_batch_hdfs",
|
|
1140
|
+
):
|
|
966
1141
|
if bkey in merged:
|
|
967
1142
|
merged[bkey] = _parse_bool(merged[bkey])
|
|
968
1143
|
|
|
@@ -971,16 +1146,19 @@ class ExperimentConfig:
|
|
|
971
1146
|
if "threads" in merged:
|
|
972
1147
|
tval = _parse_numeric(merged.get("threads", None), None)
|
|
973
1148
|
merged["threads"] = None if tval is None else int(tval)
|
|
974
|
-
|
|
1149
|
+
|
|
975
1150
|
if "aligner_args" in merged and merged.get("aligner_args") is None:
|
|
976
1151
|
merged.pop("aligner_args", None)
|
|
977
1152
|
|
|
978
1153
|
# --- Resolve aligner_args into concrete list for the chosen aligner ---
|
|
979
|
-
merged[
|
|
1154
|
+
merged["aligner_args"] = resolve_aligner_args(merged)
|
|
980
1155
|
|
|
981
1156
|
if "mod_list" in merged:
|
|
982
1157
|
merged["mod_list"] = _parse_list(merged.get("mod_list"))
|
|
983
1158
|
|
|
1159
|
+
# Preprocessing args
|
|
1160
|
+
obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
|
|
1161
|
+
|
|
984
1162
|
# HMM feature set handling
|
|
985
1163
|
if "hmm_feature_sets" in merged:
|
|
986
1164
|
merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
|
|
@@ -988,11 +1166,22 @@ class ExperimentConfig:
|
|
|
988
1166
|
# allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
|
|
989
1167
|
maybe_fs = {}
|
|
990
1168
|
if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
|
|
991
|
-
maybe_fs["footprint"] = {
|
|
1169
|
+
maybe_fs["footprint"] = {
|
|
1170
|
+
"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
|
|
1171
|
+
"state": merged.get("hmm_footprint_state", "Non-Modified"),
|
|
1172
|
+
}
|
|
992
1173
|
if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
|
|
993
|
-
maybe_fs["accessible"] = {
|
|
1174
|
+
maybe_fs["accessible"] = {
|
|
1175
|
+
"features": merged.get(
|
|
1176
|
+
"hmm_accessible_ranges", merged.get("accessible_ranges")
|
|
1177
|
+
),
|
|
1178
|
+
"state": merged.get("hmm_accessible_state", "Modified"),
|
|
1179
|
+
}
|
|
994
1180
|
if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
|
|
995
|
-
maybe_fs["cpg"] = {
|
|
1181
|
+
maybe_fs["cpg"] = {
|
|
1182
|
+
"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
|
|
1183
|
+
"state": merged.get("hmm_cpg_state", "Modified"),
|
|
1184
|
+
}
|
|
996
1185
|
if maybe_fs:
|
|
997
1186
|
merged.setdefault("hmm_feature_sets", {})
|
|
998
1187
|
for k, v in maybe_fs.items():
|
|
@@ -1013,158 +1202,278 @@ class ExperimentConfig:
|
|
|
1013
1202
|
if not hmm_methbases: # None or []
|
|
1014
1203
|
hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
|
|
1015
1204
|
if not hmm_methbases:
|
|
1016
|
-
hmm_methbases = [
|
|
1205
|
+
hmm_methbases = ["C"]
|
|
1017
1206
|
hmm_methbases = list(hmm_methbases)
|
|
1018
1207
|
hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
|
|
1208
|
+
hmm_clustermap_feature_layers = _parse_list(
|
|
1209
|
+
merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
|
|
1210
|
+
)
|
|
1211
|
+
|
|
1212
|
+
hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
|
|
1213
|
+
hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
|
|
1214
|
+
hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
|
|
1215
|
+
|
|
1216
|
+
hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
|
|
1217
|
+
hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
|
|
1218
|
+
hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
|
|
1219
|
+
hmm_emission_adapt_tol = float(
|
|
1220
|
+
_parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
|
|
1221
|
+
)
|
|
1222
|
+
|
|
1223
|
+
# HMM peak feature configs (for call_hmm_peaks)
|
|
1224
|
+
merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
|
|
1225
|
+
merged.get("hmm_peak_feature_configs", {})
|
|
1226
|
+
)
|
|
1227
|
+
hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
|
|
1019
1228
|
|
|
1020
1229
|
# instantiate dataclass
|
|
1021
1230
|
instance = cls(
|
|
1022
|
-
smf_modality
|
|
1023
|
-
input_data_path
|
|
1024
|
-
recursive_input_search
|
|
1025
|
-
input_type
|
|
1026
|
-
input_files
|
|
1027
|
-
output_directory
|
|
1028
|
-
summary_file
|
|
1029
|
-
fasta
|
|
1030
|
-
sequencer
|
|
1031
|
-
model_dir
|
|
1032
|
-
barcode_kit
|
|
1033
|
-
fastq_barcode_map
|
|
1034
|
-
fastq_auto_pairing
|
|
1035
|
-
bam_suffix
|
|
1036
|
-
split_dir
|
|
1037
|
-
split_path
|
|
1038
|
-
strands
|
|
1039
|
-
conversions
|
|
1040
|
-
fasta_regions_of_interest
|
|
1041
|
-
mapping_threshold
|
|
1042
|
-
experiment_name
|
|
1043
|
-
model
|
|
1044
|
-
barcode_both_ends
|
|
1045
|
-
trim
|
|
1046
|
-
input_already_demuxed
|
|
1047
|
-
threads
|
|
1048
|
-
sample_sheet_path
|
|
1049
|
-
sample_sheet_mapping_column
|
|
1050
|
-
delete_intermediate_bams
|
|
1051
|
-
delete_intermediate_tsvs
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1231
|
+
smf_modality=merged.get("smf_modality"),
|
|
1232
|
+
input_data_path=input_data_path,
|
|
1233
|
+
recursive_input_search=merged.get("recursive_input_search"),
|
|
1234
|
+
input_type=input_type,
|
|
1235
|
+
input_files=input_files,
|
|
1236
|
+
output_directory=output_dir,
|
|
1237
|
+
summary_file=summary_file,
|
|
1238
|
+
fasta=merged.get("fasta"),
|
|
1239
|
+
sequencer=merged.get("sequencer"),
|
|
1240
|
+
model_dir=merged.get("model_dir"),
|
|
1241
|
+
barcode_kit=merged.get("barcode_kit"),
|
|
1242
|
+
fastq_barcode_map=merged.get("fastq_barcode_map"),
|
|
1243
|
+
fastq_auto_pairing=merged.get("fastq_auto_pairing"),
|
|
1244
|
+
bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
|
|
1245
|
+
split_dir=split_dir,
|
|
1246
|
+
split_path=split_path,
|
|
1247
|
+
strands=merged.get("strands", STRANDS),
|
|
1248
|
+
conversions=merged.get("conversions", CONVERSIONS),
|
|
1249
|
+
fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
|
|
1250
|
+
mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
|
|
1251
|
+
experiment_name=merged.get("experiment_name"),
|
|
1252
|
+
model=merged.get("model", "hac"),
|
|
1253
|
+
barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
|
|
1254
|
+
trim=merged.get("trim", TRIM),
|
|
1255
|
+
input_already_demuxed=merged.get("input_already_demuxed", False),
|
|
1256
|
+
threads=merged.get("threads"),
|
|
1257
|
+
sample_sheet_path=merged.get("sample_sheet_path"),
|
|
1258
|
+
sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
|
|
1259
|
+
delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
|
|
1260
|
+
delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
|
|
1261
|
+
align_from_bam=merged.get("align_from_bam", False),
|
|
1262
|
+
aligner=merged.get("aligner", "minimap2"),
|
|
1263
|
+
aligner_args=merged.get("aligner_args", None),
|
|
1264
|
+
device=merged.get("device", "auto"),
|
|
1265
|
+
make_bigwigs=merged.get("make_bigwigs", False),
|
|
1266
|
+
make_beds=merged.get("make_beds", False),
|
|
1267
|
+
delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
|
|
1268
|
+
mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
|
|
1269
|
+
enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
|
|
1270
|
+
conversion_types=merged.get("conversions", ["unconverted"])
|
|
1271
|
+
+ merged.get("conversion_types", ["5mC"]),
|
|
1272
|
+
filter_threshold=merged.get("filter_threshold", 0.8),
|
|
1273
|
+
m6A_threshold=merged.get("m6A_threshold", 0.7),
|
|
1274
|
+
m5C_threshold=merged.get("m5C_threshold", 0.7),
|
|
1275
|
+
hm5C_threshold=merged.get("hm5C_threshold", 0.7),
|
|
1276
|
+
thresholds=merged.get("thresholds", []),
|
|
1277
|
+
mod_list=merged.get("mod_list", list(MOD_LIST)),
|
|
1278
|
+
mod_map=merged.get("mod_map", list(MOD_MAP)),
|
|
1279
|
+
batch_size=merged.get("batch_size", 4),
|
|
1280
|
+
skip_unclassified=merged.get("skip_unclassified", True),
|
|
1281
|
+
delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
|
|
1282
|
+
reference_column=merged.get("reference_column", REF_COL),
|
|
1283
|
+
sample_column=merged.get("sample_column", SAMPLE_COL),
|
|
1284
|
+
sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
|
|
1285
|
+
obs_to_plot_pp_qc=obs_to_plot_pp_qc,
|
|
1286
|
+
fit_position_methylation_thresholds=merged.get(
|
|
1287
|
+
"fit_position_methylation_thresholds", False
|
|
1288
|
+
),
|
|
1289
|
+
binarize_on_fixed_methlyation_threshold=merged.get(
|
|
1290
|
+
"binarize_on_fixed_methlyation_threshold", 0.7
|
|
1291
|
+
),
|
|
1292
|
+
positive_control_sample_methylation_fitting=merged.get(
|
|
1293
|
+
"positive_control_sample_methylation_fitting", None
|
|
1294
|
+
),
|
|
1295
|
+
negative_control_sample_methylation_fitting=merged.get(
|
|
1296
|
+
"negative_control_sample_methylation_fitting", None
|
|
1297
|
+
),
|
|
1298
|
+
infer_on_percentile_sample_methylation_fitting=merged.get(
|
|
1299
|
+
"infer_on_percentile_sample_methylation_fitting", 10
|
|
1300
|
+
),
|
|
1301
|
+
inference_variable_sample_methylation_fitting=merged.get(
|
|
1302
|
+
"inference_variable_sample_methylation_fitting", "Raw_modification_signal"
|
|
1303
|
+
),
|
|
1304
|
+
fit_j_threshold=merged.get("fit_j_threshold", 0.5),
|
|
1305
|
+
output_binary_layer_name=merged.get(
|
|
1306
|
+
"output_binary_layer_name", "binarized_methylation"
|
|
1307
|
+
),
|
|
1308
|
+
reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
|
|
1309
|
+
reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
|
|
1310
|
+
layer_for_clustermap_plotting=merged.get(
|
|
1311
|
+
"layer_for_clustermap_plotting", "nan0_0minus1"
|
|
1312
|
+
),
|
|
1313
|
+
clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
|
|
1314
|
+
clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
|
|
1315
|
+
clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
|
|
1316
|
+
clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
|
|
1317
|
+
spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
|
|
1318
|
+
layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
|
|
1319
|
+
umap_layers_to_plot=merged.get(
|
|
1320
|
+
"umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
|
|
1321
|
+
),
|
|
1322
|
+
rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
|
|
1323
|
+
rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
|
|
1324
|
+
autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
|
|
1325
|
+
autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
|
|
1326
|
+
autocorr_max_lag=merged.get("autocorr_max_lag", 800),
|
|
1327
|
+
autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
|
|
1328
|
+
hmm_n_states=merged.get("hmm_n_states", 2),
|
|
1329
|
+
hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
|
|
1330
|
+
hmm_init_transition_probs=merged.get(
|
|
1331
|
+
"hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
|
|
1332
|
+
),
|
|
1333
|
+
hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
|
|
1334
|
+
hmm_eps=merged.get("hmm_eps", 1e-8),
|
|
1335
|
+
hmm_fit_strategy=hmm_fit_strategy,
|
|
1336
|
+
hmm_shared_scope=hmm_shared_scope,
|
|
1337
|
+
hmm_groupby=hmm_groupby,
|
|
1338
|
+
hmm_adapt_emissions=hmm_adapt_emissions,
|
|
1339
|
+
hmm_adapt_startprobs=hmm_adapt_startprobs,
|
|
1340
|
+
hmm_emission_adapt_iters=hmm_emission_adapt_iters,
|
|
1341
|
+
hmm_emission_adapt_tol=hmm_emission_adapt_tol,
|
|
1342
|
+
hmm_dtype=merged.get("hmm_dtype", "float64"),
|
|
1343
|
+
hmm_feature_sets=hmm_feature_sets,
|
|
1344
|
+
hmm_annotation_threshold=hmm_annotation_threshold,
|
|
1345
|
+
hmm_batch_size=hmm_batch_size,
|
|
1346
|
+
hmm_use_viterbi=hmm_use_viterbi,
|
|
1347
|
+
hmm_methbases=hmm_methbases,
|
|
1348
|
+
hmm_device=hmm_device,
|
|
1349
|
+
hmm_merge_layer_features=hmm_merge_layer_features,
|
|
1350
|
+
clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
|
|
1351
|
+
hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
|
|
1352
|
+
hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
|
|
1353
|
+
hmm_peak_feature_configs=hmm_peak_feature_configs,
|
|
1354
|
+
footprints=merged.get("footprints", None),
|
|
1355
|
+
accessible_patches=merged.get("accessible_patches", None),
|
|
1356
|
+
cpg=merged.get("cpg", None),
|
|
1357
|
+
read_coord_filter=merged.get("read_coord_filter", [None, None]),
|
|
1358
|
+
read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
|
|
1359
|
+
read_len_to_ref_ratio_filter_thresholds=merged.get(
|
|
1360
|
+
"read_len_to_ref_ratio_filter_thresholds", [0.3, None]
|
|
1361
|
+
),
|
|
1362
|
+
read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
|
|
1363
|
+
read_mapping_quality_filter_thresholds=merged.get(
|
|
1364
|
+
"read_mapping_quality_filter_thresholds", [None, None]
|
|
1365
|
+
),
|
|
1366
|
+
read_mod_filtering_gpc_thresholds=merged.get(
|
|
1367
|
+
"read_mod_filtering_gpc_thresholds", [0.025, 0.975]
|
|
1368
|
+
),
|
|
1369
|
+
read_mod_filtering_cpg_thresholds=merged.get(
|
|
1370
|
+
"read_mod_filtering_cpg_thresholds", [0.0, 1.0]
|
|
1371
|
+
),
|
|
1372
|
+
read_mod_filtering_c_thresholds=merged.get(
|
|
1373
|
+
"read_mod_filtering_c_thresholds", [0.025, 0.975]
|
|
1374
|
+
),
|
|
1375
|
+
read_mod_filtering_a_thresholds=merged.get(
|
|
1376
|
+
"read_mod_filtering_a_thresholds", [0.025, 0.975]
|
|
1377
|
+
),
|
|
1378
|
+
read_mod_filtering_use_other_c_as_background=merged.get(
|
|
1379
|
+
"read_mod_filtering_use_other_c_as_background", True
|
|
1380
|
+
),
|
|
1381
|
+
min_valid_fraction_positions_in_read_vs_ref=merged.get(
|
|
1382
|
+
"min_valid_fraction_positions_in_read_vs_ref", 0.2
|
|
1383
|
+
),
|
|
1384
|
+
duplicate_detection_site_types=merged.get(
|
|
1385
|
+
"duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
1386
|
+
),
|
|
1387
|
+
duplicate_detection_distance_threshold=merged.get(
|
|
1388
|
+
"duplicate_detection_distance_threshold", 0.07
|
|
1389
|
+
),
|
|
1390
|
+
duplicate_detection_keep_best_metric=merged.get(
|
|
1391
|
+
"duplicate_detection_keep_best_metric", "read_quality"
|
|
1392
|
+
),
|
|
1393
|
+
duplicate_detection_window_size_for_hamming_neighbors=merged.get(
|
|
1394
|
+
"duplicate_detection_window_size_for_hamming_neighbors", 50
|
|
1395
|
+
),
|
|
1396
|
+
duplicate_detection_min_overlapping_positions=merged.get(
|
|
1397
|
+
"duplicate_detection_min_overlapping_positions", 20
|
|
1398
|
+
),
|
|
1399
|
+
duplicate_detection_do_hierarchical=merged.get(
|
|
1400
|
+
"duplicate_detection_do_hierarchical", True
|
|
1401
|
+
),
|
|
1402
|
+
duplicate_detection_hierarchical_linkage=merged.get(
|
|
1403
|
+
"duplicate_detection_hierarchical_linkage", "average"
|
|
1404
|
+
),
|
|
1405
|
+
duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
|
|
1406
|
+
position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
|
|
1407
|
+
correlation_matrix_types=merged.get(
|
|
1408
|
+
"correlation_matrix_types", ["pearson", "binary_covariance"]
|
|
1409
|
+
),
|
|
1410
|
+
correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
|
|
1411
|
+
correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
|
|
1412
|
+
hamming_vs_metric_keys=merged.get(
|
|
1413
|
+
"hamming_vs_metric_keys", ["Fraction_C_site_modified"]
|
|
1414
|
+
),
|
|
1415
|
+
force_redo_load_adata=merged.get("force_redo_load_adata", False),
|
|
1416
|
+
force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
|
|
1417
|
+
force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
|
|
1418
|
+
bypass_add_read_length_and_mapping_qc=merged.get(
|
|
1419
|
+
"bypass_add_read_length_and_mapping_qc", False
|
|
1420
|
+
),
|
|
1421
|
+
force_redo_add_read_length_and_mapping_qc=merged.get(
|
|
1422
|
+
"force_redo_add_read_length_and_mapping_qc", False
|
|
1423
|
+
),
|
|
1424
|
+
bypass_clean_nan=merged.get("bypass_clean_nan", False),
|
|
1425
|
+
force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
|
|
1426
|
+
bypass_append_base_context=merged.get("bypass_append_base_context", False),
|
|
1427
|
+
force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
|
|
1428
|
+
invert_adata=merged.get("invert_adata", False),
|
|
1429
|
+
bypass_append_binary_layer_by_base_context=merged.get(
|
|
1430
|
+
"bypass_append_binary_layer_by_base_context", False
|
|
1431
|
+
),
|
|
1432
|
+
force_redo_append_binary_layer_by_base_context=merged.get(
|
|
1433
|
+
"force_redo_append_binary_layer_by_base_context", False
|
|
1434
|
+
),
|
|
1435
|
+
bypass_calculate_read_modification_stats=merged.get(
|
|
1436
|
+
"bypass_calculate_read_modification_stats", False
|
|
1437
|
+
),
|
|
1438
|
+
force_redo_calculate_read_modification_stats=merged.get(
|
|
1439
|
+
"force_redo_calculate_read_modification_stats", False
|
|
1440
|
+
),
|
|
1441
|
+
bypass_filter_reads_on_modification_thresholds=merged.get(
|
|
1442
|
+
"bypass_filter_reads_on_modification_thresholds", False
|
|
1443
|
+
),
|
|
1444
|
+
force_redo_filter_reads_on_modification_thresholds=merged.get(
|
|
1445
|
+
"force_redo_filter_reads_on_modification_thresholds", False
|
|
1446
|
+
),
|
|
1447
|
+
bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
|
|
1448
|
+
force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
|
|
1449
|
+
bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
|
|
1450
|
+
force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
|
|
1451
|
+
force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
|
|
1452
|
+
bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
|
|
1453
|
+
force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
|
|
1454
|
+
bypass_basic_umap=merged.get("bypass_basic_umap", False),
|
|
1455
|
+
force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
|
|
1456
|
+
bypass_spatial_autocorr_calculations=merged.get(
|
|
1457
|
+
"bypass_spatial_autocorr_calculations", False
|
|
1458
|
+
),
|
|
1459
|
+
force_redo_spatial_autocorr_calculations=merged.get(
|
|
1460
|
+
"force_redo_spatial_autocorr_calculations", False
|
|
1461
|
+
),
|
|
1462
|
+
bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
|
|
1463
|
+
force_redo_spatial_autocorr_plotting=merged.get(
|
|
1464
|
+
"force_redo_spatial_autocorr_plotting", False
|
|
1465
|
+
),
|
|
1466
|
+
bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
|
|
1467
|
+
force_redo_matrix_corr_calculations=merged.get(
|
|
1468
|
+
"force_redo_matrix_corr_calculations", False
|
|
1469
|
+
),
|
|
1470
|
+
bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
|
|
1471
|
+
force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
|
|
1472
|
+
bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
|
|
1473
|
+
force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
|
|
1474
|
+
bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
|
|
1475
|
+
force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
|
|
1476
|
+
config_source=config_source or "<var_dict>",
|
|
1168
1477
|
)
|
|
1169
1478
|
|
|
1170
1479
|
report = {
|
|
@@ -1191,13 +1500,25 @@ class ExperimentConfig:
|
|
|
1191
1500
|
Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
|
|
1192
1501
|
Additional kwargs passed to from_var_dict().
|
|
1193
1502
|
"""
|
|
1194
|
-
loader =
|
|
1503
|
+
loader = (
|
|
1504
|
+
LoadExperimentConfig(csv_input)
|
|
1505
|
+
if not isinstance(csv_input, pd.DataFrame)
|
|
1506
|
+
else LoadExperimentConfig(pd.DataFrame(csv_input))
|
|
1507
|
+
)
|
|
1195
1508
|
var_dict = loader.var_dict
|
|
1196
|
-
return cls.from_var_dict(
|
|
1509
|
+
return cls.from_var_dict(
|
|
1510
|
+
var_dict,
|
|
1511
|
+
date_str=date_str,
|
|
1512
|
+
config_source=config_source,
|
|
1513
|
+
defaults_dir=defaults_dir,
|
|
1514
|
+
defaults_map=defaults_map,
|
|
1515
|
+
**kwargs,
|
|
1516
|
+
)
|
|
1197
1517
|
|
|
1198
1518
|
# -------------------------
|
|
1199
1519
|
# validation & serialization
|
|
1200
1520
|
# -------------------------
|
|
1521
|
+
@staticmethod
|
|
1201
1522
|
def _validate_hmm_features_structure(hfs: dict) -> List[str]:
|
|
1202
1523
|
errs = []
|
|
1203
1524
|
if not isinstance(hfs, dict):
|
|
@@ -1205,7 +1526,9 @@ class ExperimentConfig:
|
|
|
1205
1526
|
return errs
|
|
1206
1527
|
for g, info in hfs.items():
|
|
1207
1528
|
if not isinstance(info, dict):
|
|
1208
|
-
errs.append(
|
|
1529
|
+
errs.append(
|
|
1530
|
+
f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
|
|
1531
|
+
)
|
|
1209
1532
|
continue
|
|
1210
1533
|
feats = info.get("features")
|
|
1211
1534
|
if not isinstance(feats, dict) or len(feats) == 0:
|
|
@@ -1215,7 +1538,9 @@ class ExperimentConfig:
|
|
|
1215
1538
|
try:
|
|
1216
1539
|
lo, hi = float(rng[0]), float(rng[1])
|
|
1217
1540
|
if lo < 0 or hi <= lo:
|
|
1218
|
-
errs.append(
|
|
1541
|
+
errs.append(
|
|
1542
|
+
f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
|
|
1543
|
+
)
|
|
1219
1544
|
except Exception:
|
|
1220
1545
|
errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
|
|
1221
1546
|
return errs
|
|
@@ -1248,13 +1573,18 @@ class ExperimentConfig:
|
|
|
1248
1573
|
|
|
1249
1574
|
if not (0.0 <= float(self.mapping_threshold) <= 1.0):
|
|
1250
1575
|
errors.append("mapping_threshold must be in [0,1].")
|
|
1251
|
-
for t in (
|
|
1576
|
+
for t in (
|
|
1577
|
+
self.filter_threshold,
|
|
1578
|
+
self.m6A_threshold,
|
|
1579
|
+
self.m5C_threshold,
|
|
1580
|
+
self.hm5C_threshold,
|
|
1581
|
+
):
|
|
1252
1582
|
if not (0.0 <= float(t) <= 1.0):
|
|
1253
1583
|
errors.append(f"threshold value {t} must be in [0,1].")
|
|
1254
1584
|
|
|
1255
1585
|
if raise_on_error and errors:
|
|
1256
1586
|
raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
|
|
1257
|
-
|
|
1587
|
+
|
|
1258
1588
|
errs = _validate_hmm_features_structure(self.hmm_feature_sets)
|
|
1259
1589
|
errors.extend(errs)
|
|
1260
1590
|
|