smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,26 @@
|
|
|
1
1
|
# experiment_config.py
|
|
2
2
|
from __future__ import annotations
|
|
3
|
+
|
|
3
4
|
import ast
|
|
4
5
|
import json
|
|
5
6
|
import warnings
|
|
6
|
-
from dataclasses import dataclass, field
|
|
7
|
+
from dataclasses import asdict, dataclass, field
|
|
7
8
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
9
|
+
from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
|
|
10
|
+
|
|
11
|
+
from smftools.constants import (
|
|
12
|
+
BAM_SUFFIX,
|
|
13
|
+
BARCODE_BOTH_ENDS,
|
|
14
|
+
CONVERSIONS,
|
|
15
|
+
MOD_LIST,
|
|
16
|
+
MOD_MAP,
|
|
17
|
+
REF_COL,
|
|
18
|
+
SAMPLE_COL,
|
|
19
|
+
SPLIT_DIR,
|
|
20
|
+
STRANDS,
|
|
21
|
+
TRIM,
|
|
22
|
+
)
|
|
23
|
+
|
|
9
24
|
from .discover_input_files import discover_input_files
|
|
10
25
|
|
|
11
26
|
# Optional dependency for YAML handling
|
|
@@ -14,8 +29,8 @@ try:
|
|
|
14
29
|
except Exception:
|
|
15
30
|
yaml = None
|
|
16
31
|
|
|
17
|
-
import pandas as pd
|
|
18
32
|
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
19
34
|
|
|
20
35
|
|
|
21
36
|
# -------------------------
|
|
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
|
|
|
81
96
|
except Exception:
|
|
82
97
|
return fallback
|
|
83
98
|
|
|
99
|
+
|
|
84
100
|
def _try_json_or_literal(s: Any) -> Any:
|
|
85
101
|
"""Try parse JSON or python literal; otherwise return original string."""
|
|
86
102
|
if s is None:
|
|
@@ -123,8 +139,8 @@ def resolve_aligner_args(
|
|
|
123
139
|
"""
|
|
124
140
|
# builtin defaults (aligner -> args)
|
|
125
141
|
builtin_defaults = {
|
|
126
|
-
"minimap2": [
|
|
127
|
-
"dorado": [
|
|
142
|
+
"minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
|
|
143
|
+
"dorado": ["--mm2-opts", "-N", "5"],
|
|
128
144
|
}
|
|
129
145
|
if default_by_aligner is None:
|
|
130
146
|
default_by_aligner = builtin_defaults
|
|
@@ -275,6 +291,7 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
|
275
291
|
canonical[grp] = {"features": feats, "state": state}
|
|
276
292
|
return canonical
|
|
277
293
|
|
|
294
|
+
|
|
278
295
|
def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
|
|
279
296
|
"""
|
|
280
297
|
Normalize user-provided `hmm_peak_feature_configs` into:
|
|
@@ -365,12 +382,12 @@ class LoadExperimentConfig:
|
|
|
365
382
|
df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
|
|
366
383
|
# normalize column names
|
|
367
384
|
df.columns = [c.strip() for c in df.columns]
|
|
368
|
-
if
|
|
385
|
+
if "variable" not in df.columns:
|
|
369
386
|
raise ValueError("Config CSV must contain a 'variable' column.")
|
|
370
|
-
if
|
|
371
|
-
df[
|
|
372
|
-
if
|
|
373
|
-
df[
|
|
387
|
+
if "value" not in df.columns:
|
|
388
|
+
df["value"] = ""
|
|
389
|
+
if "type" not in df.columns:
|
|
390
|
+
df["type"] = ""
|
|
374
391
|
return df
|
|
375
392
|
|
|
376
393
|
@staticmethod
|
|
@@ -389,9 +406,9 @@ class LoadExperimentConfig:
|
|
|
389
406
|
|
|
390
407
|
def parse_bool(s: str):
|
|
391
408
|
s2 = s.strip().lower()
|
|
392
|
-
if s2 in (
|
|
409
|
+
if s2 in ("1", "true", "t", "yes", "y", "on"):
|
|
393
410
|
return True
|
|
394
|
-
if s2 in (
|
|
411
|
+
if s2 in ("0", "false", "f", "no", "n", "off"):
|
|
395
412
|
return False
|
|
396
413
|
raise ValueError(f"Cannot parse boolean from '{s}'")
|
|
397
414
|
|
|
@@ -411,18 +428,18 @@ class LoadExperimentConfig:
|
|
|
411
428
|
except Exception:
|
|
412
429
|
pass
|
|
413
430
|
# fallback split
|
|
414
|
-
parts = [p.strip() for p in s.strip("()[] ").split(
|
|
431
|
+
parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
|
|
415
432
|
return parts
|
|
416
433
|
|
|
417
|
-
if hint in (
|
|
434
|
+
if hint in ("int", "integer"):
|
|
418
435
|
return int(v)
|
|
419
|
-
if hint in (
|
|
436
|
+
if hint in ("float", "double"):
|
|
420
437
|
return float(v)
|
|
421
|
-
if hint in (
|
|
438
|
+
if hint in ("bool", "boolean"):
|
|
422
439
|
return parse_bool(v)
|
|
423
|
-
if hint in (
|
|
440
|
+
if hint in ("list", "array"):
|
|
424
441
|
return parse_list_like(v)
|
|
425
|
-
if hint in (
|
|
442
|
+
if hint in ("string", "str"):
|
|
426
443
|
return v
|
|
427
444
|
|
|
428
445
|
# infer
|
|
@@ -448,27 +465,31 @@ class LoadExperimentConfig:
|
|
|
448
465
|
return lit
|
|
449
466
|
except Exception:
|
|
450
467
|
pass
|
|
451
|
-
if (
|
|
452
|
-
return [p.strip() for p in v.split(
|
|
468
|
+
if ("," in v) and (not any(ch in v for ch in "{}[]()")):
|
|
469
|
+
return [p.strip() for p in v.split(",") if p.strip() != ""]
|
|
453
470
|
return v
|
|
454
471
|
|
|
455
472
|
def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
|
|
456
473
|
parsed: Dict[str, Any] = {}
|
|
457
474
|
for idx, row in df.iterrows():
|
|
458
|
-
name = str(row[
|
|
475
|
+
name = str(row["variable"]).strip()
|
|
459
476
|
if name == "":
|
|
460
477
|
continue
|
|
461
|
-
raw_val = row.get(
|
|
462
|
-
raw_type = row.get(
|
|
478
|
+
raw_val = row.get("value", "")
|
|
479
|
+
raw_type = row.get("type", "")
|
|
463
480
|
if pd.isna(raw_val) or str(raw_val).strip() == "":
|
|
464
481
|
raw_val = None
|
|
465
482
|
try:
|
|
466
483
|
parsed_val = self._parse_value_as_type(raw_val, raw_type)
|
|
467
484
|
except Exception as e:
|
|
468
|
-
warnings.warn(
|
|
485
|
+
warnings.warn(
|
|
486
|
+
f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
|
|
487
|
+
)
|
|
469
488
|
parsed_val = None if raw_val is None else raw_val
|
|
470
489
|
if name in parsed:
|
|
471
|
-
warnings.warn(
|
|
490
|
+
warnings.warn(
|
|
491
|
+
f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
|
|
492
|
+
)
|
|
472
493
|
parsed[name] = parsed_val
|
|
473
494
|
return parsed
|
|
474
495
|
|
|
@@ -476,7 +497,7 @@ class LoadExperimentConfig:
|
|
|
476
497
|
"""Return parsed config as a pandas DataFrame (variable, value)."""
|
|
477
498
|
rows = []
|
|
478
499
|
for k, v in self.var_dict.items():
|
|
479
|
-
rows.append({
|
|
500
|
+
rows.append({"variable": k, "value": v})
|
|
480
501
|
return pd.DataFrame(rows)
|
|
481
502
|
|
|
482
503
|
|
|
@@ -644,17 +665,17 @@ class ExperimentConfig:
|
|
|
644
665
|
input_data_path: Optional[str] = None
|
|
645
666
|
output_directory: Optional[str] = None
|
|
646
667
|
fasta: Optional[str] = None
|
|
647
|
-
bam_suffix: str =
|
|
668
|
+
bam_suffix: str = BAM_SUFFIX
|
|
648
669
|
recursive_input_search: bool = True
|
|
649
670
|
input_type: Optional[str] = None
|
|
650
671
|
input_files: Optional[List[Path]] = None
|
|
651
|
-
split_dir: str =
|
|
672
|
+
split_dir: str = SPLIT_DIR
|
|
652
673
|
split_path: Optional[str] = None
|
|
653
|
-
strands: List[str] = field(default_factory=lambda:
|
|
654
|
-
conversions: List[str] = field(default_factory=lambda:
|
|
674
|
+
strands: List[str] = field(default_factory=lambda: list(STRANDS))  # copy so instances never alias the shared constant
|
|
675
|
+
conversions: List[str] = field(default_factory=lambda: list(CONVERSIONS))  # copy so instances never alias the shared constant
|
|
655
676
|
fasta_regions_of_interest: Optional[str] = None
|
|
656
677
|
sample_sheet_path: Optional[str] = None
|
|
657
|
-
sample_sheet_mapping_column: Optional[str] =
|
|
678
|
+
sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
|
|
658
679
|
experiment_name: Optional[str] = None
|
|
659
680
|
input_already_demuxed: bool = False
|
|
660
681
|
summary_file: Optional[Path] = None
|
|
@@ -690,8 +711,8 @@ class ExperimentConfig:
|
|
|
690
711
|
model_dir: Optional[str] = None
|
|
691
712
|
barcode_kit: Optional[str] = None
|
|
692
713
|
model: str = "hac"
|
|
693
|
-
barcode_both_ends: bool =
|
|
694
|
-
trim: bool =
|
|
714
|
+
barcode_both_ends: bool = BARCODE_BOTH_ENDS
|
|
715
|
+
trim: bool = TRIM
|
|
695
716
|
# General basecalling params
|
|
696
717
|
filter_threshold: float = 0.8
|
|
697
718
|
# Modified basecalling specific params
|
|
@@ -699,44 +720,75 @@ class ExperimentConfig:
|
|
|
699
720
|
m5C_threshold: float = 0.7
|
|
700
721
|
hm5C_threshold: float = 0.7
|
|
701
722
|
thresholds: List[float] = field(default_factory=list)
|
|
702
|
-
mod_list: List[str] = field(
|
|
703
|
-
|
|
723
|
+
mod_list: List[str] = field(
|
|
724
|
+
default_factory=lambda: list(MOD_LIST)
|
|
725
|
+
) # Dorado modified basecalling codes
|
|
726
|
+
mod_map: Dict[str, str] = field(
|
|
727
|
+
default_factory=lambda: dict(MOD_MAP)
|
|
728
|
+
) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
|
|
704
729
|
|
|
705
730
|
# Alignment params
|
|
706
|
-
mapping_threshold: float = 0.01
|
|
707
|
-
align_from_bam: bool =
|
|
731
|
+
mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
|
|
732
|
+
align_from_bam: bool = (
|
|
733
|
+
False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
|
|
734
|
+
)
|
|
708
735
|
aligner: str = "dorado"
|
|
709
736
|
aligner_args: Optional[List[str]] = None
|
|
710
737
|
make_bigwigs: bool = False
|
|
711
738
|
make_beds: bool = False
|
|
739
|
+
samtools_backend: str = "auto"
|
|
740
|
+
bedtools_backend: str = "auto"
|
|
741
|
+
bigwig_backend: str = "auto"
|
|
712
742
|
|
|
713
743
|
# Anndata structure
|
|
714
|
-
reference_column: Optional[str] =
|
|
715
|
-
sample_column: Optional[str] =
|
|
744
|
+
reference_column: Optional[str] = REF_COL
|
|
745
|
+
sample_column: Optional[str] = SAMPLE_COL
|
|
716
746
|
|
|
717
747
|
# General Plotting
|
|
718
|
-
sample_name_col_for_plotting: Optional[str] =
|
|
748
|
+
sample_name_col_for_plotting: Optional[str] = "Barcode"
|
|
719
749
|
rows_per_qc_histogram_grid: int = 12
|
|
720
750
|
|
|
721
751
|
# Preprocessing - Read length and quality filter params
|
|
722
752
|
read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
723
|
-
read_len_filter_thresholds: Optional[Sequence[float]] = field(
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
753
|
+
read_len_filter_thresholds: Optional[Sequence[float]] = field(
|
|
754
|
+
default_factory=lambda: [100, None]
|
|
755
|
+
)
|
|
756
|
+
read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
|
|
757
|
+
default_factory=lambda: [0.4, 1.5]
|
|
758
|
+
)
|
|
759
|
+
read_quality_filter_thresholds: Optional[Sequence[float]] = field(
|
|
760
|
+
default_factory=lambda: [15, None]
|
|
761
|
+
)
|
|
762
|
+
read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
|
|
763
|
+
default_factory=lambda: [None, None]
|
|
764
|
+
)
|
|
727
765
|
|
|
728
766
|
# Preprocessing - Optional reindexing params
|
|
729
767
|
reindexing_offsets: Dict[str, int] = field(default_factory=dict)
|
|
730
768
|
reindexed_var_suffix: Optional[str] = "reindexed"
|
|
731
769
|
|
|
732
770
|
# Preprocessing - Direct mod detection binarization params
|
|
733
|
-
fit_position_methylation_thresholds: Optional[bool] =
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
771
|
+
fit_position_methylation_thresholds: Optional[bool] = (
|
|
772
|
+
False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
|
|
773
|
+
)
|
|
774
|
+
binarize_on_fixed_methlyation_threshold: Optional[float] = (
|
|
775
|
+
0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
776
|
+
)
|
|
777
|
+
positive_control_sample_methylation_fitting: Optional[str] = (
|
|
778
|
+
None # A positive control Sample_name to use for fully modified template data
|
|
779
|
+
)
|
|
780
|
+
negative_control_sample_methylation_fitting: Optional[str] = (
|
|
781
|
+
None # A negative control Sample_name to use for fully unmodified template data
|
|
782
|
+
)
|
|
783
|
+
infer_on_percentile_sample_methylation_fitting: Optional[int] = (
|
|
784
|
+
10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
785
|
+
)
|
|
786
|
+
inference_variable_sample_methylation_fitting: Optional[str] = (
|
|
787
|
+
"Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
788
|
+
)
|
|
789
|
+
fit_j_threshold: Optional[float] = (
|
|
790
|
+
0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
|
|
791
|
+
)
|
|
740
792
|
output_binary_layer_name: Optional[str] = "binarized_methylation"
|
|
741
793
|
|
|
742
794
|
# Preprocessing - Read modification filter params
|
|
@@ -748,13 +800,25 @@ class ExperimentConfig:
|
|
|
748
800
|
min_valid_fraction_positions_in_read_vs_ref: float = 0.2
|
|
749
801
|
|
|
750
802
|
# Preprocessing - plotting params
|
|
751
|
-
obs_to_plot_pp_qc: List[str] = field(
|
|
803
|
+
obs_to_plot_pp_qc: List[str] = field(
|
|
804
|
+
default_factory=lambda: [
|
|
805
|
+
"read_length",
|
|
806
|
+
"mapped_length",
|
|
807
|
+
"read_quality",
|
|
808
|
+
"mapping_quality",
|
|
809
|
+
"mapped_length_to_reference_length_ratio",
|
|
810
|
+
"mapped_length_to_read_length_ratio",
|
|
811
|
+
"Raw_modification_signal",
|
|
812
|
+
]
|
|
813
|
+
)
|
|
752
814
|
|
|
753
815
|
# Preprocessing - Duplicate detection params
|
|
754
|
-
duplicate_detection_site_types: List[str] = field(
|
|
816
|
+
duplicate_detection_site_types: List[str] = field(
|
|
817
|
+
default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
818
|
+
)
|
|
755
819
|
duplicate_detection_distance_threshold: float = 0.07
|
|
756
|
-
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: [
|
|
757
|
-
duplicate_detection_keep_best_metric: str =
|
|
820
|
+
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
|
|
821
|
+
duplicate_detection_keep_best_metric: str = "read_quality"
|
|
758
822
|
duplicate_detection_window_size_for_hamming_neighbors: int = 50
|
|
759
823
|
duplicate_detection_min_overlapping_positions: int = 20
|
|
760
824
|
duplicate_detection_do_hierarchical: bool = True
|
|
@@ -765,32 +829,37 @@ class ExperimentConfig:
|
|
|
765
829
|
position_max_nan_threshold: float = 0.1
|
|
766
830
|
|
|
767
831
|
# Spatial Analysis - Clustermap params
|
|
768
|
-
layer_for_clustermap_plotting: Optional[str] =
|
|
769
|
-
clustermap_cmap_c: Optional[str] =
|
|
770
|
-
clustermap_cmap_gpc: Optional[str] =
|
|
771
|
-
clustermap_cmap_cpg: Optional[str] =
|
|
772
|
-
clustermap_cmap_a: Optional[str] =
|
|
773
|
-
spatial_clustermap_sortby: Optional[str] =
|
|
832
|
+
layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
|
|
833
|
+
clustermap_cmap_c: Optional[str] = "coolwarm"
|
|
834
|
+
clustermap_cmap_gpc: Optional[str] = "coolwarm"
|
|
835
|
+
clustermap_cmap_cpg: Optional[str] = "coolwarm"
|
|
836
|
+
clustermap_cmap_a: Optional[str] = "coolwarm"
|
|
837
|
+
spatial_clustermap_sortby: Optional[str] = "gpc"
|
|
774
838
|
|
|
775
839
|
# Spatial Analysis - UMAP/Leiden params
|
|
776
|
-
layer_for_umap_plotting: Optional[str] =
|
|
777
|
-
umap_layers_to_plot: List[str] = field(
|
|
840
|
+
layer_for_umap_plotting: Optional[str] = "nan_half"
|
|
841
|
+
umap_layers_to_plot: List[str] = field(
|
|
842
|
+
default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
|
|
843
|
+
)
|
|
778
844
|
|
|
779
845
|
# Spatial Analysis - Spatial Autocorrelation params
|
|
846
|
+
autocorr_normalization_method: str = "pearson"
|
|
780
847
|
rows_per_qc_autocorr_grid: int = 12
|
|
781
848
|
autocorr_rolling_window_size: int = 25
|
|
782
849
|
autocorr_max_lag: int = 800
|
|
783
|
-
autocorr_site_types: List[str] = field(default_factory=lambda: [
|
|
850
|
+
autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])
|
|
784
851
|
|
|
785
852
|
# Spatial Analysis - Correlation Matrix params
|
|
786
|
-
correlation_matrix_types: List[str] = field(
|
|
787
|
-
|
|
788
|
-
|
|
853
|
+
correlation_matrix_types: List[str] = field(
|
|
854
|
+
default_factory=lambda: ["pearson", "binary_covariance"]
|
|
855
|
+
)
|
|
856
|
+
correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
|
|
857
|
+
correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
|
|
789
858
|
|
|
790
859
|
# HMM params
|
|
791
860
|
hmm_n_states: int = 2
|
|
792
|
-
hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
|
|
793
|
-
hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
|
|
861
|
+
hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
|
|
862
|
+
hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
|
|
794
863
|
hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
|
|
795
864
|
hmm_eps: float = 1e-8
|
|
796
865
|
hmm_dtype: str = "float64"
|
|
@@ -798,15 +867,28 @@ class ExperimentConfig:
|
|
|
798
867
|
hmm_batch_size: int = 1024
|
|
799
868
|
hmm_use_viterbi: bool = False
|
|
800
869
|
hmm_device: Optional[str] = None
|
|
801
|
-
hmm_methbases: Optional[List[str]] =
|
|
870
|
+
hmm_methbases: Optional[List[str]] = (
|
|
871
|
+
None # if None, HMM.annotate_adata will fall back to mod_target_bases
|
|
872
|
+
)
|
|
873
|
+
# HMM fitting/application strategy
|
|
874
|
+
hmm_fit_strategy: str = "per_group" # "per_group" | "shared_transitions"
|
|
875
|
+
hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
|
|
876
|
+
hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
|
|
877
|
+
# Shared-transitions adaptation behavior
|
|
878
|
+
hmm_adapt_emissions: bool = True
|
|
879
|
+
hmm_adapt_startprobs: bool = True
|
|
880
|
+
hmm_emission_adapt_iters: int = 5
|
|
881
|
+
hmm_emission_adapt_tol: float = 1e-4
|
|
802
882
|
footprints: Optional[bool] = True
|
|
803
883
|
accessible_patches: Optional[bool] = True
|
|
804
884
|
cpg: Optional[bool] = False
|
|
805
885
|
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
806
|
-
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None,
|
|
807
|
-
clustermap_cmap_hmm: Optional[str] =
|
|
808
|
-
hmm_clustermap_feature_layers: List[str] = field(
|
|
809
|
-
|
|
886
|
+
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
|
|
887
|
+
clustermap_cmap_hmm: Optional[str] = "coolwarm"
|
|
888
|
+
hmm_clustermap_feature_layers: List[str] = field(
|
|
889
|
+
default_factory=lambda: ["all_accessible_features"]
|
|
890
|
+
)
|
|
891
|
+
hmm_clustermap_sortby: Optional[str] = "hmm"
|
|
810
892
|
hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
|
|
811
893
|
|
|
812
894
|
# Pipeline control flow - load adata
|
|
@@ -830,7 +912,7 @@ class ExperimentConfig:
|
|
|
830
912
|
force_redo_filter_reads_on_modification_thresholds: bool = False
|
|
831
913
|
bypass_flag_duplicate_reads: bool = False
|
|
832
914
|
force_redo_flag_duplicate_reads: bool = False
|
|
833
|
-
bypass_complexity_analysis: bool = False
|
|
915
|
+
bypass_complexity_analysis: bool = False
|
|
834
916
|
force_redo_complexity_analysis: bool = False
|
|
835
917
|
|
|
836
918
|
# Pipeline control flow - Spatial Analyses
|
|
@@ -910,7 +992,9 @@ class ExperimentConfig:
|
|
|
910
992
|
defaults_loaded = dict(defaults_map[modality] or {})
|
|
911
993
|
defaults_source_chain = [f"defaults_map['{modality}']"]
|
|
912
994
|
elif defaults_dir is not None:
|
|
913
|
-
defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
|
|
995
|
+
defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
|
|
996
|
+
defaults_dir, modality
|
|
997
|
+
)
|
|
914
998
|
|
|
915
999
|
# If CSV asks to extend defaults, load those and merge
|
|
916
1000
|
merged = dict(defaults_loaded or {})
|
|
@@ -925,7 +1009,11 @@ class ExperimentConfig:
|
|
|
925
1009
|
else:
|
|
926
1010
|
ext_list = []
|
|
927
1011
|
for ext in ext_list:
|
|
928
|
-
ext_defaults, ext_sources = (
|
|
1012
|
+
ext_defaults, ext_sources = (
|
|
1013
|
+
load_defaults_with_inheritance(defaults_dir, ext)
|
|
1014
|
+
if defaults_dir
|
|
1015
|
+
else ({}, [])
|
|
1016
|
+
)
|
|
929
1017
|
merged = deep_merge(merged, ext_defaults)
|
|
930
1018
|
for s in ext_sources:
|
|
931
1019
|
if s not in defaults_source_chain:
|
|
@@ -955,34 +1043,40 @@ class ExperimentConfig:
|
|
|
955
1043
|
merged["experiment_name"] = f"{date_str}_SMF_experiment"
|
|
956
1044
|
|
|
957
1045
|
# Input file types and path handling
|
|
958
|
-
input_data_path = Path(merged[
|
|
1046
|
+
input_data_path = Path(merged["input_data_path"])
|
|
959
1047
|
|
|
960
1048
|
# Detect the input filetype
|
|
961
1049
|
if input_data_path.is_file():
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
1050
|
+
suffix = input_data_path.suffix.lower()
|
|
1051
|
+
suffixes = [
|
|
1052
|
+
s.lower() for s in input_data_path.suffixes
|
|
1053
|
+
] # handles multi-part extensions
|
|
1054
|
+
|
|
1055
|
+
# recognize multi-suffix cases like .fastq.gz or .fq.gz
|
|
1056
|
+
if any(s in [".pod5", ".p5"] for s in suffixes):
|
|
1057
|
+
input_type = "pod5"
|
|
1058
|
+
input_files = [Path(input_data_path)]
|
|
1059
|
+
elif any(s in [".fast5", ".f5"] for s in suffixes):
|
|
1060
|
+
input_type = "fast5"
|
|
1061
|
+
input_files = [Path(input_data_path)]
|
|
1062
|
+
elif any(s in [".fastq", ".fq"] for s in suffixes):
|
|
1063
|
+
input_type = "fastq"
|
|
1064
|
+
input_files = [Path(input_data_path)]
|
|
1065
|
+
elif any(s in [".bam"] for s in suffixes):
|
|
1066
|
+
input_type = "bam"
|
|
1067
|
+
input_files = [Path(input_data_path)]
|
|
1068
|
+
elif any(s in [".h5ad", ".h5"] for s in suffixes):
|
|
1069
|
+
input_type = "h5ad"
|
|
1070
|
+
input_files = [Path(input_data_path)]
|
|
1071
|
+
else:
|
|
1072
|
+
print("Error detecting input file type")
|
|
983
1073
|
|
|
984
1074
|
elif input_data_path.is_dir():
|
|
985
|
-
found = discover_input_files(
|
|
1075
|
+
found = discover_input_files(
|
|
1076
|
+
input_data_path,
|
|
1077
|
+
bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
|
|
1078
|
+
recursive=merged["recursive_input_search"],
|
|
1079
|
+
)
|
|
986
1080
|
|
|
987
1081
|
if found["input_is_pod5"]:
|
|
988
1082
|
input_type = "pod5"
|
|
@@ -1010,12 +1104,12 @@ class ExperimentConfig:
|
|
|
1010
1104
|
)
|
|
1011
1105
|
|
|
1012
1106
|
# summary file output path
|
|
1013
|
-
output_dir = Path(merged[
|
|
1014
|
-
summary_file_basename = merged["experiment_name"] +
|
|
1107
|
+
output_dir = Path(merged["output_directory"])
|
|
1108
|
+
summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
|
|
1015
1109
|
summary_file = output_dir / summary_file_basename
|
|
1016
1110
|
|
|
1017
1111
|
# Demultiplexing output path
|
|
1018
|
-
split_dir = merged.get("split_dir",
|
|
1112
|
+
split_dir = merged.get("split_dir", SPLIT_DIR)
|
|
1019
1113
|
split_path = output_dir / split_dir
|
|
1020
1114
|
|
|
1021
1115
|
# final normalization
|
|
@@ -1039,7 +1133,14 @@ class ExperimentConfig:
|
|
|
1039
1133
|
merged["hm5C_threshold"],
|
|
1040
1134
|
]
|
|
1041
1135
|
|
|
1042
|
-
for bkey in (
|
|
1136
|
+
for bkey in (
|
|
1137
|
+
"barcode_both_ends",
|
|
1138
|
+
"trim",
|
|
1139
|
+
"input_already_demuxed",
|
|
1140
|
+
"make_bigwigs",
|
|
1141
|
+
"skip_unclassified",
|
|
1142
|
+
"delete_batch_hdfs",
|
|
1143
|
+
):
|
|
1043
1144
|
if bkey in merged:
|
|
1044
1145
|
merged[bkey] = _parse_bool(merged[bkey])
|
|
1045
1146
|
|
|
@@ -1048,12 +1149,12 @@ class ExperimentConfig:
|
|
|
1048
1149
|
if "threads" in merged:
|
|
1049
1150
|
tval = _parse_numeric(merged.get("threads", None), None)
|
|
1050
1151
|
merged["threads"] = None if tval is None else int(tval)
|
|
1051
|
-
|
|
1152
|
+
|
|
1052
1153
|
if "aligner_args" in merged and merged.get("aligner_args") is None:
|
|
1053
1154
|
merged.pop("aligner_args", None)
|
|
1054
1155
|
|
|
1055
1156
|
# --- Resolve aligner_args into concrete list for the chosen aligner ---
|
|
1056
|
-
merged[
|
|
1157
|
+
merged["aligner_args"] = resolve_aligner_args(merged)
|
|
1057
1158
|
|
|
1058
1159
|
if "mod_list" in merged:
|
|
1059
1160
|
merged["mod_list"] = _parse_list(merged.get("mod_list"))
|
|
@@ -1068,11 +1169,22 @@ class ExperimentConfig:
|
|
|
1068
1169
|
# allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
|
|
1069
1170
|
maybe_fs = {}
|
|
1070
1171
|
if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
|
|
1071
|
-
maybe_fs["footprint"] = {
|
|
1172
|
+
maybe_fs["footprint"] = {
|
|
1173
|
+
"features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
|
|
1174
|
+
"state": merged.get("hmm_footprint_state", "Non-Modified"),
|
|
1175
|
+
}
|
|
1072
1176
|
if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
|
|
1073
|
-
maybe_fs["accessible"] = {
|
|
1177
|
+
maybe_fs["accessible"] = {
|
|
1178
|
+
"features": merged.get(
|
|
1179
|
+
"hmm_accessible_ranges", merged.get("accessible_ranges")
|
|
1180
|
+
),
|
|
1181
|
+
"state": merged.get("hmm_accessible_state", "Modified"),
|
|
1182
|
+
}
|
|
1074
1183
|
if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
|
|
1075
|
-
maybe_fs["cpg"] = {
|
|
1184
|
+
maybe_fs["cpg"] = {
|
|
1185
|
+
"features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
|
|
1186
|
+
"state": merged.get("hmm_cpg_state", "Modified"),
|
|
1187
|
+
}
|
|
1076
1188
|
if maybe_fs:
|
|
1077
1189
|
merged.setdefault("hmm_feature_sets", {})
|
|
1078
1190
|
for k, v in maybe_fs.items():
|
|
@@ -1093,10 +1205,23 @@ class ExperimentConfig:
|
|
|
1093
1205
|
if not hmm_methbases: # None or []
|
|
1094
1206
|
hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
|
|
1095
1207
|
if not hmm_methbases:
|
|
1096
|
-
hmm_methbases = [
|
|
1208
|
+
hmm_methbases = ["C"]
|
|
1097
1209
|
hmm_methbases = list(hmm_methbases)
|
|
1098
1210
|
hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
|
|
1099
|
-
hmm_clustermap_feature_layers = _parse_list(
|
|
1211
|
+
hmm_clustermap_feature_layers = _parse_list(
|
|
1212
|
+
merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
|
|
1213
|
+
)
|
|
1214
|
+
|
|
1215
|
+
hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
|
|
1216
|
+
hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
|
|
1217
|
+
hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
|
|
1218
|
+
|
|
1219
|
+
hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
|
|
1220
|
+
hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
|
|
1221
|
+
hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
|
|
1222
|
+
hmm_emission_adapt_tol = float(
|
|
1223
|
+
_parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
|
|
1224
|
+
)
|
|
1100
1225
|
|
|
1101
1226
|
# HMM peak feature configs (for call_hmm_peaks)
|
|
1102
1227
|
merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
|
|
@@ -1106,165 +1231,255 @@ class ExperimentConfig:
|
|
|
1106
1231
|
|
|
1107
1232
|
# instantiate dataclass
|
|
1108
1233
|
instance = cls(
|
|
1109
|
-
smf_modality
|
|
1110
|
-
input_data_path
|
|
1111
|
-
recursive_input_search
|
|
1112
|
-
input_type
|
|
1113
|
-
input_files
|
|
1114
|
-
output_directory
|
|
1115
|
-
summary_file
|
|
1116
|
-
fasta
|
|
1117
|
-
sequencer
|
|
1118
|
-
model_dir
|
|
1119
|
-
barcode_kit
|
|
1120
|
-
fastq_barcode_map
|
|
1121
|
-
fastq_auto_pairing
|
|
1122
|
-
bam_suffix
|
|
1123
|
-
split_dir
|
|
1124
|
-
split_path
|
|
1125
|
-
strands
|
|
1126
|
-
conversions
|
|
1127
|
-
fasta_regions_of_interest
|
|
1128
|
-
mapping_threshold
|
|
1129
|
-
experiment_name
|
|
1130
|
-
model
|
|
1131
|
-
barcode_both_ends
|
|
1132
|
-
trim
|
|
1133
|
-
input_already_demuxed
|
|
1134
|
-
threads
|
|
1135
|
-
sample_sheet_path
|
|
1136
|
-
sample_sheet_mapping_column
|
|
1137
|
-
delete_intermediate_bams
|
|
1138
|
-
delete_intermediate_tsvs
|
|
1139
|
-
align_from_bam
|
|
1140
|
-
aligner
|
|
1141
|
-
aligner_args
|
|
1142
|
-
device
|
|
1143
|
-
make_bigwigs
|
|
1144
|
-
make_beds
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1234
|
+
smf_modality=merged.get("smf_modality"),
|
|
1235
|
+
input_data_path=input_data_path,
|
|
1236
|
+
recursive_input_search=merged.get("recursive_input_search"),
|
|
1237
|
+
input_type=input_type,
|
|
1238
|
+
input_files=input_files,
|
|
1239
|
+
output_directory=output_dir,
|
|
1240
|
+
summary_file=summary_file,
|
|
1241
|
+
fasta=merged.get("fasta"),
|
|
1242
|
+
sequencer=merged.get("sequencer"),
|
|
1243
|
+
model_dir=merged.get("model_dir"),
|
|
1244
|
+
barcode_kit=merged.get("barcode_kit"),
|
|
1245
|
+
fastq_barcode_map=merged.get("fastq_barcode_map"),
|
|
1246
|
+
fastq_auto_pairing=merged.get("fastq_auto_pairing"),
|
|
1247
|
+
bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
|
|
1248
|
+
split_dir=split_dir,
|
|
1249
|
+
split_path=split_path,
|
|
1250
|
+
strands=merged.get("strands", STRANDS),
|
|
1251
|
+
conversions=merged.get("conversions", CONVERSIONS),
|
|
1252
|
+
fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
|
|
1253
|
+
mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
|
|
1254
|
+
experiment_name=merged.get("experiment_name"),
|
|
1255
|
+
model=merged.get("model", "hac"),
|
|
1256
|
+
barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
|
|
1257
|
+
trim=merged.get("trim", TRIM),
|
|
1258
|
+
input_already_demuxed=merged.get("input_already_demuxed", False),
|
|
1259
|
+
threads=merged.get("threads"),
|
|
1260
|
+
sample_sheet_path=merged.get("sample_sheet_path"),
|
|
1261
|
+
sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
|
|
1262
|
+
delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
|
|
1263
|
+
delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
|
|
1264
|
+
align_from_bam=merged.get("align_from_bam", False),
|
|
1265
|
+
aligner=merged.get("aligner", "minimap2"),
|
|
1266
|
+
aligner_args=merged.get("aligner_args", None),
|
|
1267
|
+
device=merged.get("device", "auto"),
|
|
1268
|
+
make_bigwigs=merged.get("make_bigwigs", False),
|
|
1269
|
+
make_beds=merged.get("make_beds", False),
|
|
1270
|
+
samtools_backend=merged.get("samtools_backend", "auto"),
|
|
1271
|
+
bedtools_backend=merged.get("bedtools_backend", "auto"),
|
|
1272
|
+
bigwig_backend=merged.get("bigwig_backend", "auto"),
|
|
1273
|
+
delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
|
|
1274
|
+
mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
|
|
1275
|
+
enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
|
|
1276
|
+
conversion_types=merged.get("conversions", ["unconverted"])
|
|
1277
|
+
+ merged.get("conversion_types", ["5mC"]),
|
|
1278
|
+
filter_threshold=merged.get("filter_threshold", 0.8),
|
|
1279
|
+
m6A_threshold=merged.get("m6A_threshold", 0.7),
|
|
1280
|
+
m5C_threshold=merged.get("m5C_threshold", 0.7),
|
|
1281
|
+
hm5C_threshold=merged.get("hm5C_threshold", 0.7),
|
|
1282
|
+
thresholds=merged.get("thresholds", []),
|
|
1283
|
+
mod_list=merged.get("mod_list", list(MOD_LIST)),
|
|
1284
|
+
mod_map=merged.get("mod_map", list(MOD_MAP)),
|
|
1285
|
+
batch_size=merged.get("batch_size", 4),
|
|
1286
|
+
skip_unclassified=merged.get("skip_unclassified", True),
|
|
1287
|
+
delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
|
|
1288
|
+
reference_column=merged.get("reference_column", REF_COL),
|
|
1289
|
+
sample_column=merged.get("sample_column", SAMPLE_COL),
|
|
1290
|
+
sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
|
|
1291
|
+
obs_to_plot_pp_qc=obs_to_plot_pp_qc,
|
|
1292
|
+
fit_position_methylation_thresholds=merged.get(
|
|
1293
|
+
"fit_position_methylation_thresholds", False
|
|
1294
|
+
),
|
|
1295
|
+
binarize_on_fixed_methlyation_threshold=merged.get(
|
|
1296
|
+
"binarize_on_fixed_methlyation_threshold", 0.7
|
|
1297
|
+
),
|
|
1298
|
+
positive_control_sample_methylation_fitting=merged.get(
|
|
1299
|
+
"positive_control_sample_methylation_fitting", None
|
|
1300
|
+
),
|
|
1301
|
+
negative_control_sample_methylation_fitting=merged.get(
|
|
1302
|
+
"negative_control_sample_methylation_fitting", None
|
|
1303
|
+
),
|
|
1304
|
+
infer_on_percentile_sample_methylation_fitting=merged.get(
|
|
1305
|
+
"infer_on_percentile_sample_methylation_fitting", 10
|
|
1306
|
+
),
|
|
1307
|
+
inference_variable_sample_methylation_fitting=merged.get(
|
|
1308
|
+
"inference_variable_sample_methylation_fitting", "Raw_modification_signal"
|
|
1309
|
+
),
|
|
1310
|
+
fit_j_threshold=merged.get("fit_j_threshold", 0.5),
|
|
1311
|
+
output_binary_layer_name=merged.get(
|
|
1312
|
+
"output_binary_layer_name", "binarized_methylation"
|
|
1313
|
+
),
|
|
1314
|
+
reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
|
|
1315
|
+
reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
|
|
1316
|
+
layer_for_clustermap_plotting=merged.get(
|
|
1317
|
+
"layer_for_clustermap_plotting", "nan0_0minus1"
|
|
1318
|
+
),
|
|
1319
|
+
clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
|
|
1320
|
+
clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
|
|
1321
|
+
clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
|
|
1322
|
+
clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
|
|
1323
|
+
spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
|
|
1324
|
+
layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
|
|
1325
|
+
umap_layers_to_plot=merged.get(
|
|
1326
|
+
"umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
|
|
1327
|
+
),
|
|
1328
|
+
rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
|
|
1329
|
+
rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
|
|
1330
|
+
autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
|
|
1331
|
+
autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
|
|
1332
|
+
autocorr_max_lag=merged.get("autocorr_max_lag", 800),
|
|
1333
|
+
autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
|
|
1334
|
+
hmm_n_states=merged.get("hmm_n_states", 2),
|
|
1335
|
+
hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
|
|
1336
|
+
hmm_init_transition_probs=merged.get(
|
|
1337
|
+
"hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
|
|
1338
|
+
),
|
|
1339
|
+
hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
|
|
1340
|
+
hmm_eps=merged.get("hmm_eps", 1e-8),
|
|
1341
|
+
hmm_fit_strategy=hmm_fit_strategy,
|
|
1342
|
+
hmm_shared_scope=hmm_shared_scope,
|
|
1343
|
+
hmm_groupby=hmm_groupby,
|
|
1344
|
+
hmm_adapt_emissions=hmm_adapt_emissions,
|
|
1345
|
+
hmm_adapt_startprobs=hmm_adapt_startprobs,
|
|
1346
|
+
hmm_emission_adapt_iters=hmm_emission_adapt_iters,
|
|
1347
|
+
hmm_emission_adapt_tol=hmm_emission_adapt_tol,
|
|
1348
|
+
hmm_dtype=merged.get("hmm_dtype", "float64"),
|
|
1349
|
+
hmm_feature_sets=hmm_feature_sets,
|
|
1350
|
+
hmm_annotation_threshold=hmm_annotation_threshold,
|
|
1351
|
+
hmm_batch_size=hmm_batch_size,
|
|
1352
|
+
hmm_use_viterbi=hmm_use_viterbi,
|
|
1353
|
+
hmm_methbases=hmm_methbases,
|
|
1354
|
+
hmm_device=hmm_device,
|
|
1355
|
+
hmm_merge_layer_features=hmm_merge_layer_features,
|
|
1356
|
+
clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
|
|
1357
|
+
hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
|
|
1358
|
+
hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
|
|
1359
|
+
hmm_peak_feature_configs=hmm_peak_feature_configs,
|
|
1360
|
+
footprints=merged.get("footprints", None),
|
|
1361
|
+
accessible_patches=merged.get("accessible_patches", None),
|
|
1362
|
+
cpg=merged.get("cpg", None),
|
|
1363
|
+
read_coord_filter=merged.get("read_coord_filter", [None, None]),
|
|
1364
|
+
read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
|
|
1365
|
+
read_len_to_ref_ratio_filter_thresholds=merged.get(
|
|
1366
|
+
"read_len_to_ref_ratio_filter_thresholds", [0.3, None]
|
|
1367
|
+
),
|
|
1368
|
+
read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
|
|
1369
|
+
read_mapping_quality_filter_thresholds=merged.get(
|
|
1370
|
+
"read_mapping_quality_filter_thresholds", [None, None]
|
|
1371
|
+
),
|
|
1372
|
+
read_mod_filtering_gpc_thresholds=merged.get(
|
|
1373
|
+
"read_mod_filtering_gpc_thresholds", [0.025, 0.975]
|
|
1374
|
+
),
|
|
1375
|
+
read_mod_filtering_cpg_thresholds=merged.get(
|
|
1376
|
+
"read_mod_filtering_cpg_thresholds", [0.0, 1.0]
|
|
1377
|
+
),
|
|
1378
|
+
read_mod_filtering_c_thresholds=merged.get(
|
|
1379
|
+
"read_mod_filtering_c_thresholds", [0.025, 0.975]
|
|
1380
|
+
),
|
|
1381
|
+
read_mod_filtering_a_thresholds=merged.get(
|
|
1382
|
+
"read_mod_filtering_a_thresholds", [0.025, 0.975]
|
|
1383
|
+
),
|
|
1384
|
+
read_mod_filtering_use_other_c_as_background=merged.get(
|
|
1385
|
+
"read_mod_filtering_use_other_c_as_background", True
|
|
1386
|
+
),
|
|
1387
|
+
min_valid_fraction_positions_in_read_vs_ref=merged.get(
|
|
1388
|
+
"min_valid_fraction_positions_in_read_vs_ref", 0.2
|
|
1389
|
+
),
|
|
1390
|
+
duplicate_detection_site_types=merged.get(
|
|
1391
|
+
"duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
1392
|
+
),
|
|
1393
|
+
duplicate_detection_distance_threshold=merged.get(
|
|
1394
|
+
"duplicate_detection_distance_threshold", 0.07
|
|
1395
|
+
),
|
|
1396
|
+
duplicate_detection_keep_best_metric=merged.get(
|
|
1397
|
+
"duplicate_detection_keep_best_metric", "read_quality"
|
|
1398
|
+
),
|
|
1399
|
+
duplicate_detection_window_size_for_hamming_neighbors=merged.get(
|
|
1400
|
+
"duplicate_detection_window_size_for_hamming_neighbors", 50
|
|
1401
|
+
),
|
|
1402
|
+
duplicate_detection_min_overlapping_positions=merged.get(
|
|
1403
|
+
"duplicate_detection_min_overlapping_positions", 20
|
|
1404
|
+
),
|
|
1405
|
+
duplicate_detection_do_hierarchical=merged.get(
|
|
1406
|
+
"duplicate_detection_do_hierarchical", True
|
|
1407
|
+
),
|
|
1408
|
+
duplicate_detection_hierarchical_linkage=merged.get(
|
|
1409
|
+
"duplicate_detection_hierarchical_linkage", "average"
|
|
1410
|
+
),
|
|
1411
|
+
duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
|
|
1412
|
+
position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
|
|
1413
|
+
correlation_matrix_types=merged.get(
|
|
1414
|
+
"correlation_matrix_types", ["pearson", "binary_covariance"]
|
|
1415
|
+
),
|
|
1416
|
+
correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
|
|
1417
|
+
correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
|
|
1418
|
+
hamming_vs_metric_keys=merged.get(
|
|
1419
|
+
"hamming_vs_metric_keys", ["Fraction_C_site_modified"]
|
|
1420
|
+
),
|
|
1421
|
+
force_redo_load_adata=merged.get("force_redo_load_adata", False),
|
|
1422
|
+
force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
|
|
1423
|
+
force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
|
|
1424
|
+
bypass_add_read_length_and_mapping_qc=merged.get(
|
|
1425
|
+
"bypass_add_read_length_and_mapping_qc", False
|
|
1426
|
+
),
|
|
1427
|
+
force_redo_add_read_length_and_mapping_qc=merged.get(
|
|
1428
|
+
"force_redo_add_read_length_and_mapping_qc", False
|
|
1429
|
+
),
|
|
1430
|
+
bypass_clean_nan=merged.get("bypass_clean_nan", False),
|
|
1431
|
+
force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
|
|
1432
|
+
bypass_append_base_context=merged.get("bypass_append_base_context", False),
|
|
1433
|
+
force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
|
|
1434
|
+
invert_adata=merged.get("invert_adata", False),
|
|
1435
|
+
bypass_append_binary_layer_by_base_context=merged.get(
|
|
1436
|
+
"bypass_append_binary_layer_by_base_context", False
|
|
1437
|
+
),
|
|
1438
|
+
force_redo_append_binary_layer_by_base_context=merged.get(
|
|
1439
|
+
"force_redo_append_binary_layer_by_base_context", False
|
|
1440
|
+
),
|
|
1441
|
+
bypass_calculate_read_modification_stats=merged.get(
|
|
1442
|
+
"bypass_calculate_read_modification_stats", False
|
|
1443
|
+
),
|
|
1444
|
+
force_redo_calculate_read_modification_stats=merged.get(
|
|
1445
|
+
"force_redo_calculate_read_modification_stats", False
|
|
1446
|
+
),
|
|
1447
|
+
bypass_filter_reads_on_modification_thresholds=merged.get(
|
|
1448
|
+
"bypass_filter_reads_on_modification_thresholds", False
|
|
1449
|
+
),
|
|
1450
|
+
force_redo_filter_reads_on_modification_thresholds=merged.get(
|
|
1451
|
+
"force_redo_filter_reads_on_modification_thresholds", False
|
|
1452
|
+
),
|
|
1453
|
+
bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
|
|
1454
|
+
force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
|
|
1455
|
+
bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
|
|
1456
|
+
force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
|
|
1457
|
+
force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
|
|
1458
|
+
bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
|
|
1459
|
+
force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
|
|
1460
|
+
bypass_basic_umap=merged.get("bypass_basic_umap", False),
|
|
1461
|
+
force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
|
|
1462
|
+
bypass_spatial_autocorr_calculations=merged.get(
|
|
1463
|
+
"bypass_spatial_autocorr_calculations", False
|
|
1464
|
+
),
|
|
1465
|
+
force_redo_spatial_autocorr_calculations=merged.get(
|
|
1466
|
+
"force_redo_spatial_autocorr_calculations", False
|
|
1467
|
+
),
|
|
1468
|
+
bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
|
|
1469
|
+
force_redo_spatial_autocorr_plotting=merged.get(
|
|
1470
|
+
"force_redo_spatial_autocorr_plotting", False
|
|
1471
|
+
),
|
|
1472
|
+
bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
|
|
1473
|
+
force_redo_matrix_corr_calculations=merged.get(
|
|
1474
|
+
"force_redo_matrix_corr_calculations", False
|
|
1475
|
+
),
|
|
1476
|
+
bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
|
|
1477
|
+
force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
|
|
1478
|
+
bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
|
|
1479
|
+
force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
|
|
1480
|
+
bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
|
|
1481
|
+
force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
|
|
1482
|
+
config_source=config_source or "<var_dict>",
|
|
1268
1483
|
)
|
|
1269
1484
|
|
|
1270
1485
|
report = {
|
|
@@ -1291,9 +1506,20 @@ class ExperimentConfig:
|
|
|
1291
1506
|
Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
|
|
1292
1507
|
Additional kwargs passed to from_var_dict().
|
|
1293
1508
|
"""
|
|
1294
|
-
loader =
|
|
1509
|
+
loader = (
|
|
1510
|
+
LoadExperimentConfig(csv_input)
|
|
1511
|
+
if not isinstance(csv_input, pd.DataFrame)
|
|
1512
|
+
else LoadExperimentConfig(pd.DataFrame(csv_input))
|
|
1513
|
+
)
|
|
1295
1514
|
var_dict = loader.var_dict
|
|
1296
|
-
return cls.from_var_dict(
|
|
1515
|
+
return cls.from_var_dict(
|
|
1516
|
+
var_dict,
|
|
1517
|
+
date_str=date_str,
|
|
1518
|
+
config_source=config_source,
|
|
1519
|
+
defaults_dir=defaults_dir,
|
|
1520
|
+
defaults_map=defaults_map,
|
|
1521
|
+
**kwargs,
|
|
1522
|
+
)
|
|
1297
1523
|
|
|
1298
1524
|
# -------------------------
|
|
1299
1525
|
# validation & serialization
|
|
@@ -1306,7 +1532,9 @@ class ExperimentConfig:
|
|
|
1306
1532
|
return errs
|
|
1307
1533
|
for g, info in hfs.items():
|
|
1308
1534
|
if not isinstance(info, dict):
|
|
1309
|
-
errs.append(
|
|
1535
|
+
errs.append(
|
|
1536
|
+
f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
|
|
1537
|
+
)
|
|
1310
1538
|
continue
|
|
1311
1539
|
feats = info.get("features")
|
|
1312
1540
|
if not isinstance(feats, dict) or len(feats) == 0:
|
|
@@ -1316,7 +1544,9 @@ class ExperimentConfig:
|
|
|
1316
1544
|
try:
|
|
1317
1545
|
lo, hi = float(rng[0]), float(rng[1])
|
|
1318
1546
|
if lo < 0 or hi <= lo:
|
|
1319
|
-
errs.append(
|
|
1547
|
+
errs.append(
|
|
1548
|
+
f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
|
|
1549
|
+
)
|
|
1320
1550
|
except Exception:
|
|
1321
1551
|
errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
|
|
1322
1552
|
return errs
|
|
@@ -1349,13 +1579,18 @@ class ExperimentConfig:
|
|
|
1349
1579
|
|
|
1350
1580
|
if not (0.0 <= float(self.mapping_threshold) <= 1.0):
|
|
1351
1581
|
errors.append("mapping_threshold must be in [0,1].")
|
|
1352
|
-
for t in (
|
|
1582
|
+
for t in (
|
|
1583
|
+
self.filter_threshold,
|
|
1584
|
+
self.m6A_threshold,
|
|
1585
|
+
self.m5C_threshold,
|
|
1586
|
+
self.hm5C_threshold,
|
|
1587
|
+
):
|
|
1353
1588
|
if not (0.0 <= float(t) <= 1.0):
|
|
1354
1589
|
errors.append(f"threshold value {t} must be in [0,1].")
|
|
1355
1590
|
|
|
1356
1591
|
if raise_on_error and errors:
|
|
1357
1592
|
raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
|
|
1358
|
-
|
|
1593
|
+
|
|
1359
1594
|
errs = _validate_hmm_features_structure(self.hmm_feature_sets)
|
|
1360
1595
|
errors.extend(errs)
|
|
1361
1596
|
|