smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
```diff
--- a/smftools/config/experiment_config.py
+++ b/smftools/config/experiment_config.py
@@ -1,11 +1,26 @@
 # experiment_config.py
 from __future__ import annotations
+
 import ast
 import json
 import warnings
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import IO, Any, Dict, List, Optional, Sequence, Tuple, Union
+
+from smftools.constants import (
+    BAM_SUFFIX,
+    BARCODE_BOTH_ENDS,
+    CONVERSIONS,
+    MOD_LIST,
+    MOD_MAP,
+    REF_COL,
+    SAMPLE_COL,
+    SPLIT_DIR,
+    STRANDS,
+    TRIM,
+)
+
 from .discover_input_files import discover_input_files

 # Optional dependency for YAML handling
```
```diff
@@ -14,8 +29,8 @@ try:
 except Exception:
     yaml = None

-import pandas as pd
 import numpy as np
+import pandas as pd


 # -------------------------
```
```diff
@@ -81,6 +96,7 @@ def _parse_numeric(v: Any, fallback: Any = None) -> Any:
     except Exception:
         return fallback

+
 def _try_json_or_literal(s: Any) -> Any:
     """Try parse JSON or python literal; otherwise return original string."""
     if s is None:
```
```diff
@@ -123,8 +139,8 @@ def resolve_aligner_args(
     """
     # builtin defaults (aligner -> args)
     builtin_defaults = {
-        "minimap2": [
-        "dorado": [
+        "minimap2": ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"],
+        "dorado": ["--mm2-opts", "-N", "5"],
     }
     if default_by_aligner is None:
         default_by_aligner = builtin_defaults
```
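For orientation, the new one-line defaults above translate directly into an aligner invocation. A hedged sketch (the command assembly and the `ref.fa`/`reads.fastq` paths are illustrative, not code from the package):

```python
# Builtin minimap2 defaults from the hunk above, joined into a command line.
# -a: SAM output; -x map-ont: ONT preset; --MD: emit MD tags; -Y: soft-clip
# supplementary alignments; -y: copy FASTQ comments into SAM tags; -N 5: track
# up to 5 secondary alignments, which --secondary=no then keeps out of the output.
minimap2_args = ["-a", "-x", "map-ont", "--MD", "-Y", "-y", "-N", "5", "--secondary=no"]
print("minimap2 " + " ".join(minimap2_args) + " ref.fa reads.fastq")
```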
```diff
@@ -275,6 +291,7 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
         canonical[grp] = {"features": feats, "state": state}
     return canonical

+
 def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
     """
     Normalize user-provided `hmm_peak_feature_configs` into:
```
```diff
@@ -365,12 +382,12 @@ class LoadExperimentConfig:
         df = pd.read_csv(source, dtype=str, keep_default_na=False, na_values=[""])
         # normalize column names
         df.columns = [c.strip() for c in df.columns]
-        if
+        if "variable" not in df.columns:
             raise ValueError("Config CSV must contain a 'variable' column.")
-        if
-        df[
-        if
-        df[
+        if "value" not in df.columns:
+            df["value"] = ""
+        if "type" not in df.columns:
+            df["type"] = ""
         return df

     @staticmethod
```
```diff
@@ -389,9 +406,9 @@ class LoadExperimentConfig:

         def parse_bool(s: str):
             s2 = s.strip().lower()
-            if s2 in (
+            if s2 in ("1", "true", "t", "yes", "y", "on"):
                 return True
-            if s2 in (
+            if s2 in ("0", "false", "f", "no", "n", "off"):
                 return False
             raise ValueError(f"Cannot parse boolean from '{s}'")

```
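The boolean grammar above is self-contained enough to lift out; a minimal sketch mirroring the diff, so the accepted tokens are easy to scan:

```python
# Mirrors parse_bool from the hunk above; tokens are matched case-insensitively.
def parse_bool(s: str):
    s2 = s.strip().lower()
    if s2 in ("1", "true", "t", "yes", "y", "on"):
        return True
    if s2 in ("0", "false", "f", "no", "n", "off"):
        return False
    raise ValueError(f"Cannot parse boolean from '{s}'")

assert parse_bool(" Yes ") is True
assert parse_bool("off") is False
```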
```diff
@@ -411,18 +428,18 @@ class LoadExperimentConfig:
             except Exception:
                 pass
             # fallback split
-            parts = [p.strip() for p in s.strip("()[] ").split(
+            parts = [p.strip() for p in s.strip("()[] ").split(",") if p.strip() != ""]
             return parts

-        if hint in (
+        if hint in ("int", "integer"):
             return int(v)
-        if hint in (
+        if hint in ("float", "double"):
             return float(v)
-        if hint in (
+        if hint in ("bool", "boolean"):
             return parse_bool(v)
-        if hint in (
+        if hint in ("list", "array"):
             return parse_list_like(v)
-        if hint in (
+        if hint in ("string", "str"):
             return v

         # infer
```
```diff
@@ -448,27 +465,31 @@ class LoadExperimentConfig:
                 return lit
             except Exception:
                 pass
-            if (
-                return [p.strip() for p in v.split(
+            if ("," in v) and (not any(ch in v for ch in "{}[]()")):
+                return [p.strip() for p in v.split(",") if p.strip() != ""]
             return v

     def _parse_df(self, df: pd.DataFrame) -> Dict[str, Any]:
         parsed: Dict[str, Any] = {}
         for idx, row in df.iterrows():
-            name = str(row[
+            name = str(row["variable"]).strip()
             if name == "":
                 continue
-            raw_val = row.get(
-            raw_type = row.get(
+            raw_val = row.get("value", "")
+            raw_type = row.get("type", "")
             if pd.isna(raw_val) or str(raw_val).strip() == "":
                 raw_val = None
             try:
                 parsed_val = self._parse_value_as_type(raw_val, raw_type)
             except Exception as e:
-                warnings.warn(
+                warnings.warn(
+                    f"Failed to parse config variable '{name}' (row {idx}): {e}. Storing raw value."
+                )
                 parsed_val = None if raw_val is None else raw_val
             if name in parsed:
-                warnings.warn(
+                warnings.warn(
+                    f"Duplicate config variable '{name}' encountered (row {idx}). Overwriting previous value."
+                )
             parsed[name] = parsed_val
         return parsed

```
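Taken together, the parsing hunks above pin down the config-CSV contract: a required `variable` column, optional `value` and `type` columns, explicit hints (`int`, `float`, `bool`, `list`, `str`), and a comma-split fallback for un-hinted list-like strings. A minimal sketch, assuming the constructor accepts a DataFrame directly as the `from_csv` hunk near the end of this diff suggests:

```python
import pandas as pd
from smftools.config.experiment_config import LoadExperimentConfig

# Illustrative rows only; the column names come from the hunks above.
df = pd.DataFrame(
    {
        "variable": ["trim", "autocorr_max_lag", "autocorr_site_types"],
        "value": ["yes", "800", "GpC, CpG, C"],
        "type": ["bool", "int", "list"],  # blank hints fall back to inference
    }
)
loader = LoadExperimentConfig(df)
# Expected: {'trim': True, 'autocorr_max_lag': 800,
#            'autocorr_site_types': ['GpC', 'CpG', 'C']}
print(loader.var_dict)
```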
```diff
@@ -476,7 +497,7 @@ class ExperimentConfig:
         """Return parsed config as a pandas DataFrame (variable, value)."""
         rows = []
         for k, v in self.var_dict.items():
-            rows.append({
+            rows.append({"variable": k, "value": v})
         return pd.DataFrame(rows)


```
```diff
@@ -644,17 +665,17 @@ class ExperimentConfig:
     input_data_path: Optional[str] = None
     output_directory: Optional[str] = None
     fasta: Optional[str] = None
-    bam_suffix: str =
+    bam_suffix: str = BAM_SUFFIX
     recursive_input_search: bool = True
     input_type: Optional[str] = None
     input_files: Optional[List[Path]] = None
-    split_dir: str =
+    split_dir: str = SPLIT_DIR
     split_path: Optional[str] = None
-    strands: List[str] = field(default_factory=lambda:
-    conversions: List[str] = field(default_factory=lambda:
+    strands: List[str] = field(default_factory=lambda: STRANDS)
+    conversions: List[str] = field(default_factory=lambda: CONVERSIONS)
     fasta_regions_of_interest: Optional[str] = None
     sample_sheet_path: Optional[str] = None
-    sample_sheet_mapping_column: Optional[str] =
+    sample_sheet_mapping_column: Optional[str] = "Experiment_name_and_barcode"
     experiment_name: Optional[str] = None
     input_already_demuxed: bool = False
     summary_file: Optional[Path] = None
```
```diff
@@ -690,8 +711,8 @@ class ExperimentConfig:
     model_dir: Optional[str] = None
     barcode_kit: Optional[str] = None
     model: str = "hac"
-    barcode_both_ends: bool =
-    trim: bool =
+    barcode_both_ends: bool = BARCODE_BOTH_ENDS
+    trim: bool = TRIM
     # General basecalling params
     filter_threshold: float = 0.8
     # Modified basecalling specific params
```
```diff
@@ -699,44 +720,72 @@ class ExperimentConfig:
     m5C_threshold: float = 0.7
     hm5C_threshold: float = 0.7
     thresholds: List[float] = field(default_factory=list)
-    mod_list: List[str] = field(
-        ...
+    mod_list: List[str] = field(
+        default_factory=lambda: list(MOD_LIST)
+    )  # Dorado modified basecalling codes
+    mod_map: Dict[str, str] = field(
+        default_factory=lambda: dict(MOD_MAP)
+    )  # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function

     # Alignment params
-    mapping_threshold: float = 0.01
-    align_from_bam: bool =
+    mapping_threshold: float = 0.01  # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
+    align_from_bam: bool = (
+        False  # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
+    )
     aligner: str = "dorado"
     aligner_args: Optional[List[str]] = None
     make_bigwigs: bool = False
     make_beds: bool = False

     # Anndata structure
-    reference_column: Optional[str] =
-    sample_column: Optional[str] =
+    reference_column: Optional[str] = REF_COL
+    sample_column: Optional[str] = SAMPLE_COL

     # General Plotting
-    sample_name_col_for_plotting: Optional[str] =
+    sample_name_col_for_plotting: Optional[str] = "Barcode"
     rows_per_qc_histogram_grid: int = 12

     # Preprocessing - Read length and quality filter params
     read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
-    read_len_filter_thresholds: Optional[Sequence[float]] = field(
-        ...
+    read_len_filter_thresholds: Optional[Sequence[float]] = field(
+        default_factory=lambda: [100, None]
+    )
+    read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(
+        default_factory=lambda: [0.4, 1.5]
+    )
+    read_quality_filter_thresholds: Optional[Sequence[float]] = field(
+        default_factory=lambda: [15, None]
+    )
+    read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(
+        default_factory=lambda: [None, None]
+    )

     # Preprocessing - Optional reindexing params
     reindexing_offsets: Dict[str, int] = field(default_factory=dict)
     reindexed_var_suffix: Optional[str] = "reindexed"

     # Preprocessing - Direct mod detection binarization params
-    fit_position_methylation_thresholds: Optional[bool] =
-        ...
+    fit_position_methylation_thresholds: Optional[bool] = (
+        False  # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
+    )
+    binarize_on_fixed_methlyation_threshold: Optional[float] = (
+        0.7  # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
+    )
+    positive_control_sample_methylation_fitting: Optional[str] = (
+        None  # A positive control Sample_name to use for fully modified template data
+    )
+    negative_control_sample_methylation_fitting: Optional[str] = (
+        None  # A negative control Sample_name to use for fully unmodified template data
+    )
+    infer_on_percentile_sample_methylation_fitting: Optional[int] = (
+        10  # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
+    )
+    inference_variable_sample_methylation_fitting: Optional[str] = (
+        "Raw_modification_signal"  # The obs column value used for the percentile metric above.
+    )
+    fit_j_threshold: Optional[float] = (
+        0.5  # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
+    )
     output_binary_layer_name: Optional[str] = "binarized_methylation"

     # Preprocessing - Read modification filter params
```
```diff
@@ -748,13 +797,25 @@ class ExperimentConfig:
     min_valid_fraction_positions_in_read_vs_ref: float = 0.2

     # Preprocessing - plotting params
-    obs_to_plot_pp_qc: List[str] = field(
+    obs_to_plot_pp_qc: List[str] = field(
+        default_factory=lambda: [
+            "read_length",
+            "mapped_length",
+            "read_quality",
+            "mapping_quality",
+            "mapped_length_to_reference_length_ratio",
+            "mapped_length_to_read_length_ratio",
+            "Raw_modification_signal",
+        ]
+    )

     # Preprocessing - Duplicate detection params
-    duplicate_detection_site_types: List[str] = field(
+    duplicate_detection_site_types: List[str] = field(
+        default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
+    )
     duplicate_detection_distance_threshold: float = 0.07
-    hamming_vs_metric_keys: List[str] = field(default_factory=lambda: [
-    duplicate_detection_keep_best_metric: str =
+    hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
+    duplicate_detection_keep_best_metric: str = "read_quality"
     duplicate_detection_window_size_for_hamming_neighbors: int = 50
     duplicate_detection_min_overlapping_positions: int = 20
     duplicate_detection_do_hierarchical: bool = True
```
```diff
@@ -765,32 +826,37 @@ class ExperimentConfig:
     position_max_nan_threshold: float = 0.1

     # Spatial Analysis - Clustermap params
-    layer_for_clustermap_plotting: Optional[str] =
-    clustermap_cmap_c: Optional[str] =
-    clustermap_cmap_gpc: Optional[str] =
-    clustermap_cmap_cpg: Optional[str] =
-    clustermap_cmap_a: Optional[str] =
-    spatial_clustermap_sortby: Optional[str] =
+    layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
+    clustermap_cmap_c: Optional[str] = "coolwarm"
+    clustermap_cmap_gpc: Optional[str] = "coolwarm"
+    clustermap_cmap_cpg: Optional[str] = "coolwarm"
+    clustermap_cmap_a: Optional[str] = "coolwarm"
+    spatial_clustermap_sortby: Optional[str] = "gpc"

     # Spatial Analysis - UMAP/Leiden params
-    layer_for_umap_plotting: Optional[str] =
-    umap_layers_to_plot: List[str] = field(
+    layer_for_umap_plotting: Optional[str] = "nan_half"
+    umap_layers_to_plot: List[str] = field(
+        default_factory=lambda: ["mapped_length", "Raw_modification_signal"]
+    )

     # Spatial Analysis - Spatial Autocorrelation params
+    autocorr_normalization_method: str = "pearson"
     rows_per_qc_autocorr_grid: int = 12
     autocorr_rolling_window_size: int = 25
     autocorr_max_lag: int = 800
-    autocorr_site_types: List[str] = field(default_factory=lambda: [
+    autocorr_site_types: List[str] = field(default_factory=lambda: ["GpC", "CpG", "C"])

     # Spatial Analysis - Correlation Matrix params
-    correlation_matrix_types: List[str] = field(
-        ...
+    correlation_matrix_types: List[str] = field(
+        default_factory=lambda: ["pearson", "binary_covariance"]
+    )
+    correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
+    correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])

     # HMM params
     hmm_n_states: int = 2
-    hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
-    hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
+    hmm_init_emission_probs: List[list] = field(default_factory=lambda: [[0.8, 0.2], [0.2, 0.8]])
+    hmm_init_transition_probs: List[list] = field(default_factory=lambda: [[0.9, 0.1], [0.1, 0.9]])
     hmm_init_start_probs: List[float] = field(default_factory=lambda: [0.5, 0.5])
     hmm_eps: float = 1e-8
     hmm_dtype: str = "float64"
```
```diff
@@ -798,15 +864,28 @@ class ExperimentConfig:
     hmm_batch_size: int = 1024
     hmm_use_viterbi: bool = False
     hmm_device: Optional[str] = None
-    hmm_methbases: Optional[List[str]] =
+    hmm_methbases: Optional[List[str]] = (
+        None  # if None, HMM.annotate_adata will fall back to mod_target_bases
+    )
+    # HMM fitting/application strategy
+    hmm_fit_strategy: str = "per_group"  # "per_group" | "shared_transitions"
+    hmm_shared_scope: List[str] = field(default_factory=lambda: ["reference", "methbase"])
+    hmm_groupby: List[str] = field(default_factory=lambda: ["sample", "reference", "methbase"])
+    # Shared-transitions adaptation behavior
+    hmm_adapt_emissions: bool = True
+    hmm_adapt_startprobs: bool = True
+    hmm_emission_adapt_iters: int = 5
+    hmm_emission_adapt_tol: float = 1e-4
     footprints: Optional[bool] = True
     accessible_patches: Optional[bool] = True
     cpg: Optional[bool] = False
     hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
-    hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None,
-    clustermap_cmap_hmm: Optional[str] =
-    hmm_clustermap_feature_layers: List[str] = field(
-        ...
+    hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
+    clustermap_cmap_hmm: Optional[str] = "coolwarm"
+    hmm_clustermap_feature_layers: List[str] = field(
+        default_factory=lambda: ["all_accessible_features"]
+    )
+    hmm_clustermap_sortby: Optional[str] = "hmm"
     hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)

     # Pipeline control flow - load adata
```
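The new fit-strategy fields are plain dataclass knobs, so they can be driven from the same config sources as everything else. A hedged sketch of overrides (the keys are the new field names above; the per-key comments are inferred from names and defaults, not from package documentation):

```python
# Illustrative override dict; keys match the new dataclass fields above.
hmm_overrides = {
    "hmm_fit_strategy": "shared_transitions",  # alternative to the default "per_group"
    "hmm_shared_scope": ["reference", "methbase"],  # scope of the shared fit (assumed semantics)
    "hmm_groupby": ["sample", "reference", "methbase"],  # grouping when applying (assumed semantics)
    "hmm_adapt_emissions": True,     # adapt emissions after sharing transitions
    "hmm_adapt_startprobs": True,    # adapt start probabilities as well
    "hmm_emission_adapt_iters": 5,   # adaptation iterations
    "hmm_emission_adapt_tol": 1e-4,  # adaptation convergence tolerance
}
```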
```diff
@@ -830,7 +909,7 @@ class ExperimentConfig:
     force_redo_filter_reads_on_modification_thresholds: bool = False
     bypass_flag_duplicate_reads: bool = False
     force_redo_flag_duplicate_reads: bool = False
-    bypass_complexity_analysis: bool = False
+    bypass_complexity_analysis: bool = False
     force_redo_complexity_analysis: bool = False

     # Pipeline control flow - Spatial Analyses
```
```diff
@@ -910,7 +989,9 @@ class ExperimentConfig:
             defaults_loaded = dict(defaults_map[modality] or {})
             defaults_source_chain = [f"defaults_map['{modality}']"]
         elif defaults_dir is not None:
-            defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
+            defaults_loaded, defaults_source_chain = load_defaults_with_inheritance(
+                defaults_dir, modality
+            )

         # If CSV asks to extend defaults, load those and merge
         merged = dict(defaults_loaded or {})
```
```diff
@@ -925,7 +1006,11 @@ class ExperimentConfig:
         else:
             ext_list = []
         for ext in ext_list:
-            ext_defaults, ext_sources = (
+            ext_defaults, ext_sources = (
+                load_defaults_with_inheritance(defaults_dir, ext)
+                if defaults_dir
+                else ({}, [])
+            )
             merged = deep_merge(merged, ext_defaults)
             for s in ext_sources:
                 if s not in defaults_source_chain:
```
```diff
@@ -955,34 +1040,40 @@ class ExperimentConfig:
             merged["experiment_name"] = f"{date_str}_SMF_experiment"

         # Input file types and path handling
-        input_data_path = Path(merged[
+        input_data_path = Path(merged["input_data_path"])

         # Detect the input filetype
         if input_data_path.is_file():
-            ...
+            suffix = input_data_path.suffix.lower()
+            suffixes = [
+                s.lower() for s in input_data_path.suffixes
+            ]  # handles multi-part extensions
+
+            # recognize multi-suffix cases like .fastq.gz or .fq.gz
+            if any(s in [".pod5", ".p5"] for s in suffixes):
+                input_type = "pod5"
+                input_files = [Path(input_data_path)]
+            elif any(s in [".fast5", ".f5"] for s in suffixes):
+                input_type = "fast5"
+                input_files = [Path(input_data_path)]
+            elif any(s in [".fastq", ".fq"] for s in suffixes):
+                input_type = "fastq"
+                input_files = [Path(input_data_path)]
+            elif any(s in [".bam"] for s in suffixes):
+                input_type = "bam"
+                input_files = [Path(input_data_path)]
+            elif any(s in [".h5ad", ".h5"] for s in suffixes):
+                input_type = "h5ad"
+                input_files = [Path(input_data_path)]
+            else:
+                print("Error detecting input file type")

         elif input_data_path.is_dir():
-            found = discover_input_files(
+            found = discover_input_files(
+                input_data_path,
+                bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
+                recursive=merged["recursive_input_search"],
+            )

             if found["input_is_pod5"]:
                 input_type = "pod5"
```
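The detection block above switches from a single `suffix` to the full `suffixes` list so multi-part extensions are recognized; standard `pathlib` behavior shows why:

```python
from pathlib import Path

# .suffix sees only the last extension; .suffixes sees all of them,
# so "reads.fastq.gz" still matches the ".fastq" branch above.
print(Path("reads.fastq.gz").suffix)    # '.gz'
print(Path("reads.fastq.gz").suffixes)  # ['.fastq', '.gz']
```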
```diff
@@ -1010,12 +1101,12 @@ class ExperimentConfig:
             )

         # summary file output path
-        output_dir = Path(merged[
-        summary_file_basename = merged["experiment_name"] +
+        output_dir = Path(merged["output_directory"])
+        summary_file_basename = merged["experiment_name"] + "_output_summary.csv"
         summary_file = output_dir / summary_file_basename

         # Demultiplexing output path
-        split_dir = merged.get("split_dir",
+        split_dir = merged.get("split_dir", SPLIT_DIR)
         split_path = output_dir / split_dir

         # final normalization
```
```diff
@@ -1039,7 +1130,14 @@ class ExperimentConfig:
             merged["hm5C_threshold"],
         ]

-        for bkey in (
+        for bkey in (
+            "barcode_both_ends",
+            "trim",
+            "input_already_demuxed",
+            "make_bigwigs",
+            "skip_unclassified",
+            "delete_batch_hdfs",
+        ):
             if bkey in merged:
                 merged[bkey] = _parse_bool(merged[bkey])

```
```diff
@@ -1048,12 +1146,12 @@ class ExperimentConfig:
         if "threads" in merged:
             tval = _parse_numeric(merged.get("threads", None), None)
             merged["threads"] = None if tval is None else int(tval)
-
+
         if "aligner_args" in merged and merged.get("aligner_args") is None:
             merged.pop("aligner_args", None)

         # --- Resolve aligner_args into concrete list for the chosen aligner ---
-        merged[
+        merged["aligner_args"] = resolve_aligner_args(merged)

         if "mod_list" in merged:
             merged["mod_list"] = _parse_list(merged.get("mod_list"))
```
```diff
@@ -1068,11 +1166,22 @@ class ExperimentConfig:
         # allow older names (footprint_ranges, accessible_ranges, cpg_ranges) — optional:
         maybe_fs = {}
         if "footprint_ranges" in merged or "hmm_footprint_ranges" in merged:
-            maybe_fs["footprint"] = {
+            maybe_fs["footprint"] = {
+                "features": merged.get("hmm_footprint_ranges", merged.get("footprint_ranges")),
+                "state": merged.get("hmm_footprint_state", "Non-Modified"),
+            }
         if "accessible_ranges" in merged or "hmm_accessible_ranges" in merged:
-            maybe_fs["accessible"] = {
+            maybe_fs["accessible"] = {
+                "features": merged.get(
+                    "hmm_accessible_ranges", merged.get("accessible_ranges")
+                ),
+                "state": merged.get("hmm_accessible_state", "Modified"),
+            }
         if "cpg_ranges" in merged or "hmm_cpg_ranges" in merged:
-            maybe_fs["cpg"] = {
+            maybe_fs["cpg"] = {
+                "features": merged.get("hmm_cpg_ranges", merged.get("cpg_ranges")),
+                "state": merged.get("hmm_cpg_state", "Modified"),
+            }
         if maybe_fs:
             merged.setdefault("hmm_feature_sets", {})
             for k, v in maybe_fs.items():
```
```diff
@@ -1093,10 +1202,23 @@ class ExperimentConfig:
         if not hmm_methbases:  # None or []
             hmm_methbases = _parse_list(merged.get("mod_target_bases", None))
         if not hmm_methbases:
-            hmm_methbases = [
+            hmm_methbases = ["C"]
         hmm_methbases = list(hmm_methbases)
         hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
-        hmm_clustermap_feature_layers = _parse_list(
+        hmm_clustermap_feature_layers = _parse_list(
+            merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
+        )
+
+        hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
+        hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
+        hmm_groupby = _parse_list(merged.get("hmm_groupby", ["sample", "reference", "methbase"]))
+
+        hmm_adapt_emissions = _parse_bool(merged.get("hmm_adapt_emissions", True))
+        hmm_adapt_startprobs = _parse_bool(merged.get("hmm_adapt_startprobs", True))
+        hmm_emission_adapt_iters = int(_parse_numeric(merged.get("hmm_emission_adapt_iters", 5), 5))
+        hmm_emission_adapt_tol = float(
+            _parse_numeric(merged.get("hmm_emission_adapt_tol", 1e-4), 1e-4)
+        )

         # HMM peak feature configs (for call_hmm_peaks)
         merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
```
```diff
@@ -1106,165 +1228,252 @@ class ExperimentConfig:

         # instantiate dataclass
         instance = cls(
-            smf_modality
-            input_data_path
-            recursive_input_search
-            input_type
-            input_files
-            output_directory
-            summary_file
-            fasta
-            sequencer
-            model_dir
-            barcode_kit
-            fastq_barcode_map
-            fastq_auto_pairing
-            bam_suffix
-            split_dir
-            split_path
-            strands
-            conversions
-            fasta_regions_of_interest
-            mapping_threshold
-            experiment_name
-            model
-            barcode_both_ends
-            trim
-            input_already_demuxed
-            threads
-            sample_sheet_path
-            sample_sheet_mapping_column
-            delete_intermediate_bams
-            delete_intermediate_tsvs
-            align_from_bam
-            aligner
-            aligner_args
-            device
-            make_bigwigs
-            make_beds
-            delete_intermediate_hdfs
-            mod_target_bases
-            enzyme_target_bases
-            conversion_types
-            ...
+            smf_modality=merged.get("smf_modality"),
+            input_data_path=input_data_path,
+            recursive_input_search=merged.get("recursive_input_search"),
+            input_type=input_type,
+            input_files=input_files,
+            output_directory=output_dir,
+            summary_file=summary_file,
+            fasta=merged.get("fasta"),
+            sequencer=merged.get("sequencer"),
+            model_dir=merged.get("model_dir"),
+            barcode_kit=merged.get("barcode_kit"),
+            fastq_barcode_map=merged.get("fastq_barcode_map"),
+            fastq_auto_pairing=merged.get("fastq_auto_pairing"),
+            bam_suffix=merged.get("bam_suffix", BAM_SUFFIX),
+            split_dir=split_dir,
+            split_path=split_path,
+            strands=merged.get("strands", STRANDS),
+            conversions=merged.get("conversions", CONVERSIONS),
+            fasta_regions_of_interest=merged.get("fasta_regions_of_interest"),
+            mapping_threshold=float(merged.get("mapping_threshold", 0.01)),
+            experiment_name=merged.get("experiment_name"),
+            model=merged.get("model", "hac"),
+            barcode_both_ends=merged.get("barcode_both_ends", BARCODE_BOTH_ENDS),
+            trim=merged.get("trim", TRIM),
+            input_already_demuxed=merged.get("input_already_demuxed", False),
+            threads=merged.get("threads"),
+            sample_sheet_path=merged.get("sample_sheet_path"),
+            sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
+            delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
+            delete_intermediate_tsvs=merged.get("delete_intermediate_tsvs", True),
+            align_from_bam=merged.get("align_from_bam", False),
+            aligner=merged.get("aligner", "minimap2"),
+            aligner_args=merged.get("aligner_args", None),
+            device=merged.get("device", "auto"),
+            make_bigwigs=merged.get("make_bigwigs", False),
+            make_beds=merged.get("make_beds", False),
+            delete_intermediate_hdfs=merged.get("delete_intermediate_hdfs", True),
+            mod_target_bases=merged.get("mod_target_bases", ["GpC", "CpG"]),
+            enzyme_target_bases=merged.get("enzyme_target_bases", ["GpC"]),
+            conversion_types=merged.get("conversions", ["unconverted"])
+            + merged.get("conversion_types", ["5mC"]),
+            filter_threshold=merged.get("filter_threshold", 0.8),
+            m6A_threshold=merged.get("m6A_threshold", 0.7),
+            m5C_threshold=merged.get("m5C_threshold", 0.7),
+            hm5C_threshold=merged.get("hm5C_threshold", 0.7),
+            thresholds=merged.get("thresholds", []),
+            mod_list=merged.get("mod_list", list(MOD_LIST)),
+            mod_map=merged.get("mod_map", list(MOD_MAP)),
+            batch_size=merged.get("batch_size", 4),
+            skip_unclassified=merged.get("skip_unclassified", True),
+            delete_batch_hdfs=merged.get("delete_batch_hdfs", True),
+            reference_column=merged.get("reference_column", REF_COL),
+            sample_column=merged.get("sample_column", SAMPLE_COL),
+            sample_name_col_for_plotting=merged.get("sample_name_col_for_plotting", "Barcode"),
+            obs_to_plot_pp_qc=obs_to_plot_pp_qc,
+            fit_position_methylation_thresholds=merged.get(
+                "fit_position_methylation_thresholds", False
+            ),
+            binarize_on_fixed_methlyation_threshold=merged.get(
+                "binarize_on_fixed_methlyation_threshold", 0.7
+            ),
+            positive_control_sample_methylation_fitting=merged.get(
+                "positive_control_sample_methylation_fitting", None
+            ),
+            negative_control_sample_methylation_fitting=merged.get(
+                "negative_control_sample_methylation_fitting", None
+            ),
+            infer_on_percentile_sample_methylation_fitting=merged.get(
+                "infer_on_percentile_sample_methylation_fitting", 10
+            ),
+            inference_variable_sample_methylation_fitting=merged.get(
+                "inference_variable_sample_methylation_fitting", "Raw_modification_signal"
+            ),
+            fit_j_threshold=merged.get("fit_j_threshold", 0.5),
+            output_binary_layer_name=merged.get(
+                "output_binary_layer_name", "binarized_methylation"
+            ),
+            reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
+            reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
+            layer_for_clustermap_plotting=merged.get(
+                "layer_for_clustermap_plotting", "nan0_0minus1"
+            ),
+            clustermap_cmap_c=merged.get("clustermap_cmap_c", "coolwarm"),
+            clustermap_cmap_gpc=merged.get("clustermap_cmap_gpc", "coolwarm"),
+            clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
+            clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
+            spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
+            layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
+            umap_layers_to_plot=merged.get(
+                "umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
+            ),
+            rows_per_qc_histogram_grid=merged.get("rows_per_qc_histogram_grid", 12),
+            rows_per_qc_autocorr_grid=merged.get("rows_per_qc_autocorr_grid", 12),
+            autocorr_normalization_method=merged.get("autocorr_normalization_method", "pearson"),
+            autocorr_rolling_window_size=merged.get("autocorr_rolling_window_size", 25),
+            autocorr_max_lag=merged.get("autocorr_max_lag", 800),
+            autocorr_site_types=merged.get("autocorr_site_types", ["GpC", "CpG", "C"]),
+            hmm_n_states=merged.get("hmm_n_states", 2),
+            hmm_init_emission_probs=merged.get("hmm_init_emission_probs", [[0.8, 0.2], [0.2, 0.8]]),
+            hmm_init_transition_probs=merged.get(
+                "hmm_init_transition_probs", [[0.9, 0.1], [0.1, 0.9]]
+            ),
+            hmm_init_start_probs=merged.get("hmm_init_start_probs", [0.5, 0.5]),
+            hmm_eps=merged.get("hmm_eps", 1e-8),
+            hmm_fit_strategy=hmm_fit_strategy,
+            hmm_shared_scope=hmm_shared_scope,
+            hmm_groupby=hmm_groupby,
+            hmm_adapt_emissions=hmm_adapt_emissions,
+            hmm_adapt_startprobs=hmm_adapt_startprobs,
+            hmm_emission_adapt_iters=hmm_emission_adapt_iters,
+            hmm_emission_adapt_tol=hmm_emission_adapt_tol,
+            hmm_dtype=merged.get("hmm_dtype", "float64"),
+            hmm_feature_sets=hmm_feature_sets,
+            hmm_annotation_threshold=hmm_annotation_threshold,
+            hmm_batch_size=hmm_batch_size,
+            hmm_use_viterbi=hmm_use_viterbi,
+            hmm_methbases=hmm_methbases,
+            hmm_device=hmm_device,
+            hmm_merge_layer_features=hmm_merge_layer_features,
+            clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
+            hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
+            hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
+            hmm_peak_feature_configs=hmm_peak_feature_configs,
+            footprints=merged.get("footprints", None),
+            accessible_patches=merged.get("accessible_patches", None),
+            cpg=merged.get("cpg", None),
+            read_coord_filter=merged.get("read_coord_filter", [None, None]),
+            read_len_filter_thresholds=merged.get("read_len_filter_thresholds", [100, None]),
+            read_len_to_ref_ratio_filter_thresholds=merged.get(
+                "read_len_to_ref_ratio_filter_thresholds", [0.3, None]
+            ),
+            read_quality_filter_thresholds=merged.get("read_quality_filter_thresholds", [15, None]),
+            read_mapping_quality_filter_thresholds=merged.get(
+                "read_mapping_quality_filter_thresholds", [None, None]
+            ),
+            read_mod_filtering_gpc_thresholds=merged.get(
+                "read_mod_filtering_gpc_thresholds", [0.025, 0.975]
+            ),
+            read_mod_filtering_cpg_thresholds=merged.get(
+                "read_mod_filtering_cpg_thresholds", [0.0, 1.0]
+            ),
+            read_mod_filtering_c_thresholds=merged.get(
+                "read_mod_filtering_c_thresholds", [0.025, 0.975]
+            ),
+            read_mod_filtering_a_thresholds=merged.get(
+                "read_mod_filtering_a_thresholds", [0.025, 0.975]
+            ),
+            read_mod_filtering_use_other_c_as_background=merged.get(
+                "read_mod_filtering_use_other_c_as_background", True
+            ),
+            min_valid_fraction_positions_in_read_vs_ref=merged.get(
+                "min_valid_fraction_positions_in_read_vs_ref", 0.2
+            ),
+            duplicate_detection_site_types=merged.get(
+                "duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
+            ),
+            duplicate_detection_distance_threshold=merged.get(
+                "duplicate_detection_distance_threshold", 0.07
+            ),
+            duplicate_detection_keep_best_metric=merged.get(
+                "duplicate_detection_keep_best_metric", "read_quality"
+            ),
+            duplicate_detection_window_size_for_hamming_neighbors=merged.get(
+                "duplicate_detection_window_size_for_hamming_neighbors", 50
+            ),
+            duplicate_detection_min_overlapping_positions=merged.get(
+                "duplicate_detection_min_overlapping_positions", 20
+            ),
+            duplicate_detection_do_hierarchical=merged.get(
+                "duplicate_detection_do_hierarchical", True
+            ),
+            duplicate_detection_hierarchical_linkage=merged.get(
+                "duplicate_detection_hierarchical_linkage", "average"
+            ),
+            duplicate_detection_do_pca=merged.get("duplicate_detection_do_pca", False),
+            position_max_nan_threshold=merged.get("position_max_nan_threshold", 0.1),
+            correlation_matrix_types=merged.get(
+                "correlation_matrix_types", ["pearson", "binary_covariance"]
+            ),
+            correlation_matrix_cmaps=merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
+            correlation_matrix_site_types=merged.get("correlation_matrix_site_types", ["GpC_site"]),
+            hamming_vs_metric_keys=merged.get(
+                "hamming_vs_metric_keys", ["Fraction_C_site_modified"]
+            ),
+            force_redo_load_adata=merged.get("force_redo_load_adata", False),
+            force_redo_preprocessing=merged.get("force_redo_preprocessing", False),
+            force_reload_sample_sheet=merged.get("force_reload_sample_sheet", True),
+            bypass_add_read_length_and_mapping_qc=merged.get(
+                "bypass_add_read_length_and_mapping_qc", False
+            ),
+            force_redo_add_read_length_and_mapping_qc=merged.get(
+                "force_redo_add_read_length_and_mapping_qc", False
+            ),
+            bypass_clean_nan=merged.get("bypass_clean_nan", False),
+            force_redo_clean_nan=merged.get("force_redo_clean_nan", False),
+            bypass_append_base_context=merged.get("bypass_append_base_context", False),
+            force_redo_append_base_context=merged.get("force_redo_append_base_context", False),
+            invert_adata=merged.get("invert_adata", False),
+            bypass_append_binary_layer_by_base_context=merged.get(
+                "bypass_append_binary_layer_by_base_context", False
+            ),
+            force_redo_append_binary_layer_by_base_context=merged.get(
+                "force_redo_append_binary_layer_by_base_context", False
+            ),
+            bypass_calculate_read_modification_stats=merged.get(
+                "bypass_calculate_read_modification_stats", False
+            ),
+            force_redo_calculate_read_modification_stats=merged.get(
+                "force_redo_calculate_read_modification_stats", False
+            ),
+            bypass_filter_reads_on_modification_thresholds=merged.get(
+                "bypass_filter_reads_on_modification_thresholds", False
+            ),
+            force_redo_filter_reads_on_modification_thresholds=merged.get(
+                "force_redo_filter_reads_on_modification_thresholds", False
+            ),
+            bypass_flag_duplicate_reads=merged.get("bypass_flag_duplicate_reads", False),
+            force_redo_flag_duplicate_reads=merged.get("force_redo_flag_duplicate_reads", False),
+            bypass_complexity_analysis=merged.get("bypass_complexity_analysis", False),
+            force_redo_complexity_analysis=merged.get("force_redo_complexity_analysis", False),
+            force_redo_spatial_analyses=merged.get("force_redo_spatial_analyses", False),
+            bypass_basic_clustermaps=merged.get("bypass_basic_clustermaps", False),
+            force_redo_basic_clustermaps=merged.get("force_redo_basic_clustermaps", False),
+            bypass_basic_umap=merged.get("bypass_basic_umap", False),
+            force_redo_basic_umap=merged.get("force_redo_basic_umap", False),
+            bypass_spatial_autocorr_calculations=merged.get(
+                "bypass_spatial_autocorr_calculations", False
+            ),
+            force_redo_spatial_autocorr_calculations=merged.get(
+                "force_redo_spatial_autocorr_calculations", False
+            ),
+            bypass_spatial_autocorr_plotting=merged.get("bypass_spatial_autocorr_plotting", False),
+            force_redo_spatial_autocorr_plotting=merged.get(
+                "force_redo_spatial_autocorr_plotting", False
+            ),
+            bypass_matrix_corr_calculations=merged.get("bypass_matrix_corr_calculations", False),
+            force_redo_matrix_corr_calculations=merged.get(
+                "force_redo_matrix_corr_calculations", False
+            ),
+            bypass_matrix_corr_plotting=merged.get("bypass_matrix_corr_plotting", False),
+            force_redo_matrix_corr_plotting=merged.get("force_redo_matrix_corr_plotting", False),
+            bypass_hmm_fit=merged.get("bypass_hmm_fit", False),
+            force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
+            bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
+            force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
+            config_source=config_source or "<var_dict>",
         )

         report = {
```
```diff
@@ -1291,9 +1500,20 @@ class ExperimentConfig:
         Load CSV using LoadExperimentConfig (or accept DataFrame) and build ExperimentConfig.
         Additional kwargs passed to from_var_dict().
         """
-        loader =
+        loader = (
+            LoadExperimentConfig(csv_input)
+            if not isinstance(csv_input, pd.DataFrame)
+            else LoadExperimentConfig(pd.DataFrame(csv_input))
+        )
         var_dict = loader.var_dict
-        return cls.from_var_dict(
+        return cls.from_var_dict(
+            var_dict,
+            date_str=date_str,
+            config_source=config_source,
+            defaults_dir=defaults_dir,
+            defaults_map=defaults_map,
+            **kwargs,
+        )

 # -------------------------
 # validation & serialization
```
```diff
@@ -1306,7 +1526,9 @@ class ExperimentConfig:
             return errs
         for g, info in hfs.items():
             if not isinstance(info, dict):
-                errs.append(
+                errs.append(
+                    f"hmm_feature_sets['{g}'] must be a mapping with 'features' and 'state'."
+                )
                 continue
             feats = info.get("features")
             if not isinstance(feats, dict) or len(feats) == 0:
```
```diff
@@ -1316,7 +1538,9 @@ class ExperimentConfig:
             try:
                 lo, hi = float(rng[0]), float(rng[1])
                 if lo < 0 or hi <= lo:
-                    errs.append(
+                    errs.append(
+                        f"Feature range for {g}:{fname} must satisfy 0 <= lo < hi; got {rng}."
+                    )
             except Exception:
                 errs.append(f"Feature range for {g}:{fname} is invalid: {rng}")
         return errs
```
```diff
@@ -1349,13 +1573,18 @@ class ExperimentConfig:

         if not (0.0 <= float(self.mapping_threshold) <= 1.0):
             errors.append("mapping_threshold must be in [0,1].")
-        for t in (
+        for t in (
+            self.filter_threshold,
+            self.m6A_threshold,
+            self.m5C_threshold,
+            self.hm5C_threshold,
+        ):
             if not (0.0 <= float(t) <= 1.0):
                 errors.append(f"threshold value {t} must be in [0,1].")

         if raise_on_error and errors:
             raise ValueError("ExperimentConfig validation failed:\n " + "\n ".join(errors))
-
+
         errs = _validate_hmm_features_structure(self.hmm_feature_sets)
         errors.extend(errs)

```
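Putting the pieces together, a hedged end-to-end sketch of the entry point this diff reworks (`from_csv` builds a var_dict via `LoadExperimentConfig`, then delegates to `from_var_dict`); the file path and `defaults_dir` value are hypothetical:

```python
from smftools.config.experiment_config import ExperimentConfig

cfg = ExperimentConfig.from_csv(
    "experiment_config.csv",  # hypothetical CSV with variable/value/type columns
    date_str="20240101",      # feeds the default "<date>_SMF_experiment" name
    defaults_dir="configs/",  # hypothetical; modality defaults merged with inheritance
)
```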