smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/preprocess_adata.py
CHANGED
|
@@ -1,293 +1,488 @@
|
|
|
1
|
-
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import anndata as ad
|
|
5
|
+
|
|
6
|
+
from smftools.logging_utils import get_logger
|
|
7
|
+
|
|
8
|
+
logger = get_logger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def preprocess_adata(
|
|
12
|
+
config_path: str,
|
|
13
|
+
) -> Tuple[Optional[ad.AnnData], Optional[Path], Optional[ad.AnnData], Optional[Path]]:
|
|
2
14
|
"""
|
|
3
|
-
|
|
4
|
-
Command line accesses this through smftools preprocess <config_path>
|
|
15
|
+
CLI-facing wrapper for preprocessing.
|
|
5
16
|
|
|
6
|
-
|
|
7
|
-
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
17
|
+
Called by: `smftools preprocess <config_path>`
|
|
8
18
|
|
|
9
|
-
|
|
10
|
-
|
|
19
|
+
- Ensure a raw AnnData exists (or some later-stage AnnData) via `load_adata`.
|
|
20
|
+
- Determine which AnnData stages exist (raw, pp, pp_dedup, spatial, hmm).
|
|
21
|
+
- Respect cfg flags (force_redo_preprocessing, force_redo_flag_duplicate_reads).
|
|
22
|
+
- Decide what starting AnnData to load (or whether to early-return).
|
|
23
|
+
- Call `preprocess_adata_core(...)` when appropriate.
|
|
24
|
+
|
|
25
|
+
Returns
|
|
26
|
+
-------
|
|
27
|
+
pp_adata : AnnData | None
|
|
28
|
+
Preprocessed AnnData (may be None if we skipped work).
|
|
29
|
+
pp_adata_path : Path | None
|
|
30
|
+
Path to preprocessed AnnData.
|
|
31
|
+
pp_dedup_adata : AnnData | None
|
|
32
|
+
Preprocessed, duplicate-removed AnnData.
|
|
33
|
+
pp_dedup_adata_path : Path | None
|
|
34
|
+
Path to preprocessed, duplicate-removed AnnData.
|
|
11
35
|
"""
|
|
12
|
-
from ..readwrite import safe_read_h5ad
|
|
36
|
+
from ..readwrite import safe_read_h5ad
|
|
37
|
+
from .helpers import get_adata_paths
|
|
13
38
|
from .load_adata import load_adata
|
|
14
39
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
import anndata as ad
|
|
18
|
-
import scanpy as sc
|
|
40
|
+
# 1) Ensure config is loaded and at least *some* AnnData stage exists
|
|
41
|
+
loaded_adata, loaded_path, cfg = load_adata(config_path)
|
|
19
42
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
43
|
+
# 2) Compute canonical paths
|
|
44
|
+
paths = get_adata_paths(cfg)
|
|
45
|
+
raw_path = paths.raw
|
|
46
|
+
pp_path = paths.pp
|
|
47
|
+
pp_dedup_path = paths.pp_dedup
|
|
48
|
+
spatial_path = paths.spatial
|
|
49
|
+
hmm_path = paths.hmm
|
|
23
50
|
|
|
24
|
-
|
|
25
|
-
|
|
51
|
+
raw_exists = raw_path.exists()
|
|
52
|
+
pp_exists = pp_path.exists()
|
|
53
|
+
pp_dedup_exists = pp_dedup_path.exists()
|
|
54
|
+
spatial_exists = spatial_path.exists()
|
|
55
|
+
hmm_exists = hmm_path.exists()
|
|
26
56
|
|
|
27
|
-
|
|
28
|
-
|
|
57
|
+
# Helper: reuse loaded_adata if it matches the path we want, else read from disk
|
|
58
|
+
def _load(path: Path):
|
|
59
|
+
if loaded_adata is not None and loaded_path == path:
|
|
60
|
+
return loaded_adata
|
|
61
|
+
adata, _ = safe_read_h5ad(path)
|
|
62
|
+
return adata
|
|
29
63
|
|
|
30
|
-
#
|
|
31
|
-
|
|
32
|
-
|
|
64
|
+
# -----------------------------
|
|
65
|
+
# Case A: full redo of preprocessing
|
|
66
|
+
# -----------------------------
|
|
67
|
+
if getattr(cfg, "force_redo_preprocessing", False):
|
|
68
|
+
logger.info(
|
|
69
|
+
"Forcing full redo of preprocessing workflow, starting from latest stage AnnData available."
|
|
70
|
+
)
|
|
33
71
|
|
|
34
|
-
|
|
35
|
-
|
|
72
|
+
if hmm_exists:
|
|
73
|
+
adata = _load(hmm_path)
|
|
74
|
+
source_path = hmm_path
|
|
75
|
+
elif spatial_exists:
|
|
76
|
+
adata = _load(spatial_path)
|
|
77
|
+
source_path = spatial_path
|
|
78
|
+
elif pp_dedup_exists:
|
|
79
|
+
adata = _load(pp_dedup_path)
|
|
80
|
+
source_path = pp_dedup_path
|
|
81
|
+
elif pp_exists:
|
|
82
|
+
adata = _load(pp_path)
|
|
83
|
+
source_path = pp_path
|
|
84
|
+
elif raw_exists:
|
|
85
|
+
adata = _load(raw_path)
|
|
86
|
+
source_path = raw_path
|
|
87
|
+
else:
|
|
88
|
+
logger.error("Cannot redo preprocessing: no AnnData available at any stage.")
|
|
89
|
+
return (None, None, None, None)
|
|
36
90
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
91
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
92
|
+
adata=adata,
|
|
93
|
+
cfg=cfg,
|
|
94
|
+
pp_adata_path=pp_path,
|
|
95
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
96
|
+
source_adata_path=source_path,
|
|
97
|
+
config_path=config_path,
|
|
98
|
+
)
|
|
99
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
43
100
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
59
|
-
elif preprocessed_version_available:
|
|
60
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
61
|
-
elif preprocessed_dup_removed_version_available:
|
|
62
|
-
adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
|
|
63
|
-
else:
|
|
64
|
-
print(f"Can not redo preprocessing when there is no adata available.")
|
|
65
|
-
return
|
|
66
|
-
elif cfg.force_redo_flag_duplicate_reads:
|
|
67
|
-
print(f"Forcing redo of duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
|
|
68
|
-
if preprocessed_version_available:
|
|
69
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
70
|
-
elif initial_version_available:
|
|
71
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
72
|
-
else:
|
|
73
|
-
print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
|
|
74
|
-
return
|
|
75
|
-
elif cfg.force_redo_basic_analyses:
|
|
76
|
-
print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
|
|
77
|
-
if preprocessed_version_available:
|
|
78
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
79
|
-
elif initial_version_available:
|
|
80
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
81
|
-
else:
|
|
82
|
-
print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
|
|
83
|
-
elif hmm_adata_exists:
|
|
84
|
-
print(f"HMM anndata found: {hmm_adata_path}")
|
|
85
|
-
return (None, None, None, None)
|
|
86
|
-
elif spatial_adata_exists:
|
|
87
|
-
print(f"Spatial anndata found: {spatial_adata_exists}")
|
|
88
|
-
return (None, None, None, None)
|
|
89
|
-
elif preprocessed_dup_removed_version_available:
|
|
90
|
-
print(f"Preprocessed deduplicated anndata found: {pp_dup_rem_adata_path}")
|
|
91
|
-
return (None, pp_adata_path, None, pp_dup_rem_adata_path)
|
|
92
|
-
elif preprocessed_version_available:
|
|
93
|
-
print(f"Preprocessed anndata found: {pp_adata_path}")
|
|
94
|
-
adata, load_report = safe_read_h5ad(pp_adata_path)
|
|
95
|
-
elif initial_version_available:
|
|
96
|
-
adata, load_report = safe_read_h5ad(initial_adata_path)
|
|
101
|
+
# -----------------------------
|
|
102
|
+
# Case B: redo duplicate detection only
|
|
103
|
+
# -----------------------------
|
|
104
|
+
if getattr(cfg, "force_redo_flag_duplicate_reads", False):
|
|
105
|
+
logger.info(
|
|
106
|
+
"Forcing redo of duplicate detection workflow, starting from the preprocessed AnnData "
|
|
107
|
+
"if available. Otherwise, will use the raw AnnData."
|
|
108
|
+
)
|
|
109
|
+
if pp_exists:
|
|
110
|
+
adata = _load(pp_path)
|
|
111
|
+
source_path = pp_path
|
|
112
|
+
elif raw_exists:
|
|
113
|
+
adata = _load(raw_path)
|
|
114
|
+
source_path = raw_path
|
|
97
115
|
else:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
116
|
+
logger.error(
|
|
117
|
+
"Cannot redo duplicate detection: no compatible AnnData available "
|
|
118
|
+
"(need at least raw or preprocessed)."
|
|
119
|
+
)
|
|
120
|
+
return (None, None, None, None)
|
|
121
|
+
|
|
122
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
123
|
+
adata=adata,
|
|
124
|
+
cfg=cfg,
|
|
125
|
+
pp_adata_path=pp_path,
|
|
126
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
127
|
+
source_adata_path=source_path,
|
|
128
|
+
config_path=config_path,
|
|
129
|
+
)
|
|
130
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
131
|
+
|
|
132
|
+
# -----------------------------
|
|
133
|
+
# Case C: normal behavior (no explicit redo flags)
|
|
134
|
+
# -----------------------------
|
|
135
|
+
|
|
136
|
+
# If HMM exists, preprocessing is considered “done enough”
|
|
137
|
+
if hmm_exists:
|
|
138
|
+
logger.debug(f"Skipping preprocessing. HMM AnnData found: {hmm_path}")
|
|
139
|
+
return (None, None, None, None)
|
|
140
|
+
|
|
141
|
+
# If spatial exists, also skip re-preprocessing by default
|
|
142
|
+
if spatial_exists:
|
|
143
|
+
logger.debug(f"Skipping preprocessing. Spatial AnnData found: {spatial_path}")
|
|
144
|
+
return (None, None, None, None)
|
|
145
|
+
|
|
146
|
+
# If pp_dedup exists, just return paths (no recomputation)
|
|
147
|
+
if pp_dedup_exists:
|
|
148
|
+
logger.debug(
|
|
149
|
+
f"Skipping preprocessing. Preprocessed deduplicated AnnData found: {pp_dedup_path}"
|
|
150
|
+
)
|
|
151
|
+
return (None, pp_path, None, pp_dedup_path)
|
|
152
|
+
|
|
153
|
+
# If pp exists but pp_dedup does not, load pp and run core
|
|
154
|
+
if pp_exists:
|
|
155
|
+
logger.debug(f"Preprocessed AnnData found: {pp_path}")
|
|
156
|
+
adata = _load(pp_path)
|
|
157
|
+
source_path = pp_path
|
|
158
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
159
|
+
adata=adata,
|
|
160
|
+
cfg=cfg,
|
|
161
|
+
pp_adata_path=pp_path,
|
|
162
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
163
|
+
source_adata_path=source_path,
|
|
164
|
+
config_path=config_path,
|
|
165
|
+
)
|
|
166
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
167
|
+
|
|
168
|
+
# Otherwise, fall back to raw (if available)
|
|
169
|
+
if raw_exists:
|
|
170
|
+
adata = _load(raw_path)
|
|
171
|
+
source_path = raw_path
|
|
172
|
+
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata_core(
|
|
173
|
+
adata=adata,
|
|
174
|
+
cfg=cfg,
|
|
175
|
+
pp_adata_path=pp_path,
|
|
176
|
+
pp_dup_rem_adata_path=pp_dedup_path,
|
|
177
|
+
source_adata_path=source_path,
|
|
178
|
+
config_path=config_path,
|
|
179
|
+
)
|
|
180
|
+
return pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path
|
|
181
|
+
|
|
182
|
+
logger.error("No AnnData available at any stage for preprocessing.")
|
|
183
|
+
return (None, None, None, None)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def preprocess_adata_core(
|
|
187
|
+
adata: ad.AnnData,
|
|
188
|
+
cfg,
|
|
189
|
+
pp_adata_path: Path,
|
|
190
|
+
pp_dup_rem_adata_path: Path,
|
|
191
|
+
source_adata_path: Optional[Path] = None,
|
|
192
|
+
config_path: Optional[str] = None,
|
|
193
|
+
) -> Tuple[ad.AnnData, Path, ad.AnnData, Path]:
|
|
194
|
+
"""
|
|
195
|
+
Core preprocessing pipeline.
|
|
196
|
+
|
|
197
|
+
Assumes:
|
|
198
|
+
- `adata` is an AnnData object at some stage (raw/pp/etc.) to start preprocessing from.
|
|
199
|
+
- `cfg` is the ExperimentConfig containing all thresholds & options.
|
|
200
|
+
- `pp_adata_path` and `pp_dup_rem_adata_path` are the target output paths for
|
|
201
|
+
preprocessed and preprocessed+deduplicated AnnData.
|
|
202
|
+
|
|
203
|
+
Does NOT:
|
|
204
|
+
- Decide which stage to load from (that's the wrapper's job).
|
|
205
|
+
- Decide whether to skip entirely; it always runs its steps, but individual
|
|
206
|
+
sub-steps may skip based on `cfg.bypass_*` or directory existence.
|
|
207
|
+
|
|
208
|
+
Returns
|
|
209
|
+
-------
|
|
210
|
+
pp_adata : AnnData
|
|
211
|
+
Preprocessed AnnData (with QC filters, binarization, etc.).
|
|
212
|
+
pp_adata_path : Path
|
|
213
|
+
Path where pp_adata was written.
|
|
214
|
+
pp_dedup_adata : AnnData
|
|
215
|
+
Preprocessed AnnData with duplicate reads removed (for non-direct SMF).
|
|
216
|
+
pp_dup_rem_adata_path : Path
|
|
217
|
+
Path where pp_dedup_adata was written.
|
|
218
|
+
"""
|
|
219
|
+
from pathlib import Path
|
|
220
|
+
|
|
221
|
+
from ..metadata import record_smftools_metadata
|
|
222
|
+
from ..plotting import plot_read_qc_histograms
|
|
223
|
+
from ..preprocessing import (
|
|
224
|
+
append_base_context,
|
|
225
|
+
append_binary_layer_by_base_context,
|
|
226
|
+
binarize_adata,
|
|
227
|
+
binarize_on_Youden,
|
|
228
|
+
calculate_complexity_II,
|
|
229
|
+
calculate_coverage,
|
|
230
|
+
calculate_position_Youden,
|
|
231
|
+
calculate_read_modification_stats,
|
|
232
|
+
clean_NaN,
|
|
233
|
+
filter_reads_on_length_quality_mapping,
|
|
234
|
+
filter_reads_on_modification_thresholds,
|
|
235
|
+
flag_duplicate_reads,
|
|
236
|
+
load_sample_sheet,
|
|
237
|
+
)
|
|
238
|
+
from ..readwrite import make_dirs
|
|
239
|
+
from .helpers import write_gz_h5ad
|
|
240
|
+
|
|
241
|
+
################################### 1) Load existing ###################################
|
|
242
|
+
# General config variable init - Necessary user passed inputs
|
|
243
|
+
smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
|
|
244
|
+
output_directory = Path(
|
|
245
|
+
cfg.output_directory
|
|
246
|
+
) # Path to the output directory to make for the analysis. Necessary.
|
|
247
|
+
make_dirs([output_directory])
|
|
248
|
+
|
|
101
249
|
######### Begin Preprocessing #########
|
|
102
250
|
pp_dir = output_directory / "preprocessed"
|
|
103
251
|
|
|
104
252
|
## Load sample sheet metadata based on barcode mapping ##
|
|
105
|
-
if cfg
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
253
|
+
if getattr(cfg, "sample_sheet_path", None):
|
|
254
|
+
load_sample_sheet(
|
|
255
|
+
adata,
|
|
256
|
+
cfg.sample_sheet_path,
|
|
257
|
+
mapping_key_column=cfg.sample_sheet_mapping_column,
|
|
258
|
+
as_category=True,
|
|
259
|
+
force_reload=cfg.force_reload_sample_sheet,
|
|
260
|
+
)
|
|
112
261
|
else:
|
|
113
262
|
pass
|
|
114
|
-
|
|
263
|
+
|
|
115
264
|
# Adding read length, read quality, reference length, mapped_length, and mapping quality metadata to adata object.
|
|
116
265
|
pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
|
|
117
266
|
|
|
118
267
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
119
|
-
|
|
268
|
+
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
120
269
|
else:
|
|
121
|
-
from ..plotting import plot_read_qc_histograms
|
|
122
270
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
271
|
+
plot_read_qc_histograms(
|
|
272
|
+
adata,
|
|
273
|
+
pp_length_qc_dir,
|
|
274
|
+
cfg.obs_to_plot_pp_qc,
|
|
275
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
276
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
277
|
+
)
|
|
129
278
|
|
|
130
279
|
# Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
|
|
131
|
-
from ..preprocessing import filter_reads_on_length_quality_mapping
|
|
132
280
|
print(adata.shape)
|
|
133
|
-
adata = filter_reads_on_length_quality_mapping(
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
281
|
+
adata = filter_reads_on_length_quality_mapping(
|
|
282
|
+
adata,
|
|
283
|
+
filter_on_coordinates=cfg.read_coord_filter,
|
|
284
|
+
read_length=cfg.read_len_filter_thresholds,
|
|
285
|
+
length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
|
|
286
|
+
read_quality=cfg.read_quality_filter_thresholds,
|
|
287
|
+
mapping_quality=cfg.read_mapping_quality_filter_thresholds,
|
|
288
|
+
bypass=None,
|
|
289
|
+
force_redo=None,
|
|
290
|
+
)
|
|
141
291
|
print(adata.shape)
|
|
142
292
|
|
|
143
293
|
pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
|
|
144
294
|
|
|
145
295
|
if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
146
|
-
|
|
296
|
+
logger.debug(f"{pp_length_qc_dir} already exists. Skipping read level QC plotting.")
|
|
147
297
|
else:
|
|
148
|
-
from ..plotting import plot_read_qc_histograms
|
|
149
298
|
make_dirs([pp_dir, pp_length_qc_dir])
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
299
|
+
plot_read_qc_histograms(
|
|
300
|
+
adata,
|
|
301
|
+
pp_length_qc_dir,
|
|
302
|
+
cfg.obs_to_plot_pp_qc,
|
|
303
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
304
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
305
|
+
)
|
|
306
|
+
|
|
157
307
|
############## Binarize direct modcall data and store in new layer. Clean nans and store as new layers with various nan replacement strategies ##########
|
|
158
|
-
|
|
159
|
-
if smf_modality == 'direct':
|
|
160
|
-
from ..preprocessing import calculate_position_Youden, binarize_on_Youden, binarize_adata
|
|
308
|
+
if smf_modality == "direct":
|
|
161
309
|
native = True
|
|
162
310
|
if cfg.fit_position_methylation_thresholds:
|
|
163
311
|
pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
|
|
164
312
|
make_dirs([pp_Youden_dir])
|
|
165
313
|
# Calculate positional methylation thresholds for mod calls
|
|
166
|
-
calculate_position_Youden(
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
314
|
+
calculate_position_Youden(
|
|
315
|
+
adata,
|
|
316
|
+
positive_control_sample=cfg.positive_control_sample_methylation_fitting,
|
|
317
|
+
negative_control_sample=cfg.negative_control_sample_methylation_fitting,
|
|
318
|
+
J_threshold=cfg.fit_j_threshold,
|
|
319
|
+
ref_column=cfg.reference_column,
|
|
320
|
+
sample_column=cfg.sample_column,
|
|
321
|
+
infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
|
|
322
|
+
inference_variable=cfg.inference_variable_sample_methylation_fitting,
|
|
323
|
+
save=True,
|
|
324
|
+
output_directory=pp_Youden_dir,
|
|
325
|
+
)
|
|
176
326
|
# binarize the modcalls based on the determined thresholds
|
|
177
|
-
binarize_on_Youden(
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
327
|
+
binarize_on_Youden(
|
|
328
|
+
adata,
|
|
329
|
+
ref_column=cfg.reference_column,
|
|
330
|
+
output_layer_name=cfg.output_binary_layer_name,
|
|
331
|
+
)
|
|
181
332
|
else:
|
|
182
|
-
binarize_adata(
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
333
|
+
binarize_adata(
|
|
334
|
+
adata,
|
|
335
|
+
source="X",
|
|
336
|
+
target_layer=cfg.output_binary_layer_name,
|
|
337
|
+
threshold=cfg.binarize_on_fixed_methlyation_threshold,
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
clean_NaN(
|
|
341
|
+
adata,
|
|
342
|
+
layer=cfg.output_binary_layer_name,
|
|
343
|
+
bypass=cfg.bypass_clean_nan,
|
|
344
|
+
force_redo=cfg.force_redo_clean_nan,
|
|
345
|
+
)
|
|
192
346
|
else:
|
|
193
347
|
native = False
|
|
194
|
-
clean_NaN(adata,
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
348
|
+
clean_NaN(adata, bypass=cfg.bypass_clean_nan, force_redo=cfg.force_redo_clean_nan)
|
|
349
|
+
|
|
350
|
+
############### Calculate positional coverage by reference set in dataset ###############
|
|
351
|
+
calculate_coverage(
|
|
352
|
+
adata,
|
|
353
|
+
ref_column=cfg.reference_column,
|
|
354
|
+
position_nan_threshold=cfg.position_max_nan_threshold,
|
|
355
|
+
smf_modality=smf_modality,
|
|
356
|
+
target_layer=cfg.output_binary_layer_name,
|
|
357
|
+
)
|
|
198
358
|
|
|
199
359
|
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
|
|
200
|
-
from ..preprocessing import append_base_context, append_binary_layer_by_base_context
|
|
201
360
|
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
202
|
-
append_base_context(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
from ..preprocessing import calculate_read_modification_stats
|
|
223
|
-
calculate_read_modification_stats(adata,
|
|
224
|
-
cfg.reference_column,
|
|
225
|
-
cfg.sample_column,
|
|
226
|
-
cfg.mod_target_bases,
|
|
227
|
-
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
228
|
-
force_redo=cfg.force_redo_calculate_read_modification_stats)
|
|
229
|
-
|
|
361
|
+
append_base_context(
|
|
362
|
+
adata,
|
|
363
|
+
ref_column=cfg.reference_column,
|
|
364
|
+
use_consensus=False,
|
|
365
|
+
native=native,
|
|
366
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
367
|
+
bypass=cfg.bypass_append_base_context,
|
|
368
|
+
force_redo=cfg.force_redo_append_base_context,
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
############### Calculate read methylation/deamination statistics for specific base contexts defined by append_base_context ###############
|
|
372
|
+
calculate_read_modification_stats(
|
|
373
|
+
adata,
|
|
374
|
+
cfg.reference_column,
|
|
375
|
+
cfg.sample_column,
|
|
376
|
+
cfg.mod_target_bases,
|
|
377
|
+
bypass=cfg.bypass_calculate_read_modification_stats,
|
|
378
|
+
force_redo=cfg.force_redo_calculate_read_modification_stats,
|
|
379
|
+
)
|
|
380
|
+
|
|
230
381
|
### Make a dir for outputting sample level read modification metrics before filtering ###
|
|
231
382
|
pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
|
|
232
383
|
|
|
233
384
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
234
|
-
|
|
385
|
+
logger.debug(
|
|
386
|
+
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
387
|
+
)
|
|
235
388
|
else:
|
|
236
|
-
from ..plotting import plot_read_qc_histograms
|
|
237
389
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
238
|
-
obs_to_plot = [
|
|
239
|
-
if any(base in cfg.mod_target_bases for base in [
|
|
240
|
-
obs_to_plot += [
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
390
|
+
obs_to_plot = ["Raw_modification_signal"]
|
|
391
|
+
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
392
|
+
obs_to_plot += [
|
|
393
|
+
"Fraction_GpC_site_modified",
|
|
394
|
+
"Fraction_CpG_site_modified",
|
|
395
|
+
"Fraction_other_C_site_modified",
|
|
396
|
+
"Fraction_C_site_modified",
|
|
397
|
+
]
|
|
398
|
+
if "A" in cfg.mod_target_bases:
|
|
399
|
+
obs_to_plot += ["Fraction_A_site_modified"]
|
|
400
|
+
plot_read_qc_histograms(
|
|
401
|
+
adata,
|
|
402
|
+
pp_meth_qc_dir,
|
|
403
|
+
obs_to_plot,
|
|
404
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
405
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
406
|
+
)
|
|
247
407
|
|
|
248
408
|
##### Optionally filter reads on modification metrics
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
409
|
+
adata = filter_reads_on_modification_thresholds(
|
|
410
|
+
adata,
|
|
411
|
+
smf_modality=smf_modality,
|
|
412
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
413
|
+
gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
|
|
414
|
+
cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
|
|
415
|
+
any_c_thresholds=cfg.read_mod_filtering_c_thresholds,
|
|
416
|
+
a_thresholds=cfg.read_mod_filtering_a_thresholds,
|
|
417
|
+
use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
|
|
418
|
+
min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
419
|
+
bypass=cfg.bypass_filter_reads_on_modification_thresholds,
|
|
420
|
+
force_redo=cfg.force_redo_filter_reads_on_modification_thresholds,
|
|
421
|
+
)
|
|
422
|
+
|
|
262
423
|
pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
|
|
263
|
-
|
|
424
|
+
|
|
264
425
|
if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
|
|
265
|
-
|
|
426
|
+
logger.debug(
|
|
427
|
+
f"{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting."
|
|
428
|
+
)
|
|
266
429
|
else:
|
|
267
|
-
from ..plotting import plot_read_qc_histograms
|
|
268
430
|
make_dirs([pp_dir, pp_meth_qc_dir])
|
|
269
|
-
obs_to_plot = [
|
|
270
|
-
if any(base in cfg.mod_target_bases for base in [
|
|
271
|
-
obs_to_plot += [
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
431
|
+
obs_to_plot = ["Raw_modification_signal"]
|
|
432
|
+
if any(base in cfg.mod_target_bases for base in ["GpC", "CpG", "C"]):
|
|
433
|
+
obs_to_plot += [
|
|
434
|
+
"Fraction_GpC_site_modified",
|
|
435
|
+
"Fraction_CpG_site_modified",
|
|
436
|
+
"Fraction_other_C_site_modified",
|
|
437
|
+
"Fraction_C_site_modified",
|
|
438
|
+
]
|
|
439
|
+
if "A" in cfg.mod_target_bases:
|
|
440
|
+
obs_to_plot += ["Fraction_A_site_modified"]
|
|
441
|
+
plot_read_qc_histograms(
|
|
442
|
+
adata,
|
|
443
|
+
pp_meth_qc_dir,
|
|
444
|
+
obs_to_plot,
|
|
445
|
+
sample_key=cfg.sample_name_col_for_plotting,
|
|
446
|
+
rows_per_fig=cfg.rows_per_qc_histogram_grid,
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
############### Calculate final positional coverage by reference set in dataset after filtering reads ###############
|
|
450
|
+
calculate_coverage(
|
|
451
|
+
adata,
|
|
452
|
+
ref_column=cfg.reference_column,
|
|
453
|
+
position_nan_threshold=cfg.position_max_nan_threshold,
|
|
454
|
+
smf_modality=smf_modality,
|
|
455
|
+
target_layer=cfg.output_binary_layer_name,
|
|
456
|
+
force_redo=True,
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats after filtering reads ###############
|
|
460
|
+
# Additionally, store base_context level binary modification arrays in adata.obsm
|
|
461
|
+
append_base_context(
|
|
462
|
+
adata,
|
|
463
|
+
ref_column=cfg.reference_column,
|
|
464
|
+
use_consensus=False,
|
|
465
|
+
native=native,
|
|
466
|
+
mod_target_bases=cfg.mod_target_bases,
|
|
467
|
+
bypass=cfg.bypass_append_base_context,
|
|
468
|
+
force_redo=True,
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
# Add site type binary modification layers for valid coverage sites
|
|
472
|
+
adata = append_binary_layer_by_base_context(
|
|
473
|
+
adata,
|
|
474
|
+
cfg.reference_column,
|
|
475
|
+
smf_modality,
|
|
476
|
+
bypass=cfg.bypass_append_binary_layer_by_base_context,
|
|
477
|
+
force_redo=cfg.force_redo_append_binary_layer_by_base_context,
|
|
478
|
+
from_valid_sites_only=True,
|
|
479
|
+
)
|
|
284
480
|
|
|
285
481
|
############### Duplicate detection for conversion/deamination SMF ###############
|
|
286
|
-
if smf_modality !=
|
|
287
|
-
from ..preprocessing import flag_duplicate_reads, calculate_complexity_II
|
|
482
|
+
if smf_modality != "direct":
|
|
288
483
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
289
484
|
|
|
290
|
-
var_filters_sets =[]
|
|
485
|
+
var_filters_sets = []
|
|
291
486
|
for ref in references:
|
|
292
487
|
for site_type in cfg.duplicate_detection_site_types:
|
|
293
488
|
var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
|
|
@@ -297,27 +492,30 @@ def preprocess_adata(config_path):
|
|
|
297
492
|
make_dirs([pp_dup_qc_dir])
|
|
298
493
|
|
|
299
494
|
# Flag duplicate reads and plot duplicate detection QC
|
|
300
|
-
adata_unique, adata = flag_duplicate_reads(
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
495
|
+
adata_unique, adata = flag_duplicate_reads(
|
|
496
|
+
adata,
|
|
497
|
+
var_filters_sets,
|
|
498
|
+
distance_threshold=cfg.duplicate_detection_distance_threshold,
|
|
499
|
+
obs_reference_col=cfg.reference_column,
|
|
500
|
+
sample_col=cfg.sample_name_col_for_plotting,
|
|
501
|
+
output_directory=pp_dup_qc_dir,
|
|
502
|
+
metric_keys=cfg.hamming_vs_metric_keys,
|
|
503
|
+
keep_best_metric=cfg.duplicate_detection_keep_best_metric,
|
|
504
|
+
bypass=cfg.bypass_flag_duplicate_reads,
|
|
505
|
+
force_redo=cfg.force_redo_flag_duplicate_reads,
|
|
506
|
+
window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
507
|
+
min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
|
|
508
|
+
do_pca=cfg.duplicate_detection_do_pca,
|
|
509
|
+
pca_n_components=50,
|
|
510
|
+
pca_center=True,
|
|
511
|
+
do_hierarchical=cfg.duplicate_detection_do_hierarchical,
|
|
512
|
+
hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
|
|
513
|
+
hierarchical_metric="euclidean",
|
|
514
|
+
hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors,
|
|
515
|
+
demux_types=("double", "already"),
|
|
516
|
+
demux_col="demux_type",
|
|
517
|
+
)
|
|
518
|
+
|
|
321
519
|
# Use the flagged duplicate read groups and perform complexity analysis
|
|
322
520
|
complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
|
|
323
521
|
make_dirs([complexity_outs])
|
|
@@ -326,15 +524,15 @@ def preprocess_adata(config_path):
|
|
|
326
524
|
output_directory=complexity_outs,
|
|
327
525
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
328
526
|
ref_col=cfg.reference_column,
|
|
329
|
-
cluster_col=
|
|
527
|
+
cluster_col="sequence__merged_cluster_id",
|
|
330
528
|
plot=True,
|
|
331
|
-
save_plot=True,
|
|
529
|
+
save_plot=True, # set False to display instead
|
|
332
530
|
n_boot=30,
|
|
333
531
|
n_depths=12,
|
|
334
532
|
random_state=42,
|
|
335
533
|
csv_summary=True,
|
|
336
534
|
bypass=cfg.bypass_complexity_analysis,
|
|
337
|
-
force_redo=cfg.force_redo_complexity_analysis
|
|
535
|
+
force_redo=cfg.force_redo_complexity_analysis,
|
|
338
536
|
)
|
|
339
537
|
|
|
340
538
|
else:
|
|
@@ -342,22 +540,30 @@ def preprocess_adata(config_path):
|
|
|
342
540
|
########################################################################################################################
|
|
343
541
|
|
|
344
542
|
############################################### Save preprocessed adata with duplicate detection ###############################################
|
|
345
|
-
from ..readwrite import safe_write_h5ad
|
|
346
543
|
if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
544
|
+
logger.info("Saving preprocessed adata.")
|
|
545
|
+
record_smftools_metadata(
|
|
546
|
+
adata,
|
|
547
|
+
step_name="preprocess",
|
|
548
|
+
cfg=cfg,
|
|
549
|
+
config_path=config_path,
|
|
550
|
+
input_paths=[source_adata_path] if source_adata_path else None,
|
|
551
|
+
output_path=pp_adata_path,
|
|
552
|
+
)
|
|
553
|
+
write_gz_h5ad(adata, pp_adata_path)
|
|
353
554
|
|
|
354
555
|
if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
556
|
+
logger.info("Saving preprocessed adata with duplicates removed.")
|
|
557
|
+
record_smftools_metadata(
|
|
558
|
+
adata_unique,
|
|
559
|
+
step_name="preprocess",
|
|
560
|
+
cfg=cfg,
|
|
561
|
+
config_path=config_path,
|
|
562
|
+
input_paths=[pp_adata_path],
|
|
563
|
+
output_path=pp_dup_rem_adata_path,
|
|
564
|
+
)
|
|
565
|
+
write_gz_h5ad(adata_unique, pp_dup_rem_adata_path)
|
|
566
|
+
|
|
361
567
|
########################################################################################################################
|
|
362
568
|
|
|
363
|
-
return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
|
|
569
|
+
return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
|