smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/metadata.py
ADDED
@@ -0,0 +1,443 @@
from __future__ import annotations

import hashlib
import platform
import subprocess
import sys
from datetime import datetime, timezone
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from typing import Any, Iterable, Optional

from ._version import __version__
from .schema import SCHEMA_REGISTRY_RESOURCE, SCHEMA_REGISTRY_VERSION

_DEPENDENCIES = ("anndata", "numpy", "pandas", "scanpy", "torch")


def _iso_timestamp() -> str:
    return datetime.now(timezone.utc).astimezone().isoformat()


def _safe_version(package_name: str) -> Optional[str]:
    try:
        return version(package_name)
    except PackageNotFoundError:
        return None


def _find_git_root(start: Path) -> Optional[Path]:
    for candidate in [start, *start.parents]:
        if (candidate / ".git").exists():
            return candidate
    return None


def _get_git_commit() -> Optional[str]:
    root = _find_git_root(Path(__file__).resolve())
    if root is None:
        return None
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=root,
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        return None
    if result.returncode != 0:
        return None
    return result.stdout.strip() or None


def _hash_file(path: Path, *, max_full_bytes: int = 50 * 1024 * 1024) -> dict[str, Any]:
    stat = path.stat()
    size = stat.st_size
    mtime = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat()
    hasher = hashlib.sha256()
    hash_mode = "full"
    hash_bytes = 0
    chunk_size = 1024 * 1024

    with path.open("rb") as handle:
        if size <= max_full_bytes:
            for chunk in iter(lambda: handle.read(1024 * 1024), b""):
                hasher.update(chunk)
                hash_bytes += len(chunk)
        else:
            hash_mode = "head_tail_1mb"
            head = handle.read(chunk_size)
            hasher.update(head)
            hash_bytes += len(head)
            if size > chunk_size:
                handle.seek(max(size - chunk_size, 0))
                tail = handle.read(chunk_size)
                hasher.update(tail)
                hash_bytes += len(tail)

    return {
        "size": size,
        "mtime": mtime,
        "hash": hasher.hexdigest(),
        "hash_algorithm": "sha256",
        "hash_mode": hash_mode,
        "hash_bytes": hash_bytes,
    }


def _path_record(path: Path, role: Optional[str] = None) -> dict[str, Any]:
    record: dict[str, Any] = {"path": str(path)}
    if role:
        record["role"] = role
    if not path.exists():
        record["exists"] = False
        return record

    record["exists"] = True
    if path.is_dir():
        stat = path.stat()
        record["mtime"] = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat()
        record["type"] = "directory"
        return record

    record["type"] = "file"
    record.update(_hash_file(path))
    return record


def _normalize_paths(paths: Optional[Iterable[Path | str]]) -> list[Path]:
    if not paths:
        return []
    normalized = []
    for path in paths:
        if path is None:
            continue
        normalized.append(Path(path))
    return normalized


def _environment_snapshot() -> dict[str, Any]:
    dependencies = {name: version for name in _DEPENDENCIES if (version := _safe_version(name))}
    return {
        "smftools_version": __version__,
        "python_version": platform.python_version(),
        "platform": platform.platform(),
        "system": platform.system(),
        "release": platform.release(),
        "machine": platform.machine(),
        "dependencies": dependencies,
    }


def _infer_dtype(value: Any) -> str:
    if hasattr(value, "dtype"):
        return str(value.dtype)
    if hasattr(value, "dtypes"):
        try:
            return ",".join(str(dt) for dt in value.dtypes)
        except TypeError:
            return str(value.dtypes)
    return type(value).__name__


def _schema_snapshot(adata) -> dict[str, Any]:
    layers = {
        name: {"dtype": _infer_dtype(matrix), "shape": list(matrix.shape)}
        for name, matrix in adata.layers.items()
    }
    obsm = {
        name: {"dtype": _infer_dtype(matrix), "shape": list(matrix.shape)}
        for name, matrix in adata.obsm.items()
    }
    obsp = {
        name: {"dtype": _infer_dtype(matrix), "shape": list(matrix.shape)}
        for name, matrix in adata.obsp.items()
    }
    return {
        "layers": layers,
        "obs": {name: {"dtype": str(adata.obs[name].dtype)} for name in adata.obs.columns},
        "var": {name: {"dtype": str(adata.var[name].dtype)} for name in adata.var.columns},
        "obsm": obsm,
        "obsp": obsp,
        "uns_keys": sorted(adata.uns.keys()),
    }


def _runtime_schema_entries(items: dict[str, Any]) -> dict[str, dict[str, Any]]:
    return {
        key: {
            "dtype": _infer_dtype(value),
            "created_by": "runtime_snapshot",
            "modified_by": [],
            "notes": "",
            "requires": [],
            "optional_inputs": [],
        }
        for key, value in items.items()
    }


def _runtime_schema_dict(adata, step_name: str, output_path: Optional[Path] = None) -> dict:
    return {
        "schema_version": "runtime-1",
        "description": "Runtime AnnData schema snapshot (auto-generated).",
        "generated_at": _iso_timestamp(),
        "output_path": str(output_path) if output_path else None,
        "stages": {
            step_name: {
                "stage_requires": [],
                "obs": _runtime_schema_entries(
                    {name: adata.obs[name] for name in adata.obs.columns}
                ),
                "var": _runtime_schema_entries(
                    {name: adata.var[name] for name in adata.var.columns}
                ),
                "obsm": _runtime_schema_entries(dict(adata.obsm.items())),
                "varm": _runtime_schema_entries(dict(adata.varm.items())),
                "layers": _runtime_schema_entries(dict(adata.layers.items())),
                "obsp": _runtime_schema_entries(dict(adata.obsp.items())),
                "uns": _runtime_schema_entries(dict(adata.uns.items())),
            }
        },
    }


def append_runtime_schema_entry(
    adata,
    *,
    stage: str,
    location: str,
    key: str,
    created_by: str,
    used_structures: Optional[list[str]] = None,
    notes: Optional[str] = None,
) -> None:
    """Append a runtime schema entry describing a newly created structure.

    Args:
        adata: AnnData object to annotate.
        stage: Pipeline stage name (e.g. "load", "preprocess").
        location: AnnData slot ("obs", "var", "layers", "obsm", "varm", "obsp", "uns").
        key: Name of the structure within the slot.
        created_by: Function or module responsible for creating the structure.
        used_structures: List of structures consumed to create this structure.
        notes: Optional notes (e.g., first line of a docstring).
    """
    smftools_uns = adata.uns.setdefault("smftools", {})
    runtime_schema = smftools_uns.setdefault(
        "runtime_schema",
        {
            "schema_version": "runtime-1",
            "description": "Runtime AnnData schema annotations (recorded during execution).",
            "generated_at": _iso_timestamp(),
            "stages": {},
        },
    )
    stages = runtime_schema.setdefault("stages", {})
    stage_block = stages.setdefault(stage, {})
    slot_block = stage_block.setdefault(location, {})

    value = None
    if location == "obs" and key in adata.obs:
        value = adata.obs[key]
    elif location == "var" and key in adata.var:
        value = adata.var[key]
    elif location == "layers" and key in adata.layers:
        value = adata.layers[key]
    elif location == "obsm" and key in adata.obsm:
        value = adata.obsm[key]
    elif location == "varm" and key in adata.varm:
        value = adata.varm[key]
    elif location == "obsp" and key in adata.obsp:
        value = adata.obsp[key]
    elif location == "uns" and key in adata.uns:
        value = adata.uns[key]

    slot_block[key] = {
        "dtype": _infer_dtype(value) if value is not None else "unknown",
        "created_by": created_by,
        "used_structures": used_structures or [],
        "notes": notes or "",
        "recorded_at": _iso_timestamp(),
    }


def _format_yaml_value(value: Any) -> str:
    if value is None:
        return "null"
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        escaped = value.replace('"', '\\"')
        return f'"{escaped}"'
    return f'"{str(value)}"'


def _dump_yaml(data: Any, indent: int = 0) -> str:
    space = "  " * indent
    if isinstance(data, dict):
        lines = []
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                lines.append(f"{space}{key}:")
                lines.append(_dump_yaml(value, indent + 1))
            else:
                lines.append(f"{space}{key}: {_format_yaml_value(value)}")
        return "\n".join(lines)
    if isinstance(data, list):
        lines = []
        for item in data:
            if isinstance(item, (dict, list)):
                lines.append(f"{space}-")
                lines.append(_dump_yaml(item, indent + 1))
            else:
                lines.append(f"{space}- {_format_yaml_value(item)}")
        return "\n".join(lines)
    return f"{space}{_format_yaml_value(data)}"


def _schema_sidecar_path(output_path: Path) -> Path:
    name = output_path.name
    if name.endswith(".h5ad.gz"):
        base = name[: -len(".h5ad.gz")]
    elif name.endswith(".h5ad"):
        base = name[: -len(".h5ad")]
    else:
        base = output_path.stem
    return output_path.with_name(f"{base}.schema.yaml")


def write_runtime_schema_yaml(adata, output_path: Path, step_name: str) -> Path:
    runtime_schema = adata.uns.get("smftools", {}).get("runtime_schema")
    if isinstance(runtime_schema, dict):
        schema_dict = dict(runtime_schema)
        schema_dict.setdefault("output_path", str(output_path))
        schema_dict.setdefault("generated_at", _iso_timestamp())
        schema_dict.setdefault("schema_version", "runtime-1")
        schema_dict.setdefault(
            "description", "Runtime AnnData schema annotations (recorded during execution)."
        )
    else:
        schema_dict = _runtime_schema_dict(adata, step_name, output_path=output_path)
    yaml_text = _dump_yaml(schema_dict)
    schema_path = _schema_sidecar_path(output_path)
    schema_path.write_text(yaml_text + "\n", encoding="utf-8")
    return schema_path


def _append_unique_inputs(existing: list[dict[str, Any]], new_inputs: list[dict[str, Any]]) -> None:
    seen = {
        (item.get("path"), item.get("hash"), item.get("hash_mode"))
        for item in existing
        if item.get("path")
    }
    for item in new_inputs:
        key = (item.get("path"), item.get("hash"), item.get("hash_mode"))
        if key in seen:
            continue
        existing.append(item)
        seen.add(key)


def record_smftools_metadata(
    adata,
    *,
    step_name: str,
    cfg: Optional[Any] = None,
    config_path: Optional[str | Path] = None,
    input_paths: Optional[Iterable[Path | str]] = None,
    output_path: Optional[Path | str] = None,
    status: str = "ok",
    cli_argv: Optional[list[str]] = None,
) -> None:
    """Record structured smftools metadata into AnnData.uns.

    Args:
        adata: AnnData object to update.
        step_name: Pipeline step name (e.g. "load", "preprocess").
        cfg: Optional ExperimentConfig to capture resolved params.
        config_path: Path to the experiment config file used.
        input_paths: Optional iterable of input artifacts (e.g. h5ad inputs).
        output_path: Optional output path written by this step.
        status: Step status string ("ok" or "failed").
        cli_argv: Optional command argument vector for provenance.
    """
    smftools_uns = adata.uns.setdefault("smftools", {})
    timestamp = _iso_timestamp()

    if "created_by" not in smftools_uns:
        smftools_uns["created_by"] = {
            "version": __version__,
            "time": timestamp,
            "git_commit": _get_git_commit(),
        }

    smftools_uns.setdefault("environment", _environment_snapshot())
    smftools_uns.setdefault("schema_version", "1")
    smftools_uns.setdefault("schema_registry_version", SCHEMA_REGISTRY_VERSION)
    smftools_uns.setdefault(
        "schema_registry_resource",
        f"smftools.schema:{SCHEMA_REGISTRY_RESOURCE}",
    )
    smftools_uns["schema"] = _schema_snapshot(adata)

    provenance = smftools_uns.setdefault("provenance", {})
    inputs = provenance.setdefault("inputs", [])

    input_records: list[dict[str, Any]] = []
    if config_path:
        input_records.append(_path_record(Path(config_path), role="config"))
    if cfg is not None:
        cfg_paths = _normalize_paths(
            [
                cfg.input_data_path,
                cfg.fasta,
                cfg.sample_sheet_path,
                cfg.summary_file,
            ]
        )
        input_records.extend(_path_record(path, role="input") for path in cfg_paths)
        input_records.extend(
            _path_record(path, role="input") for path in _normalize_paths(cfg.input_files)
        )
    if input_paths:
        input_records.extend(_path_record(Path(path), role="input") for path in input_paths)

    _append_unique_inputs(inputs, input_records)

    outputs: dict[str, Any] = {
        "layers": sorted(adata.layers.keys()),
        "obs_columns": sorted(adata.obs.columns),
        "var_columns": sorted(adata.var.columns),
        "uns_keys": sorted(adata.uns.keys()),
        "obsm_keys": sorted(adata.obsm.keys()),
        "obsp_keys": sorted(adata.obsp.keys()),
    }
    if output_path is not None:
        out_path = Path(output_path)
        outputs["h5ad_path"] = str(out_path)
        outputs["schema_yaml_path"] = str(_schema_sidecar_path(out_path))

    runtime = {"device": getattr(cfg, "device", None), "threads": getattr(cfg, "threads", None)}
    if cli_argv is None:
        cli_argv = list(sys.argv)

    step_record = {
        "id": hashlib.sha1(f"{step_name}-{timestamp}".encode("utf-8")).hexdigest(),
        "time": timestamp,
        "step": step_name,
        "smftools_version": __version__,
        "params": cfg.to_dict() if cfg is not None else None,
        "inputs": input_records,
        "outputs": outputs,
        "runtime": runtime,
        "status": status,
        "cli_argv": cli_argv,
    }

    history = smftools_uns.setdefault("history", [])
    history.append(step_record)
smftools/plotting/__init__.py
CHANGED
@@ -1,8 +1,22 @@
 from .autocorrelation_plotting import *
+from .classifiers import (
+    plot_feature_importances_or_saliency,
+    plot_model_curves_from_adata,
+    plot_model_curves_from_adata_with_frequency_grid,
+    plot_model_performance,
+)
+from .general_plotting import (
+    combined_hmm_raw_clustermap,
+    combined_raw_clustermap,
+    plot_hmm_layers_rolling_by_sample_ref,
+)
 from .hmm_plotting import *
-from .position_stats import
-
-
+from .position_stats import (
+    plot_bar_relative_risk,
+    plot_positionwise_matrix,
+    plot_positionwise_matrix_grid,
+    plot_volcano_relative_risk,
+)
 from .qc_plotting import *

 __all__ = [
@@ -14,5 +28,5 @@ __all__ = [
     "plot_feature_importances_or_saliency",
     "plot_model_performance",
     "plot_model_curves_from_adata",
-    "plot_model_curves_from_adata_with_frequency_grid"
-]
+    "plot_model_curves_from_adata_with_frequency_grid",
+]
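
With the truncated from .position_stats import line removed and the star imports from classifiers, general_plotting, and position_stats replaced by explicit names, these functions are now direct re-exports of the package. An illustrative import (the names below are taken from the diff above):

# These resolve from the top-level plotting package in 0.2.5.
from smftools.plotting import (
    combined_raw_clustermap,
    plot_bar_relative_risk,
    plot_model_performance,
)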