sleap-roots-contracts 0.1.0a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sleap_roots_contracts-0.1.0a0/PKG-INFO +49 -0
- sleap_roots_contracts-0.1.0a0/README.md +27 -0
- sleap_roots_contracts-0.1.0a0/pyproject.toml +55 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/__init__.py +32 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/hashing.py +60 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/identity.py +46 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/models.py +182 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/registry.py +75 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/schema.py +53 -0
- sleap_roots_contracts-0.1.0a0/src/sleap_roots_contracts/trait_definitions.yaml +18 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sleap-roots-contracts
|
|
3
|
+
Version: 0.1.0a0
|
|
4
|
+
Summary: Shared result + provenance contract for the sleap-roots <-> Bloom pipeline.
|
|
5
|
+
Keywords: sleap,roots,phenotyping,provenance,contract
|
|
6
|
+
Author: eberrigan
|
|
7
|
+
Author-email: eberrigan <eberrigan@salk.edu>
|
|
8
|
+
License-Expression: GPL-3.0-or-later
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Requires-Dist: pydantic>=2.7
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Project-URL: Homepage, https://github.com/talmolab/sleap-roots-contracts
|
|
19
|
+
Project-URL: Issues, https://github.com/talmolab/sleap-roots-contracts/issues
|
|
20
|
+
Project-URL: Repository, https://github.com/talmolab/sleap-roots-contracts
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# sleap-roots-contracts
|
|
24
|
+
|
|
25
|
+
Shared **result + provenance contract** for the sleap-roots ↔ Bloom pipeline.
|
|
26
|
+
|
|
27
|
+
This is a small, dependency-light, Bloom-agnostic library that defines the shape of a
|
|
28
|
+
per-scan pipeline result and its provenance (Pydantic v2 models), emits a versioned JSON
|
|
29
|
+
Schema artifact, and ships a trait-definitions registry. The Python producers
|
|
30
|
+
(`sleap-roots-predict`, `sleap-roots-traits`) import it; Bloom consumes the emitted schema.
|
|
31
|
+
|
|
32
|
+
It is sub-project #1 of the sleap-roots ↔ Bloom integration program. Design and plan:
|
|
33
|
+
`docs/01-contract-library-design.md` and `docs/02-contract-library-plan.md`.
|
|
34
|
+
|
|
35
|
+
## Develop
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv sync
|
|
39
|
+
uv run pytest -v
|
|
40
|
+
uv run black --check src tests && uv run ruff check src tests
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Key ideas
|
|
44
|
+
|
|
45
|
+
- **Pydantic is canonical**; `schema/*.json` is generated and drift-guarded in CI.
|
|
46
|
+
- Trait **values** are long-format rows (no jsonb); provenance is a jsonb blob on the source.
|
|
47
|
+
- Hashes (`param_hash`, `idempotency_key`) are **producer-side only**; Bloom treats them as
|
|
48
|
+
opaque strings.
|
|
49
|
+
- Distributed via **PyPI** (no Docker image — this is a library).
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# sleap-roots-contracts
|
|
2
|
+
|
|
3
|
+
Shared **result + provenance contract** for the sleap-roots ↔ Bloom pipeline.
|
|
4
|
+
|
|
5
|
+
This is a small, dependency-light, Bloom-agnostic library that defines the shape of a
|
|
6
|
+
per-scan pipeline result and its provenance (Pydantic v2 models), emits a versioned JSON
|
|
7
|
+
Schema artifact, and ships a trait-definitions registry. The Python producers
|
|
8
|
+
(`sleap-roots-predict`, `sleap-roots-traits`) import it; Bloom consumes the emitted schema.
|
|
9
|
+
|
|
10
|
+
It is sub-project #1 of the sleap-roots ↔ Bloom integration program. Design and plan:
|
|
11
|
+
`docs/01-contract-library-design.md` and `docs/02-contract-library-plan.md`.
|
|
12
|
+
|
|
13
|
+
## Develop
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
uv sync
|
|
17
|
+
uv run pytest -v
|
|
18
|
+
uv run black --check src tests && uv run ruff check src tests
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Key ideas
|
|
22
|
+
|
|
23
|
+
- **Pydantic is canonical**; `schema/*.json` is generated and drift-guarded in CI.
|
|
24
|
+
- Trait **values** are long-format rows (no jsonb); provenance is a jsonb blob on the source.
|
|
25
|
+
- Hashes (`param_hash`, `idempotency_key`) are **producer-side only**; Bloom treats them as
|
|
26
|
+
opaque strings.
|
|
27
|
+
- Distributed via **PyPI** (no Docker image — this is a library).
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sleap-roots-contracts"
|
|
3
|
+
version = "0.1.0a0"
|
|
4
|
+
description = "Shared result + provenance contract for the sleap-roots <-> Bloom pipeline."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "GPL-3.0-or-later"
|
|
7
|
+
authors = [{ name = "eberrigan", email = "eberrigan@salk.edu" }]
|
|
8
|
+
requires-python = ">=3.11"
|
|
9
|
+
keywords = ["sleap", "roots", "phenotyping", "provenance", "contract"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3.11",
|
|
15
|
+
"Topic :: Scientific/Engineering",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"pydantic>=2.7",
|
|
20
|
+
"pyyaml>=6.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://github.com/talmolab/sleap-roots-contracts"
|
|
25
|
+
Repository = "https://github.com/talmolab/sleap-roots-contracts"
|
|
26
|
+
Issues = "https://github.com/talmolab/sleap-roots-contracts/issues"
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["uv_build>=0.8.11,<0.9.0"]
|
|
30
|
+
build-backend = "uv_build"
|
|
31
|
+
|
|
32
|
+
[dependency-groups]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=8.4.1",
|
|
35
|
+
"pytest-cov>=6.2.1",
|
|
36
|
+
"ruff>=0.12.11",
|
|
37
|
+
"black>=25.1.0",
|
|
38
|
+
"jsonschema>=4.23",
|
|
39
|
+
"build>=1.3.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[tool.black]
|
|
43
|
+
line-length = 88
|
|
44
|
+
|
|
45
|
+
[tool.ruff.lint]
|
|
46
|
+
select = ["D"]
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint.pydocstyle]
|
|
49
|
+
convention = "google"
|
|
50
|
+
|
|
51
|
+
[tool.ruff.lint.per-file-ignores]
|
|
52
|
+
"tests/**" = ["D"]
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Shared result + provenance contract for the sleap-roots <-> Bloom pipeline."""
|
|
2
|
+
|
|
3
|
+
from .hashing import NonCanonicalizableError, compute_param_hash
|
|
4
|
+
from .models import (
|
|
5
|
+
BlobRef,
|
|
6
|
+
InputRef,
|
|
7
|
+
ModelRef,
|
|
8
|
+
Provenance,
|
|
9
|
+
ResolvedParams,
|
|
10
|
+
ResultEnvelope,
|
|
11
|
+
TraitValue,
|
|
12
|
+
)
|
|
13
|
+
from .registry import TraitDefinition, load_registry, validate_trait
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0a0"
|
|
16
|
+
__all__ = [
|
|
17
|
+
"BlobRef",
|
|
18
|
+
"InputRef",
|
|
19
|
+
"ModelRef",
|
|
20
|
+
"Provenance",
|
|
21
|
+
"ResolvedParams",
|
|
22
|
+
"ResultEnvelope",
|
|
23
|
+
"TraitValue",
|
|
24
|
+
"TraitDefinition",
|
|
25
|
+
"load_registry",
|
|
26
|
+
"validate_trait",
|
|
27
|
+
# Producer-side hashing surface (Python consumers of this package are the
|
|
28
|
+
# producers; Bloom consumes only the emitted JSON Schema).
|
|
29
|
+
"compute_param_hash",
|
|
30
|
+
"NonCanonicalizableError",
|
|
31
|
+
"__version__",
|
|
32
|
+
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Canonical-JSON hashing for params (producer-side only; Bloom treats output as opaque)."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NonCanonicalizableError(ValueError):
    """Raised when a value cannot be canonicalized (e.g. NaN/inf).

    Subclasses ``ValueError``, so callers that already catch ``ValueError``
    also catch this. Within this module it is raised by ``_normalize`` when it
    meets a non-finite float while preparing values for hashing.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize(obj: Any) -> Any:
    """Return a copy of ``obj`` with numbers in a single canonical form.

    Containers are walked recursively: dict values are normalized (keys are
    kept as-is) and lists/tuples both become lists. Floats that hold a whole
    number become ``int`` (``1.0`` -> ``1``, ``-0.0`` -> ``0``) so int-vs-float
    variants of the same parameter produce the same output. ``bool`` and every
    other type pass through unchanged.

    Args:
        obj: Any value; typically a JSON-like structure.

    Returns:
        The normalized value.

    Raises:
        NonCanonicalizableError: a float is NaN or +/-inf.
    """
    if isinstance(obj, dict):
        return {name: _normalize(item) for name, item in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_normalize(item) for item in obj]
    # bool never reaches the float branch; everything non-float is returned as-is.
    if isinstance(obj, bool) or not isinstance(obj, float):
        return obj
    if not math.isfinite(obj):
        raise NonCanonicalizableError(f"NaN/inf not allowed in hashed values: {obj}")
    whole = int(obj)
    return whole if obj == whole else obj
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def canonical_json(values: Any) -> str:
    """Serialize any JSON value to deterministic JSON text.

    The value is first passed through ``_normalize``; the result is dumped with
    sorted keys, compact separators, non-ASCII characters left unescaped, and
    NaN/inf disallowed.

    Args:
        values: The JSON-compatible value to serialize.

    Returns:
        The canonical JSON string.
    """
    normalized = _normalize(values)
    compact_separators = (",", ":")
    return json.dumps(
        normalized,
        allow_nan=False,
        ensure_ascii=False,
        separators=compact_separators,
        sort_keys=True,
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def sha256_hex(text: str) -> str:
    """Return the hex sha256 of a UTF-8 string.

    Args:
        text: The string to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def compute_param_hash(values: dict[str, Any]) -> str:
    """Compute the canonical, deterministic hash of a resolved-params dict.

    Args:
        values: The resolved parameters to hash.

    Returns:
        Hex sha256 of the canonical JSON form of ``values``.

    Raises:
        NonCanonicalizableError: a value is NaN/inf.
        TypeError: a value is not JSON-serializable.
    """
    serialized = canonical_json(values)
    return sha256_hex(serialized)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Deterministic idempotency-key derivation (producer-side only)."""
|
|
2
|
+
|
|
3
|
+
from .hashing import canonical_json, sha256_hex
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def compute_idempotency_key(
    *,
    scan_key: str,
    images_checksum: str,
    models: list[tuple[str, str, str | None]],
    param_hash: str,
    predict_code_sha: str,
    traits_code_sha: str,
) -> str:
    """Derive the run identity from inputs, models, params, and code versions.

    Args:
        scan_key: Producer-side scan identifier.
        images_checksum: Checksum over the input image set.
        models: (registry_id, version, weights_checksum) per model; order-independent.
        param_hash: Output of compute_param_hash.
        predict_code_sha: Git sha of the predict producer.
        traits_code_sha: Git sha of the traits producer.

    Returns:
        Hex sha256 identity string.
    """
    # Each model becomes a three-element list; unpacking insists on exactly
    # three items per entry. Keeping the triple structured (rather than joining
    # with a delimiter) avoids ambiguity and keeps None distinct from "".
    model_entries = [[rid, ver, checksum] for rid, ver, checksum in models]
    # Sorting by canonical JSON makes the key independent of the caller's order.
    model_entries.sort(key=canonical_json)
    payload = dict(
        scan_key=scan_key,
        images_checksum=images_checksum,
        models=model_entries,
        param_hash=param_hash,
        predict_code_sha=predict_code_sha,
        traits_code_sha=traits_code_sha,
    )
    serialized = canonical_json(payload)
    return sha256_hex(serialized)
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Pydantic contract models — the canonical source of truth."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, model_validator
|
|
8
|
+
|
|
9
|
+
from .hashing import compute_param_hash
|
|
10
|
+
from .identity import compute_idempotency_key
|
|
11
|
+
|
|
12
|
+
# Contract models are immutable: derived fields (param_hash, idempotency_key) and
|
|
13
|
+
# normalized values are guaranteed correct for the life of the instance, so the
|
|
14
|
+
# validators below use object.__setattr__ to set them past the frozen guard.
|
|
15
|
+
_FROZEN = ConfigDict(frozen=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ModelRef(BaseModel):
    """Identity of one model used in a run (FK-able to a future Bloom models table).

    Instances are frozen. ``Provenance`` feeds ``registry_id``, ``version`` and
    ``weights_checksum`` into the idempotency key; ``sleap_nn_version`` and
    ``root_type`` are recorded but are not part of that key.
    """

    model_config = _FROZEN

    # Required identity fields.
    registry_id: str
    version: str
    sleap_nn_version: str
    # Optional; None means "not provided".
    root_type: str | None = None
    weights_checksum: str | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class InputRef(BaseModel):
    """Pins the input data a run consumed, for reproducibility.

    Instances are frozen. ``Provenance`` uses ``images_checksum`` when deriving
    the idempotency key; ``image_ids`` is carried alongside but is not hashed
    into that key.
    """

    model_config = _FROZEN

    image_ids: list[str]
    images_checksum: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ResolvedParams(BaseModel):
    """Fully-resolved run params plus their canonical hash.

    ``param_hash`` may be left empty, in which case it is filled in from
    ``values``. A non-empty ``param_hash`` must equal the computed hash or
    validation fails.
    """

    model_config = _FROZEN

    values: dict[str, Any]
    param_hash: str = ""

    @model_validator(mode="after")
    def _fill_hash(self) -> "ResolvedParams":
        """Check any supplied hash against ``values`` and store the computed one."""
        expected = compute_param_hash(self.values)
        supplied = self.param_hash
        mismatch = bool(supplied) and supplied != expected
        if mismatch:
            raise ValueError(
                f"param_hash {supplied!r} does not match values "
                f"(computed {expected!r})"
            )
        # The model is frozen, so write past the guard via object.__setattr__.
        object.__setattr__(self, "param_hash", expected)
        return self
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class Provenance(BaseModel):
    """Run provenance; serializes to cyl_trait_sources.metadata jsonb (sub-project #2).

    ``idempotency_key`` may be left empty and is then derived from the scan key,
    the input image checksum, the predict models, the param hash and the two
    code shas. A non-empty supplied key must match the derived value.
    """

    model_config = _FROZEN

    contract_version: str
    scan_key: str
    inputs: InputRef
    idempotency_key: str = ""
    pipeline_run_id: str | None = None

    # predict stage
    predict_models: list[ModelRef]
    predict_container_digest: str
    predict_code_sha: str
    worker_request_id: str | None = None

    # traits stage
    traits_sleap_roots_version: str
    traits_container_digest: str
    traits_code_sha: str

    # orchestration (execution-model dependent)
    argo_workflow_uid: str | None = None
    argo_node_id: str | None = None

    params: ResolvedParams
    produced_at: datetime | None = None

    @model_validator(mode="after")
    def _fill_idempotency_key(self) -> "Provenance":
        """Check any supplied key against the derived one and store the derived key."""
        model_triples = [
            (model.registry_id, model.version, model.weights_checksum)
            for model in self.predict_models
        ]
        derived = compute_idempotency_key(
            scan_key=self.scan_key,
            images_checksum=self.inputs.images_checksum,
            models=model_triples,
            param_hash=self.params.param_hash,
            predict_code_sha=self.predict_code_sha,
            traits_code_sha=self.traits_code_sha,
        )
        supplied = self.idempotency_key
        mismatch = bool(supplied) and supplied != derived
        if mismatch:
            raise ValueError(
                f"idempotency_key {supplied!r} does not match derived "
                f"value (computed {derived!r})"
            )
        # The model is frozen, so write past the guard via object.__setattr__.
        object.__setattr__(self, "idempotency_key", derived)
        return self
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class TraitValue(BaseModel):
    """One long-format trait row. NaN/inf normalize to None (-> SQL NULL)."""

    model_config = _FROZEN

    name: str
    value: float | None = None
    grain: Literal["scan", "image"] = "scan"
    scan_key: str

    @model_validator(mode="after")
    def _normalize_nonfinite(self) -> "TraitValue":
        """Replace a NaN or infinite ``value`` with ``None``."""
        current = self.value
        if current is None or math.isfinite(current):
            return self
        # The model is frozen, so write past the guard via object.__setattr__.
        object.__setattr__(self, "value", None)
        return self
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
BlobKind = Literal["predictions_slp", "labels", "h5", "qc_image"]
|
|
130
|
+
|
|
131
|
+
# Single source of truth for BlobRef's "at least one location" rule: both the
|
|
132
|
+
# emitted JSON Schema constraint and the runtime validator derive from this, so a
|
|
133
|
+
# field rename can't leave the schema and the model out of sync.
|
|
134
|
+
_BLOB_LOCATION_FIELDS = ("s3_location", "box_link")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _blob_location_anyof() -> dict:
    """Build the at-least-one-location ``anyOf`` from the location field names.

    One branch is produced per name in ``_BLOB_LOCATION_FIELDS``. Each branch
    marks that field as required and types it as a string, so an object whose
    location fields are all null matches no branch.

    Returns:
        A dict with a single ``"anyOf"`` key holding the list of branches.
    """
    branches = [
        {"required": [location], "properties": {location: {"type": "string"}}}
        for location in _BLOB_LOCATION_FIELDS
    ]
    return {"anyOf": branches}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class BlobRef(BaseModel):
    """Pointer to an intermediate artifact (rows in the #2 intermediates table).

    At least one of the fields named in ``_BLOB_LOCATION_FIELDS`` must be set;
    the same rule is added to the emitted JSON Schema through
    ``json_schema_extra``.
    """

    # Put the "at least one location" rule into the emitted JSON Schema so that
    # schema consumers (Bloom codegen) reject the same objects Pydantic does.
    model_config = ConfigDict(frozen=True, json_schema_extra=_blob_location_anyof())

    kind: BlobKind
    scan_key: str
    s3_location: str | None = None
    box_link: str | None = None
    checksum: str | None = None
    file_size: int | None = None

    @model_validator(mode="after")
    def _require_location(self) -> "BlobRef":
        """Reject an instance on which every location field is ``None``."""
        has_location = any(
            getattr(self, location) is not None for location in _BLOB_LOCATION_FIELDS
        )
        if has_location:
            return self
        raise ValueError(
            "BlobRef requires at least one of " + " or ".join(_BLOB_LOCATION_FIELDS)
        )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class ResultEnvelope(BaseModel):
    """One per-scan result: 1 envelope : 1 source row : 1 scan.

    Bundles the run's provenance, its long-format trait rows, and pointers to
    any intermediate artifacts. This is the model from which the JSON Schema
    artifact is generated (see ``schema.MODELS``).
    """

    model_config = _FROZEN

    provenance: Provenance
    traits: list[TraitValue]
    # Optional; an envelope without artifacts has an empty list.
    blobs: list[BlobRef] = []
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Trait definitions registry: name/dtype/range validation for trait values."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import warnings
|
|
5
|
+
from importlib import resources
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TraitDefinition(BaseModel):
    """Definition of a known trait.

    ``validate_trait`` uses ``dtype``, ``min`` and ``max``: a ``dtype`` of
    ``"int"`` requires a whole-number value, and ``min``/``max`` are inclusive
    bounds (``None`` means unbounded on that side).
    """

    unit: str
    dtype: Literal["float", "int"]
    description: str
    # Inclusive bounds; None disables the corresponding check.
    min: float | None = None
    max: float | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def load_registry() -> dict[str, TraitDefinition]:
    """Load the packaged trait-definitions registry.

    Reads ``trait_definitions.yaml`` from the installed ``sleap_roots_contracts``
    package and builds one ``TraitDefinition`` per top-level key.

    Returns:
        Mapping of trait name to its definition; empty if the YAML file is empty.
    """
    # Decode explicitly as UTF-8: without an encoding the text is decoded with
    # the platform's locale encoding, which is not UTF-8 everywhere and would
    # corrupt (or fail on) any non-ASCII unit or description.
    text = (
        resources.files("sleap_roots_contracts")
        .joinpath("trait_definitions.yaml")
        .read_text(encoding="utf-8")
    )
    # An empty document parses to None; treat it as an empty registry.
    raw = yaml.safe_load(text) or {}
    return {name: TraitDefinition(**spec) for name, spec in raw.items()}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_trait(
    name: str,
    value: object,
    registry: dict[str, TraitDefinition],
    on_unknown: Literal["warn", "error"] = "warn",
) -> None:
    """Validate a trait name + value against the registry.

    Args:
        name: Trait name to look up.
        value: Trait value to validate. ``None`` skips the numeric checks;
            otherwise it must be a finite, non-bool number (any other type is
            rejected) so this guards untrusted producer input, not just floats.
        registry: The loaded trait-definitions registry.
        on_unknown: Behavior for names absent from the registry ("warn" or "error").

    Raises:
        ValueError: unknown name (when on_unknown="error"), a non-numeric or
            non-finite value, a value that violates the definition's dtype, or an
            out-of-range value.
    """
    definition = registry.get(name)
    if definition is None:
        if on_unknown == "error":
            raise ValueError(f"Unknown trait: {name!r}")
        warnings.warn(
            f"Unknown trait not in registry: {name!r}", UserWarning, stacklevel=2
        )
        return
    if value is None:
        return
    # bool is an int subclass; reject it alongside other non-numeric types.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise ValueError(f"{name}={value!r} is not numeric")
    # Only floats can be non-finite or fractional. Running these checks on an
    # int would convert it to float first, which loses precision above 2**53
    # (wrongly rejecting real integers) and overflows for very large ints.
    if isinstance(value, float):
        # Reject NaN/inf: they slip past min/max comparisons (all comparisons False).
        if not math.isfinite(value):
            raise ValueError(f"{name}={value} is not finite")
        if definition.dtype == "int" and not value.is_integer():
            raise ValueError(f"{name}={value} is not an integer (dtype int)")
    # Python compares int and float exactly, so no conversion is needed here.
    if definition.min is not None and value < definition.min:
        raise ValueError(f"{name}={value} below min {definition.min}")
    if definition.max is not None and value > definition.max:
        raise ValueError(f"{name}={value} above max {definition.max}")
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Emit versioned JSON Schema artifacts from the Pydantic models."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from . import __version__
|
|
7
|
+
from .models import ResultEnvelope
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _default_schema_dir() -> Path:
    """Locate the repo's ``schema/`` dir for the producer-side emitter.

    Searches the ancestors of this file, nearest first, for a directory that
    contains a ``pyproject.toml`` file and returns its ``schema`` child. If no
    ancestor qualifies (e.g. a pip-installed package), the ``schema`` child of
    the current working directory is returned instead.

    Returns:
        The path of the schema directory; it is not created here.
    """
    module_path = Path(__file__).resolve()
    project_root = next(
        (
            ancestor
            for ancestor in module_path.parents
            if (ancestor / "pyproject.toml").is_file()
        ),
        None,
    )
    base = Path.cwd() if project_root is None else project_root
    return base / "schema"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
SCHEMA_DIR = _default_schema_dir()
|
|
26
|
+
MODELS = {"result_envelope": ResultEnvelope}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def render(name: str) -> str:
    """Render one schema as a deterministic JSON string.

    Args:
        name: Key into ``MODELS`` selecting the model to render.

    Returns:
        The schema as indented, key-sorted JSON ending in a newline.
    """
    model = MODELS[name]
    schema = model.model_json_schema()
    # The package version is a path segment rather than a URI fragment: JSON
    # Schema Draft 2020-12 forbids a non-empty fragment in "$id".
    schema_id = (
        "https://github.com/talmolab/sleap-roots-contracts/schema/"
        f"v{__version__}/{name}.schema.json"
    )
    # Declare the dialect so consumers (and jsonschema.validate) select it
    # instead of defaulting to Draft 7.
    schema["$schema"] = "https://json-schema.org/draft/2020-12/schema"
    schema["$id"] = schema_id
    serialized = json.dumps(schema, indent=2, sort_keys=True)
    return serialized + "\n"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def emit_schema(schema_dir: Path | None = None) -> None:
    """Write all schemas to ``schema_dir`` (defaults to the repo's ``schema/``).

    Args:
        schema_dir: Output directory; when ``None`` the module-level
            ``SCHEMA_DIR`` is used. The directory is created if missing.
    """
    target = SCHEMA_DIR if schema_dir is None else schema_dir
    target.mkdir(parents=True, exist_ok=True)
    for name in MODELS:
        destination = target / f"{name}.schema.json"
        destination.write_text(render(name), encoding="utf-8")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
emit_schema()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Seed trait definitions. Source of truth for full population: the traits computed by
|
|
2
|
+
# `sleap_roots` (expand by enumerating its trait outputs). on_unknown defaults to "warn".
|
|
3
|
+
primary_length:
|
|
4
|
+
unit: px
|
|
5
|
+
dtype: float
|
|
6
|
+
min: 0.0
|
|
7
|
+
description: Length of the primary root.
|
|
8
|
+
lateral_count:
|
|
9
|
+
unit: count
|
|
10
|
+
dtype: int
|
|
11
|
+
min: 0.0
|
|
12
|
+
description: Number of lateral roots detected.
|
|
13
|
+
crown_angle:
|
|
14
|
+
unit: deg
|
|
15
|
+
dtype: float
|
|
16
|
+
min: 0.0
|
|
17
|
+
max: 360.0
|
|
18
|
+
description: Crown root angle.
|