sleap-roots-contracts 0.1.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: sleap-roots-contracts
3
+ Version: 0.1.0a0
4
+ Summary: Shared result + provenance contract for the sleap-roots <-> Bloom pipeline.
5
+ Keywords: sleap,roots,phenotyping,provenance,contract
6
+ Author: eberrigan
7
+ Author-email: eberrigan <eberrigan@salk.edu>
8
+ License-Expression: GPL-3.0-or-later
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Topic :: Scientific/Engineering
14
+ Classifier: Intended Audience :: Science/Research
15
+ Requires-Dist: pydantic>=2.7
16
+ Requires-Dist: pyyaml>=6.0
17
+ Requires-Python: >=3.11
18
+ Project-URL: Homepage, https://github.com/talmolab/sleap-roots-contracts
19
+ Project-URL: Issues, https://github.com/talmolab/sleap-roots-contracts/issues
20
+ Project-URL: Repository, https://github.com/talmolab/sleap-roots-contracts
21
+ Description-Content-Type: text/markdown
22
+
23
+ # sleap-roots-contracts
24
+
25
+ Shared **result + provenance contract** for the sleap-roots ↔ Bloom pipeline.
26
+
27
+ This is a small, dependency-light, Bloom-agnostic library that defines the shape of a
28
+ per-scan pipeline result and its provenance (Pydantic v2 models), emits a versioned JSON
29
+ Schema artifact, and ships a trait-definitions registry. The Python producers
30
+ (`sleap-roots-predict`, `sleap-roots-traits`) import it; Bloom consumes the emitted schema.
31
+
32
+ It is sub-project #1 of the sleap-roots ↔ Bloom integration program. Design and plan:
33
+ `docs/01-contract-library-design.md` and `docs/02-contract-library-plan.md`.
34
+
35
+ ## Develop
36
+
37
+ ```bash
38
+ uv sync
39
+ uv run pytest -v
40
+ uv run black --check src tests && uv run ruff check src tests
41
+ ```
42
+
43
+ ## Key ideas
44
+
45
+ - **Pydantic is canonical**; `schema/*.json` is generated and drift-guarded in CI.
46
+ - Trait **values** are long-format rows (no jsonb); provenance is a jsonb blob on the source.
47
+ - Hashes (`param_hash`, `idempotency_key`) are **producer-side only**; Bloom treats them as
48
+ opaque strings.
49
+ - Distributed via **PyPI** (no Docker image — this is a library).
@@ -0,0 +1,27 @@
1
+ # sleap-roots-contracts
2
+
3
+ Shared **result + provenance contract** for the sleap-roots ↔ Bloom pipeline.
4
+
5
+ This is a small, dependency-light, Bloom-agnostic library that defines the shape of a
6
+ per-scan pipeline result and its provenance (Pydantic v2 models), emits a versioned JSON
7
+ Schema artifact, and ships a trait-definitions registry. The Python producers
8
+ (`sleap-roots-predict`, `sleap-roots-traits`) import it; Bloom consumes the emitted schema.
9
+
10
+ It is sub-project #1 of the sleap-roots ↔ Bloom integration program. Design and plan:
11
+ `docs/01-contract-library-design.md` and `docs/02-contract-library-plan.md`.
12
+
13
+ ## Develop
14
+
15
+ ```bash
16
+ uv sync
17
+ uv run pytest -v
18
+ uv run black --check src tests && uv run ruff check src tests
19
+ ```
20
+
21
+ ## Key ideas
22
+
23
+ - **Pydantic is canonical**; `schema/*.json` is generated and drift-guarded in CI.
24
+ - Trait **values** are long-format rows (no jsonb); provenance is stored as a single jsonb blob on the trait-source row.
25
+ - Hashes (`param_hash`, `idempotency_key`) are **producer-side only**; Bloom treats them as
26
+ opaque strings.
27
+ - Distributed via **PyPI** (no Docker image — this is a library).
@@ -0,0 +1,55 @@
1
+ [project]
2
+ name = "sleap-roots-contracts"
3
+ version = "0.1.0a0"
4
+ description = "Shared result + provenance contract for the sleap-roots <-> Bloom pipeline."
5
+ readme = "README.md"
6
+ license = "GPL-3.0-or-later"
7
+ authors = [{ name = "eberrigan", email = "eberrigan@salk.edu" }]
8
+ requires-python = ">=3.11"
9
+ keywords = ["sleap", "roots", "phenotyping", "provenance", "contract"]
10
# No "License ::" classifier is listed: the SPDX expression in `license` above
# is the single source of licensing metadata. PEP 639 deprecates license
# classifiers once a license expression is declared, and build backends are
# allowed to reject a project that declares both.
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering",
    "Intended Audience :: Science/Research",
]
18
+ dependencies = [
19
+ "pydantic>=2.7",
20
+ "pyyaml>=6.0",
21
+ ]
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/talmolab/sleap-roots-contracts"
25
+ Repository = "https://github.com/talmolab/sleap-roots-contracts"
26
+ Issues = "https://github.com/talmolab/sleap-roots-contracts/issues"
27
+
28
+ [build-system]
29
+ requires = ["uv_build>=0.8.11,<0.9.0"]
30
+ build-backend = "uv_build"
31
+
32
+ [dependency-groups]
33
+ dev = [
34
+ "pytest>=8.4.1",
35
+ "pytest-cov>=6.2.1",
36
+ "ruff>=0.12.11",
37
+ "black>=25.1.0",
38
+ "jsonschema>=4.23",
39
+ "build>=1.3.0",
40
+ ]
41
+
42
+ [tool.black]
43
+ line-length = 88
44
+
45
+ [tool.ruff.lint]
46
+ select = ["D"]
47
+
48
+ [tool.ruff.lint.pydocstyle]
49
+ convention = "google"
50
+
51
+ [tool.ruff.lint.per-file-ignores]
52
+ "tests/**" = ["D"]
53
+
54
+ [tool.pytest.ini_options]
55
+ testpaths = ["tests"]
@@ -0,0 +1,32 @@
1
+ """Shared result + provenance contract for the sleap-roots <-> Bloom pipeline."""
2
+
3
+ from .hashing import NonCanonicalizableError, compute_param_hash
4
+ from .models import (
5
+ BlobRef,
6
+ InputRef,
7
+ ModelRef,
8
+ Provenance,
9
+ ResolvedParams,
10
+ ResultEnvelope,
11
+ TraitValue,
12
+ )
13
+ from .registry import TraitDefinition, load_registry, validate_trait
14
+
15
+ __version__ = "0.1.0a0"
16
+ __all__ = [
17
+ "BlobRef",
18
+ "InputRef",
19
+ "ModelRef",
20
+ "Provenance",
21
+ "ResolvedParams",
22
+ "ResultEnvelope",
23
+ "TraitValue",
24
+ "TraitDefinition",
25
+ "load_registry",
26
+ "validate_trait",
27
+ # Producer-side hashing surface (Python consumers of this package are the
28
+ # producers; Bloom consumes only the emitted JSON Schema).
29
+ "compute_param_hash",
30
+ "NonCanonicalizableError",
31
+ "__version__",
32
+ ]
@@ -0,0 +1,60 @@
1
+ """Canonical-JSON hashing for params (producer-side only; Bloom treats output as opaque)."""
2
+
3
+ import hashlib
4
+ import json
5
+ import math
6
+ from typing import Any
7
+
8
+
9
class NonCanonicalizableError(ValueError):
    """Signal that a value has no canonical JSON form.

    The normalization walk raises this for non-finite floats (NaN, +inf, -inf).
    It derives from ``ValueError``, so callers catching ``ValueError`` also
    catch it.
    """
11
+
12
+
13
+ def _normalize(obj: Any) -> Any:
14
+ """Recursively reject NaN/inf and normalize numbers to a fixed representation.
15
+
16
+ Integer-valued floats collapse to int (``1.0`` -> ``1``, ``-0.0`` -> ``0``) so
17
+ that type-variant params (int vs float) hash identically; ``bool`` is left
18
+ untouched. The walk is byte-stable within a CPython version.
19
+ """
20
+ if isinstance(obj, bool):
21
+ return obj
22
+ if isinstance(obj, float):
23
+ if not math.isfinite(obj):
24
+ raise NonCanonicalizableError(
25
+ f"NaN/inf not allowed in hashed values: {obj}"
26
+ )
27
+ if obj == int(obj):
28
+ return int(obj)
29
+ return obj
30
+ if isinstance(obj, dict):
31
+ return {key: _normalize(value) for key, value in obj.items()}
32
+ if isinstance(obj, (list, tuple)):
33
+ return [_normalize(value) for value in obj]
34
+ return obj
35
+
36
+
37
def canonical_json(values: Any) -> str:
    """Serialize a JSON value to a deterministic string.

    The input is normalized first, then dumped with sorted keys, compact
    separators (no whitespace), non-ASCII characters kept verbatim, and NaN/inf
    disallowed.
    """
    normalized = _normalize(values)
    return json.dumps(
        normalized,
        allow_nan=False,
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
    )
46
+
47
+
48
def sha256_hex(text: str) -> str:
    """Hash ``text`` (encoded as UTF-8) with SHA-256 and return the hex digest."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
51
+
52
+
53
def compute_param_hash(values: dict[str, Any]) -> str:
    """Hash a resolved-params dict via its canonical JSON form.

    Args:
        values: The resolved params to hash.

    Returns:
        Hex sha256 of the canonical JSON serialization of ``values``.

    Raises:
        NonCanonicalizableError: a value is NaN/inf.
        TypeError: a value is not JSON-serializable.
    """
    serialized = canonical_json(values)
    return sha256_hex(serialized)
@@ -0,0 +1,46 @@
1
+ """Deterministic idempotency-key derivation (producer-side only)."""
2
+
3
+ from .hashing import canonical_json, sha256_hex
4
+
5
+
6
def compute_idempotency_key(
    *,
    scan_key: str,
    images_checksum: str,
    models: list[tuple[str, str, str | None]],
    param_hash: str,
    predict_code_sha: str,
    traits_code_sha: str,
) -> str:
    """Hash a run's inputs, models, params, and code versions into one key.

    Args:
        scan_key: Producer-side scan identifier.
        images_checksum: Checksum over the input image set.
        models: (registry_id, version, weights_checksum) per model; the order of
            the list does not affect the result.
        param_hash: Output of compute_param_hash.
        predict_code_sha: Git sha of the predict producer.
        traits_code_sha: Git sha of the traits producer.

    Returns:
        Hex sha256 identity string.
    """
    # Each model becomes a three-element list. Sorting on the canonical JSON of
    # each triple makes the key independent of input order while keeping the
    # encoding unambiguous: fields are never joined with a delimiter, and a
    # None checksum (JSON null) stays distinct from an empty-string checksum.
    triples = [
        [registry_id, version, weights_checksum]
        for registry_id, version, weights_checksum in models
    ]
    triples.sort(key=canonical_json)

    identity = dict(
        scan_key=scan_key,
        images_checksum=images_checksum,
        models=triples,
        param_hash=param_hash,
        predict_code_sha=predict_code_sha,
        traits_code_sha=traits_code_sha,
    )
    return sha256_hex(canonical_json(identity))
@@ -0,0 +1,182 @@
1
+ """Pydantic contract models — the canonical source of truth."""
2
+
3
+ import math
4
+ from datetime import datetime
5
+ from typing import Any, Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, model_validator
8
+
9
+ from .hashing import compute_param_hash
10
+ from .identity import compute_idempotency_key
11
+
12
+ # Contract models are immutable: derived fields (param_hash, idempotency_key) and
13
+ # normalized values are guaranteed correct for the life of the instance, so the
14
+ # validators below use object.__setattr__ to set them past the frozen guard.
15
+ _FROZEN = ConfigDict(frozen=True)
16
+
17
+
18
class ModelRef(BaseModel):
    """Identity of one model used in a run (FK-able to a future Bloom models table)."""

    # Frozen: instances cannot be mutated after validation.
    model_config = _FROZEN

    # registry_id, version and weights_checksum are the three values that
    # Provenance feeds into the idempotency key for each model.
    registry_id: str
    version: str
    # Recorded alongside the model; not part of the idempotency key.
    sleap_nn_version: str
    root_type: str | None = None
    # None when no checksum was supplied.
    weights_checksum: str | None = None
28
+
29
+
30
class InputRef(BaseModel):
    """Pins the input data a run consumed, for reproducibility."""

    # Frozen: instances cannot be mutated after validation.
    model_config = _FROZEN

    # Identifiers of the images the run consumed.
    image_ids: list[str]
    # Checksum string; Provenance feeds it into the idempotency key.
    images_checksum: str
37
+
38
+
39
class ResolvedParams(BaseModel):
    """Fully-resolved run params plus their canonical hash."""

    model_config = _FROZEN

    values: dict[str, Any]
    # An empty string means "not supplied"; the validator always overwrites it
    # with the hash computed from ``values``.
    param_hash: str = ""

    @model_validator(mode="after")
    def _fill_hash(self) -> "ResolvedParams":
        """Set ``param_hash`` from ``values``, rejecting a supplied mismatch.

        Raises:
            ValueError: a non-empty ``param_hash`` was supplied and differs from
                the hash computed from ``values``.
        """
        computed = compute_param_hash(self.values)
        supplied_mismatch = bool(self.param_hash) and self.param_hash != computed
        if supplied_mismatch:
            raise ValueError(
                f"param_hash {self.param_hash!r} does not match values "
                f"(computed {computed!r})"
            )
        # The model is frozen, so assign through object.__setattr__.
        object.__setattr__(self, "param_hash", computed)
        return self
57
+
58
+
59
class Provenance(BaseModel):
    """Run provenance; serializes to cyl_trait_sources.metadata jsonb (sub-project #2)."""

    model_config = _FROZEN

    contract_version: str
    scan_key: str
    inputs: InputRef
    # An empty string means "not supplied"; the validator always overwrites it
    # with the derived key.
    idempotency_key: str = ""
    pipeline_run_id: str | None = None

    # predict stage
    predict_models: list[ModelRef]
    predict_container_digest: str
    predict_code_sha: str
    worker_request_id: str | None = None

    # traits stage
    traits_sleap_roots_version: str
    traits_container_digest: str
    traits_code_sha: str

    # orchestration (execution-model dependent)
    argo_workflow_uid: str | None = None
    argo_node_id: str | None = None

    params: ResolvedParams
    produced_at: datetime | None = None

    @model_validator(mode="after")
    def _fill_idempotency_key(self) -> "Provenance":
        """Derive ``idempotency_key``, rejecting a supplied mismatch.

        The key is computed from the scan key, the input images checksum, each
        predict model's (registry_id, version, weights_checksum), the param
        hash, and the two code shas.

        Raises:
            ValueError: a non-empty ``idempotency_key`` was supplied and differs
                from the derived key.
        """
        model_triples = [
            (model.registry_id, model.version, model.weights_checksum)
            for model in self.predict_models
        ]
        key = compute_idempotency_key(
            scan_key=self.scan_key,
            images_checksum=self.inputs.images_checksum,
            models=model_triples,
            param_hash=self.params.param_hash,
            predict_code_sha=self.predict_code_sha,
            traits_code_sha=self.traits_code_sha,
        )
        supplied_mismatch = bool(self.idempotency_key) and self.idempotency_key != key
        if supplied_mismatch:
            raise ValueError(
                f"idempotency_key {self.idempotency_key!r} does not match derived "
                f"value (computed {key!r})"
            )
        # The model is frozen, so assign through object.__setattr__.
        object.__setattr__(self, "idempotency_key", key)
        return self
108
+
109
+
110
class TraitValue(BaseModel):
    """One long-format trait row. NaN/inf normalize to None (-> SQL NULL)."""

    model_config = _FROZEN

    name: str
    value: float | None = None
    grain: Literal["scan", "image"] = "scan"
    scan_key: str

    @model_validator(mode="after")
    def _normalize_nonfinite(self) -> "TraitValue":
        """Replace a NaN or infinite ``value`` with ``None``."""
        if self.value is None or math.isfinite(self.value):
            return self
        # The model is frozen, so assign through object.__setattr__.
        object.__setattr__(self, "value", None)
        return self
127
+
128
+
129
+ BlobKind = Literal["predictions_slp", "labels", "h5", "qc_image"]
130
+
131
+ # Single source of truth for BlobRef's "at least one location" rule: both the
132
+ # emitted JSON Schema constraint and the runtime validator derive from this, so a
133
+ # field rename can't leave the schema and the model out of sync.
134
+ _BLOB_LOCATION_FIELDS = ("s3_location", "box_link")
135
+
136
+
137
+ def _blob_location_anyof() -> dict:
138
+ """Build the at-least-one-location ``anyOf`` from the location field names.
139
+
140
+ Each branch requires one location field and constrains it to a (non-null)
141
+ string, so an all-null object is rejected by the schema exactly as the model
142
+ validator rejects it.
143
+ """
144
+ return {
145
+ "anyOf": [
146
+ {"required": [field], "properties": {field: {"type": "string"}}}
147
+ for field in _BLOB_LOCATION_FIELDS
148
+ ]
149
+ }
150
+
151
+
152
class BlobRef(BaseModel):
    """Pointer to an intermediate artifact (rows in the #2 intermediates table)."""

    # The extra schema fragment puts the "at least one location" rule into the
    # emitted JSON Schema, so schema consumers (Bloom codegen) reject the same
    # objects the runtime validator below rejects.
    model_config = ConfigDict(frozen=True, json_schema_extra=_blob_location_anyof())

    kind: BlobKind
    scan_key: str
    # Location fields: at least one must be non-None (enforced below).
    s3_location: str | None = None
    box_link: str | None = None
    checksum: str | None = None
    file_size: int | None = None

    @model_validator(mode="after")
    def _require_location(self) -> "BlobRef":
        """Reject an instance whose location fields are all ``None``.

        Raises:
            ValueError: every field named in ``_BLOB_LOCATION_FIELDS`` is None.
        """
        has_location = any(
            getattr(self, field) is not None for field in _BLOB_LOCATION_FIELDS
        )
        if not has_location:
            raise ValueError(
                "BlobRef requires at least one of " + " or ".join(_BLOB_LOCATION_FIELDS)
            )
        return self
173
+
174
+
175
class ResultEnvelope(BaseModel):
    """One per-scan result: 1 envelope : 1 source row : 1 scan."""

    # Frozen: instances cannot be mutated after validation.
    model_config = _FROZEN

    # Provenance of the run that produced this result.
    provenance: Provenance
    # Long-format trait rows.
    traits: list[TraitValue]
    # Pointers to intermediate artifacts; defaults to an empty list.
    blobs: list[BlobRef] = []
@@ -0,0 +1,75 @@
1
+ """Trait definitions registry: name/dtype/range validation for trait values."""
2
+
3
+ import math
4
+ import warnings
5
+ from importlib import resources
6
+ from typing import Literal
7
+
8
+ import yaml
9
+ from pydantic import BaseModel
10
+
11
+
12
class TraitDefinition(BaseModel):
    """Definition of a known trait."""

    unit: str
    # validate_trait rejects values with a fractional part when this is "int".
    dtype: Literal["float", "int"]
    description: str
    # Optional bounds. validate_trait rejects only values strictly below ``min``
    # or strictly above ``max``, so a value equal to a bound is accepted.
    min: float | None = None
    max: float | None = None
20
+
21
+
22
def load_registry() -> dict[str, TraitDefinition]:
    """Load the packaged trait-definitions registry.

    Reads ``trait_definitions.yaml`` from the ``sleap_roots_contracts`` package
    data and builds one ``TraitDefinition`` per top-level key.

    Returns:
        Mapping of trait name to its definition; empty when the YAML document
        is empty.
    """
    # Decode explicitly as UTF-8 rather than relying on the default encoding
    # used when none is given, so the registry loads identically on every
    # platform.
    text = (
        resources.files("sleap_roots_contracts")
        .joinpath("trait_definitions.yaml")
        .read_text(encoding="utf-8")
    )
    # safe_load returns None for an empty document; treat that as "no traits".
    raw = yaml.safe_load(text) or {}
    return {name: TraitDefinition(**spec) for name, spec in raw.items()}
31
+
32
+
33
def validate_trait(
    name: str,
    value: object,
    registry: dict[str, TraitDefinition],
    on_unknown: Literal["warn", "error"] = "warn",
) -> None:
    """Validate a trait name + value against the registry.

    Args:
        name: Trait name to look up.
        value: Trait value to validate. ``None`` skips the numeric checks;
            otherwise it must be a finite, non-bool number (any other type is
            rejected) so this guards untrusted producer input, not just floats.
        registry: The loaded trait-definitions registry.
        on_unknown: Behavior for names absent from the registry ("warn" or "error").

    Raises:
        ValueError: unknown name (when on_unknown="error"), a non-numeric or
            non-finite value, a value that violates the definition's dtype, or an
            out-of-range value.
    """
    definition = registry.get(name)
    if definition is None:
        if on_unknown == "error":
            raise ValueError(f"Unknown trait: {name!r}")
        warnings.warn(
            f"Unknown trait not in registry: {name!r}", UserWarning, stacklevel=2
        )
        return
    if value is None:
        return
    # bool is an int subclass; reject it alongside other non-numeric types.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise ValueError(f"{name}={value!r} is not numeric")
    # Only floats can be non-finite or fractional. Leaving ints out of these
    # checks avoids converting them to float, which loses precision above 2**53
    # (wrongly rejecting a valid int for dtype "int") and raises OverflowError
    # for ints too large to represent as a float.
    if isinstance(value, float):
        # Reject NaN/inf: they slip past min/max comparisons (all comparisons False).
        if not math.isfinite(value):
            raise ValueError(f"{name}={value} is not finite")
        if definition.dtype == "int" and not value.is_integer():
            raise ValueError(f"{name}={value} is not an integer (dtype int)")
    if definition.min is not None and value < definition.min:
        raise ValueError(f"{name}={value} below min {definition.min}")
    if definition.max is not None and value > definition.max:
        raise ValueError(f"{name}={value} above max {definition.max}")
@@ -0,0 +1,53 @@
1
+ """Emit versioned JSON Schema artifacts from the Pydantic models."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from . import __version__
7
+ from .models import ResultEnvelope
8
+
9
+
10
+ def _default_schema_dir() -> Path:
11
+ """Locate the repo's ``schema/`` dir for the producer-side emitter.
12
+
13
+ Walks up from this file for the directory containing ``pyproject.toml`` (the
14
+ repo root in a source checkout / CI); falls back to the current working
15
+ directory when none is found (e.g. when the package is pip-installed, where a
16
+ ``parents[2]`` guess would land on an unwritable site-packages path).
17
+ """
18
+ here = Path(__file__).resolve()
19
+ for parent in here.parents:
20
+ if (parent / "pyproject.toml").is_file():
21
+ return parent / "schema"
22
+ return Path.cwd() / "schema"
23
+
24
+
25
+ SCHEMA_DIR = _default_schema_dir()
26
+ MODELS = {"result_envelope": ResultEnvelope}
27
+
28
+
29
def render(name: str) -> str:
    """Render the schema registered under ``name`` as a deterministic JSON string.

    Args:
        name: Key into ``MODELS`` selecting which model's schema to render.

    Returns:
        The JSON Schema, indented by two spaces with sorted keys, followed by a
        trailing newline.
    """
    schema = MODELS[name].model_json_schema()
    # The package version is carried as a path segment (not a URI fragment):
    # JSON Schema Draft 2020-12 forbids a non-empty fragment in "$id".
    schema_id = (
        "https://github.com/talmolab/sleap-roots-contracts/schema/"
        f"v{__version__}/{name}.schema.json"
    )
    schema.update(
        {
            # Declare the dialect so the artifact is self-describing and
            # consumers (and jsonschema.validate) do not default to Draft 7.
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "$id": schema_id,
        }
    )
    return json.dumps(schema, indent=2, sort_keys=True) + "\n"
42
+
43
+
44
def emit_schema(schema_dir: Path | None = None) -> None:
    """Write every registered schema into ``schema_dir`` as ``<name>.schema.json``.

    Args:
        schema_dir: Output directory; when ``None``, the module-level
            ``SCHEMA_DIR`` is used. The directory and any missing parents are
            created if needed.
    """
    target = SCHEMA_DIR if schema_dir is None else schema_dir
    target.mkdir(parents=True, exist_ok=True)
    for name in MODELS:
        destination = target / f"{name}.schema.json"
        destination.write_text(render(name), encoding="utf-8")
50
+
51
+
52
+ if __name__ == "__main__":
53
+ emit_schema()
@@ -0,0 +1,18 @@
1
+ # Seed trait definitions. Source of truth for full population: the traits computed by
2
+ # `sleap_roots` (expand by enumerating its trait outputs). on_unknown defaults to "warn".
3
+ primary_length:
4
+ unit: px
5
+ dtype: float
6
+ min: 0.0
7
+ description: Length of the primary root.
8
+ lateral_count:
9
+ unit: count
10
+ dtype: int
11
+ min: 0.0
12
+ description: Number of lateral roots detected.
13
+ crown_angle:
14
+ unit: deg
15
+ dtype: float
16
+ min: 0.0
17
+ max: 360.0
18
+ description: Crown root angle.