sectum-ai-spec 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. sectum_ai_spec-0.1.1/.gitignore +45 -0
  2. sectum_ai_spec-0.1.1/PKG-INFO +62 -0
  3. sectum_ai_spec-0.1.1/README.md +43 -0
  4. sectum_ai_spec-0.1.1/pyproject.toml +32 -0
  5. sectum_ai_spec-0.1.1/src/sectum_ai/spec/__init__.py +89 -0
  6. sectum_ai_spec-0.1.1/src/sectum_ai/spec/_logging.py +157 -0
  7. sectum_ai_spec-0.1.1/src/sectum_ai/spec/enums.py +106 -0
  8. sectum_ai_spec-0.1.1/src/sectum_ai/spec/errors.py +39 -0
  9. sectum_ai_spec-0.1.1/src/sectum_ai/spec/hashing.py +60 -0
  10. sectum_ai_spec-0.1.1/src/sectum_ai/spec/models.py +381 -0
  11. sectum_ai_spec-0.1.1/src/sectum_ai/spec/py.typed +0 -0
  12. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schema.py +81 -0
  13. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/ControlMapping.schema.json +30 -0
  14. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/CorpusDocument.schema.json +66 -0
  15. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/EvidencePack.schema.json +484 -0
  16. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/Finding.schema.json +174 -0
  17. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/GroundTruthManifest.schema.json +131 -0
  18. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/Marker.schema.json +98 -0
  19. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/Observation.schema.json +97 -0
  20. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/ProbeStep.schema.json +53 -0
  21. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/RunMetrics.schema.json +139 -0
  22. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/RunResult.schema.json +378 -0
  23. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/Scenario.schema.json +144 -0
  24. sectum_ai_spec-0.1.1/src/sectum_ai/spec/schemas/Substrate.schema.json +374 -0
  25. sectum_ai_spec-0.1.1/src/sectum_ai/spec/stats.py +106 -0
@@ -0,0 +1,45 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ .eggs/
7
+
8
+ # Builds / distributions
9
+ build/
10
+ dist/
11
+ *.whl
12
+
13
+ # mkdocs build output
14
+ site/
15
+
16
+ # uv / virtual environments
17
+ .venv/
18
+ venv/
19
+
20
+ # Tooling caches
21
+ .mypy_cache/
22
+ .ruff_cache/
23
+ .pytest_cache/
24
+ .coverage
25
+ .coverage.*
26
+ coverage.xml
27
+ htmlcov/
28
+
29
+ # Editors / OS
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ .DS_Store
34
+
35
+ # Example run artifacts (generated by examples/*/run.sh, incl. the
36
+ # out-residual/ workdir from the docs/samples regeneration recipe)
37
+ examples/*/out/
38
+ examples/*/out-residual/
39
+
40
+ # Sectum CLI default workdir (generated by seed/probe/report; not source)
41
+ .sectum-ai/
42
+ examples/*/.sectum-ai/
43
+
44
+ # Project-local engineering spec (not shared)
45
+ CLAUDE.md
@@ -0,0 +1,62 @@
1
+ Metadata-Version: 2.4
2
+ Name: sectum-ai-spec
3
+ Version: 0.1.1
4
+ Summary: Sectum AI - Pydantic data models and JSON Schema for the verification spec.
5
+ Project-URL: Homepage, https://sectum.ai
6
+ Project-URL: Repository, https://github.com/sectum-ai/sectum-ai
7
+ Author: Sectum AI
8
+ License-Expression: Apache-2.0
9
+ Keywords: ai-security,multi-tenant,schema,verification
10
+ Classifier: Development Status :: 2 - Pre-Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Security
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: pydantic>=2.9
17
+ Requires-Dist: structlog>=24.4
18
+ Description-Content-Type: text/markdown
19
+
20
+ # sectum-ai-spec
21
+
22
+ Shared data models and JSON Schema for [Sectum AI](https://github.com/sectum-ai/sectum-ai),
23
+ the multi-tenant AI verification toolkit.
24
+
25
+ This distribution holds the Pydantic v2 models and exported JSON Schema that
26
+ every other Sectum package builds on — `Scenario`, `Marker`,
27
+ `GroundTruthManifest`, `ProbeStep`, `Observation`, `Finding`, `RunResult`, and
28
+ `EvidencePack` — plus the typed error hierarchy (`SectumError` and friends).
29
+ It is the lowest layer in the package graph and depends on nothing else in the
30
+ family.
31
+
32
+ ```sh
33
+ pip install sectum-ai-spec
34
+ ```
35
+
36
+ Most users install the umbrella package [`sectum-ai`](https://pypi.org/project/sectum-ai/)
37
+ instead, which pulls this in automatically.
38
+
39
+ ## JSON Schema
40
+
41
+ Every model is also published as a standalone JSON Schema document under
42
+ `sectum_ai/spec/schemas/<Model>.schema.json` (shipped in the wheel). Each carries a
43
+ `$schema` dialect (draft 2020-12) and a version-pinned `$id`
44
+ (`https://schemas.sectum.ai/<schema_version>/<Model>.schema.json`), so external
45
+ tooling can validate Sectum artifacts without importing Python:
46
+
47
+ ```python
48
+ from sectum_ai.spec import json_schemas
49
+ from sectum_ai.spec.schema import SCHEMA_DIR # the committed, packaged artifacts
50
+
51
+ finding_schema = json_schemas()["Finding"]
52
+ ```
53
+
54
+ The Pydantic models are authoritative; the schemas are generated from them.
55
+ Regenerate after a model change with `uv run python scripts/gen_schemas.py`
56
+ (a test fails if a committed artifact drifts). The `schema_version` field
57
+ versions the models — bump it for a backward-incompatible change, then regenerate.
58
+
59
+ - Documentation: <https://docs.sectum.ai>
60
+ - Source: <https://github.com/sectum-ai/sectum-ai>
61
+
62
+ Apache-2.0.
@@ -0,0 +1,43 @@
1
+ # sectum-ai-spec
2
+
3
+ Shared data models and JSON Schema for [Sectum AI](https://github.com/sectum-ai/sectum-ai),
4
+ the multi-tenant AI verification toolkit.
5
+
6
+ This distribution holds the Pydantic v2 models and exported JSON Schema that
7
+ every other Sectum package builds on — `Scenario`, `Marker`,
8
+ `GroundTruthManifest`, `ProbeStep`, `Observation`, `Finding`, `RunResult`, and
9
+ `EvidencePack` — plus the typed error hierarchy (`SectumError` and friends).
10
+ It is the lowest layer in the package graph and depends on nothing else in the
11
+ family.
12
+
13
+ ```sh
14
+ pip install sectum-ai-spec
15
+ ```
16
+
17
+ Most users install the umbrella package [`sectum-ai`](https://pypi.org/project/sectum-ai/)
18
+ instead, which pulls this in automatically.
19
+
20
+ ## JSON Schema
21
+
22
+ Every model is also published as a standalone JSON Schema document under
23
+ `sectum_ai/spec/schemas/<Model>.schema.json` (shipped in the wheel). Each carries a
24
+ `$schema` dialect (draft 2020-12) and a version-pinned `$id`
25
+ (`https://schemas.sectum.ai/<schema_version>/<Model>.schema.json`), so external
26
+ tooling can validate Sectum artifacts without importing Python:
27
+
28
+ ```python
29
+ from sectum_ai.spec import json_schemas
30
+ from sectum_ai.spec.schema import SCHEMA_DIR # the committed, packaged artifacts
31
+
32
+ finding_schema = json_schemas()["Finding"]
33
+ ```
34
+
35
+ The Pydantic models are authoritative; the schemas are generated from them.
36
+ Regenerate after a model change with `uv run python scripts/gen_schemas.py`
37
+ (a test fails if a committed artifact drifts). The `schema_version` field
38
+ versions the models — bump it for a backward-incompatible change, then regenerate.
39
+
40
+ - Documentation: <https://docs.sectum.ai>
41
+ - Source: <https://github.com/sectum-ai/sectum-ai>
42
+
43
+ Apache-2.0.
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "sectum-ai-spec"
7
+ version = "0.1.1"
8
+ description = "Sectum AI - Pydantic data models and JSON Schema for the verification spec."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "Apache-2.0"
12
+ authors = [{ name = "Sectum AI" }]
13
+ keywords = ["ai-security", "multi-tenant", "schema", "verification"]
14
+ classifiers = [
15
+ "Development Status :: 2 - Pre-Alpha",
16
+ "Intended Audience :: Developers",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Topic :: Security",
20
+ ]
21
+ dependencies = [
22
+ "pydantic>=2.9",
23
+ "structlog>=24.4",
24
+ ]
25
+
26
+ [project.urls]
27
+ Homepage = "https://sectum.ai"
28
+ Repository = "https://github.com/sectum-ai/sectum-ai"
29
+
30
+ [tool.hatch.build.targets.wheel]
31
+ only-include = ["src/sectum_ai"]
32
+ sources = ["src"]
@@ -0,0 +1,89 @@
1
+ """Sectum AI data models and JSON Schema (the ``sectum_ai.spec`` namespace package)."""
2
+
3
+ from sectum_ai.spec._logging import configure_logging, get_logger, redact_sensitive
4
+ from sectum_ai.spec.enums import (
5
+ AccessOutcome,
6
+ CoverageVerdict,
7
+ FindingStatus,
8
+ MarkerType,
9
+ PrincipalKind,
10
+ Severity,
11
+ Surface,
12
+ )
13
+ from sectum_ai.spec.errors import (
14
+ AdapterError,
15
+ ConfigError,
16
+ DetectionError,
17
+ ErasureUnsupported,
18
+ EvidenceError,
19
+ SectumError,
20
+ )
21
+ from sectum_ai.spec.hashing import canonical_hash, sha256_hex, to_canonical_json
22
+ from sectum_ai.spec.models import (
23
+ SCHEMA_VERSION,
24
+ ControlMapping,
25
+ CorpusDocument,
26
+ EvidencePack,
27
+ Finding,
28
+ GroundTruthManifest,
29
+ Marker,
30
+ Observation,
31
+ PlantedLocation,
32
+ Principal,
33
+ ProbeStep,
34
+ RunMetrics,
35
+ RunResult,
36
+ Scenario,
37
+ SectumModel,
38
+ SharedEntity,
39
+ Substrate,
40
+ SyntheticTenantSpec,
41
+ SyntheticUserSpec,
42
+ )
43
+ from sectum_ai.spec.schema import json_schemas, write_json_schemas
44
+ from sectum_ai.spec.stats import normal_quantile, wilson_interval
45
+
46
+ __all__ = [
47
+ "SCHEMA_VERSION",
48
+ "AccessOutcome",
49
+ "AdapterError",
50
+ "ConfigError",
51
+ "ControlMapping",
52
+ "CorpusDocument",
53
+ "CoverageVerdict",
54
+ "DetectionError",
55
+ "ErasureUnsupported",
56
+ "EvidenceError",
57
+ "EvidencePack",
58
+ "Finding",
59
+ "FindingStatus",
60
+ "GroundTruthManifest",
61
+ "Marker",
62
+ "MarkerType",
63
+ "Observation",
64
+ "PlantedLocation",
65
+ "Principal",
66
+ "PrincipalKind",
67
+ "ProbeStep",
68
+ "RunMetrics",
69
+ "RunResult",
70
+ "Scenario",
71
+ "SectumError",
72
+ "SectumModel",
73
+ "Severity",
74
+ "SharedEntity",
75
+ "Substrate",
76
+ "Surface",
77
+ "SyntheticTenantSpec",
78
+ "SyntheticUserSpec",
79
+ "canonical_hash",
80
+ "configure_logging",
81
+ "get_logger",
82
+ "json_schemas",
83
+ "normal_quantile",
84
+ "redact_sensitive",
85
+ "sha256_hex",
86
+ "to_canonical_json",
87
+ "wilson_interval",
88
+ "write_json_schemas",
89
+ ]
@@ -0,0 +1,157 @@
1
+ """Structured logging for Sectum AI (the engineering spec, section 16).
2
+
3
+ Sectum is a security product, so its logs must never leak secrets or raw tenant
4
+ content. This module configures :mod:`structlog` so that:
5
+
6
+ - logs render as JSON (machine-readable, SIEM-friendly) to **stderr** — stdout is
7
+ reserved for a command's own output (for example ``probe --output json``);
8
+ - ``DEBUG`` is suppressed by default (section 16: "DEBUG must be off by default");
9
+ - a redaction processor drops secret-bearing keys and raw tenant content from
10
+ every event emitted above ``DEBUG``.
11
+
12
+ Libraries obtain a logger with :func:`get_logger` and never configure logging
13
+ themselves; the application entry point (the CLI) calls :func:`configure_logging`
14
+ once at start-up. The test suite configures it through a fixture so output is
15
+ deterministic and stdout stays clean.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ import re
22
+ import sys
23
+ from typing import cast
24
+
25
+ import structlog
26
+ from structlog.types import EventDict, FilteringBoundLogger, Processor, WrappedLogger
27
+
28
# Key names whose values hold a secret or canary plaintext. They are redacted on
# every event above DEBUG.
_SECRET_KEYS = frozenset(
    (
        "secret",
        "token",
        "password",
        "passphrase",
        "api_key",
        "apikey",
        "authorization",
        "credential",
        "credentials",
        "plaintext",
        "canary",
        "marker_plaintext",
    )
)
# Key names whose values hold raw tenant content. They are redacted on every
# event above DEBUG (section 16).
_TENANT_CONTENT_KEYS = frozenset(
    (
        "content",
        "raw_response",
        "answer",
        "query",
        "prompt",
        "text",
        "evidence_span",
    )
)
# The full deny-list consulted by the key-name redaction pass.
_SENSITIVE_KEYS = _SECRET_KEYS.union(_TENANT_CONTENT_KEYS)
# Placeholder written in place of any redacted value or scrubbed token.
_REDACTED = "<redacted>"
51
+
52
# Value shapes of canaries and secrets. These are scrubbed wherever they occur:
# nested under a benign key, inside the event message, or in an exception's
# text. A careless future call site therefore cannot leak one even when it does
# not use a sensitive key name. The shapes mirror ``sectum_ai.substrate.markers``:
# the HARD canary's branded prefix plus the three secret-canary credential
# formats. Entity canaries look like ordinary text, so they are handled by the
# key-name pass (``plaintext``/``canary``/…) rather than value-scrubbed, which
# avoids false positives on legitimate prose.
#
# The ``sk-`` and ``AKIA`` alternatives use a negative lookbehind for an
# alphanumeric character, so they match a standalone token and not the tail of
# a benign identifier (``task-``/``disk-`` followed by a long id, or ``…AKIA…``
# mid-word). An underscore is not alphanumeric here, so ``api_sk-…`` is still
# caught.
_SECRET_VALUE_RE = re.compile(
    "|".join(
        (
            r"SECTUM-CANARY-[A-Z2-7]+",  # HARD canary (branded high-entropy token)
            r"(?<![A-Za-z0-9])sk-[A-Za-z0-9]{20,}",  # OpenAI-style API key
            r"(?<![A-Za-z0-9])AKIA[A-Z0-9]{16}",  # AWS access-key id
            r"\b9\d{2}-\d{2}-\d{4}\b",  # non-issuable 9xx US SSN shape
        )
    )
)
69
+
70
+
71
def _scrub_text(text: str) -> str:
    """Return ``text`` with each canary/secret-shaped token replaced by the redaction marker."""
    scrubbed = _SECRET_VALUE_RE.sub(_REDACTED, text)
    return scrubbed
74
+
75
+
76
+ def _redact_value(value: object) -> object:
77
+ """Recursively redact sensitive keys and scrub secret-shaped strings.
78
+
79
+ Walks nested mappings/sequences so a sensitive key or a secret-shaped value is
80
+ caught at any depth, not just at the top level. A sensitive key is dropped
81
+ wholesale; strings (including the event message) and the text of exception
82
+ values are scrubbed for embedded secret shapes. Other scalars pass through.
83
+ """
84
+ if isinstance(value, dict):
85
+ return {
86
+ key: (
87
+ _REDACTED
88
+ if isinstance(key, str) and key.lower() in _SENSITIVE_KEYS
89
+ else _redact_value(val)
90
+ )
91
+ for key, val in value.items()
92
+ }
93
+ if isinstance(value, list):
94
+ return [_redact_value(item) for item in value]
95
+ if isinstance(value, tuple):
96
+ return tuple(_redact_value(item) for item in value)
97
+ if isinstance(value, str):
98
+ return _scrub_text(value)
99
+ if isinstance(value, BaseException):
100
+ return _scrub_text(f"{type(value).__name__}: {value}")
101
+ return value
102
+
103
+
104
def redact_sensitive(_logger: WrappedLogger, _method: str, event_dict: EventDict) -> EventDict:
    """structlog processor: redact secrets and raw tenant content on non-DEBUG events.

    An event whose ``level`` entry equals ``"debug"`` is returned untouched.
    DEBUG is opt-in and off by default, so verbose local troubleshooting may
    see raw values. Every other event is passed through the recursive redactor
    (the engineering spec, section 16: "never log secrets or raw tenant content
    above DEBUG"). ``add_log_level`` must run earlier in the processor chain so
    that ``level`` is present in the event dict.

    Redaction is recursive and value-aware (defense in depth). A sensitive key
    is redacted at any nesting depth. A canary/secret-shaped token is scrubbed
    wherever it appears: under a benign key, inside the event message, or in an
    exception's text.

    The ``_logger`` and ``_method`` arguments are part of the processor
    signature and are not used.
    """
    is_debug_event = event_dict.get("level") == "debug"
    if not is_debug_event:
        return cast(EventDict, _redact_value(event_dict))
    return event_dict
122
+
123
+
124
def configure_logging(*, debug: bool = False, json_output: bool = True) -> None:
    """Set up process-wide structured logging; call once from the entry point.

    Args:
        debug: When true, DEBUG-level events are emitted. When false (the
            default, per section 16), the minimum level is INFO.
        json_output: When true (the default), events are rendered as JSON;
            otherwise a plain, uncoloured console renderer is used.

    Logs always go to stderr, which leaves stdout free for a command's own
    output.
    """
    renderer: Processor
    if json_output:
        renderer = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer(colors=False)

    minimum_level = logging.DEBUG if debug else logging.INFO

    # Order matters: add_log_level must precede redact_sensitive (which reads
    # ``level``), and the renderer must come last.
    pipeline: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        redact_sensitive,
        renderer,
    ]
    structlog.configure(
        processors=pipeline,
        wrapper_class=structlog.make_filtering_bound_logger(minimum_level),
        logger_factory=structlog.PrintLoggerFactory(file=sys.stderr),
        cache_logger_on_first_use=True,
    )
151
+
152
+
153
def get_logger(name: str | None = None) -> FilteringBoundLogger:
    """Return a structured logger; by convention ``name`` is the caller's ``__name__``."""
    # structlog.get_logger is annotated as returning Any because it hands back a
    # lazy proxy. Once configure_logging has run, the bound logger is a
    # FilteringBoundLogger, so the cast only narrows the static type.
    lazy_proxy = structlog.get_logger(name)
    return cast(FilteringBoundLogger, lazy_proxy)
@@ -0,0 +1,106 @@
1
+ """Enumerations shared across the Sectum AI data models."""
2
+
3
+ from enum import StrEnum
4
+
5
+
6
+ class MarkerType(StrEnum):
7
+ """The three canary marker types (the engineering spec, section 6.3)."""
8
+
9
+ HARD_CANARY = "HARD_CANARY"
10
+ ENTITY_CANARY = "ENTITY_CANARY"
11
+ SECRET_CANARY = "SECRET_CANARY"
12
+
13
+
14
+ class Severity(StrEnum):
15
+ """Finding severity levels."""
16
+
17
+ CRITICAL = "critical"
18
+ HIGH = "high"
19
+ MEDIUM = "medium"
20
+ LOW = "low"
21
+ INFO = "info"
22
+
23
+
24
+ class FindingStatus(StrEnum):
25
+ """Whether a finding is manifest-confirmed or merely a candidate."""
26
+
27
+ CONFIRMED = "confirmed"
28
+ UNVERIFIED = "unverified"
29
+
30
+
31
+ class AccessOutcome(StrEnum):
32
+ """How an authorization-boundary fetch resolved (the engineering spec, Class 1).
33
+
34
+ A direct cross-tenant fetch should be *denied*. The spec calls out the
35
+ ambiguity a competitor's scanner misses: a backend that returns ``200`` with
36
+ an empty body looks like a deny but never enforced one. ``RETURNED`` is the
37
+ object actually surfacing (a leak if the object is foreign); ``EMPTY`` is the
38
+ ambiguous empty result; ``DENIED`` is an explicit authorization refusal.
39
+ """
40
+
41
+ RETURNED = "returned"
42
+ EMPTY = "empty"
43
+ DENIED = "denied"
44
+
45
+
46
+ class PrincipalKind(StrEnum):
47
+ """The kind of isolation boundary a principal represents.
48
+
49
+ Sectum verifies that one principal's data does not reach another. A tenant
50
+ is the top-level principal; a user is a sub-principal within a tenant. The
51
+ substrate, detection, and surfaces are identical at either granularity -
52
+ only the boundary being verified differs (ADR-0006).
53
+ """
54
+
55
+ TENANT = "tenant"
56
+ USER = "user"
57
+
58
+
59
+ class Surface(StrEnum):
60
+ """A place tenant data can live or leak (the engineering spec, section 23)."""
61
+
62
+ API = "api"
63
+ VECTOR_DB = "vector_db"
64
+ RAG_PIPELINE = "rag_pipeline"
65
+ PROMPT_LOGS = "prompt_logs"
66
+ SEMANTIC_CACHE = "semantic_cache"
67
+ KV_CACHE = "kv_cache"
68
+ AGENT_MEMORY = "agent_memory"
69
+ AGENT_FRAMEWORK = "agent_framework"
70
+ MCP = "mcp"
71
+ MODEL_ADAPTER = "model_adapter"
72
+ EVAL_SET = "eval_set"
73
+ BACKUP = "backup"
74
+ SEARCH_INDEX = "search_index"
75
+ TRACING = "tracing"
76
+
77
+
78
+ class CoverageVerdict(StrEnum):
79
+ """The per-surface coverage verdict in a Class 11 erasure attestation.
80
+
81
+ A coverage block (surface -> verdict) makes the attestation honest about
82
+ *what it actually verified*: an attestation must never imply more coverage
83
+ than it has. The anti-over-claim guarantee is that a surface which was not
84
+ scanned can only ever be :data:`NOT_COVERED` - never :data:`ERASED` (the
85
+ engineering spec, section 7, Class 11).
86
+
87
+ The first three values are the tri-state every surface resolves to;
88
+ :data:`ATTESTABLE_WITH_CAVEAT` is the fourth, DPO-facing distinction the
89
+ spec calls out at Class 11 hiding place #8: the surface *was* scanned and a
90
+ baseline existed, but the backend exposes no programmatic per-tenant erasure
91
+ API, so the data is presumed retained until it ages out of the backend's
92
+ retention window. It is a documented backend limitation, never a clean pass
93
+ and never conflated with a residual-after-erasure *failure*.
94
+ """
95
+
96
+ ERASED = "ERASED"
97
+ """Covered and clean: a baseline existed and no marker survived erasure."""
98
+
99
+ RESIDUAL = "RESIDUAL"
100
+ """Covered and failed: the backend was asked to erase and a marker survived."""
101
+
102
+ ATTESTABLE_WITH_CAVEAT = "ATTESTABLE_WITH_CAVEAT"
103
+ """Covered, but the backend exposes no per-tenant erasure API (hiding place #8)."""
104
+
105
+ NOT_COVERED = "NOT_COVERED"
106
+ """Out of scope, not scanned, or no pre-erasure baseline - never ``ERASED``."""
@@ -0,0 +1,39 @@
1
+ """The Sectum AI typed exception hierarchy (the engineering spec, section 16).
2
+
3
+ Every error Sectum AI raises for a domain or runtime condition derives from
4
+ ``SectumError``, so one ``except`` clause can catch the whole family. The
5
+ hierarchy lives in ``sectum-ai-spec`` - the lowest package in the acyclic
6
+ package graph (ADR-0004) - so every other package raises these without a cycle.
7
+ """
8
+
9
+
10
class SectumError(Exception):
    """Root of the Sectum AI exception family; every domain error derives from it."""
12
+
13
+
14
class ConfigError(SectumError):
    """Raised when a scenario or configuration value is absent or not valid."""
16
+
17
+
18
class AdapterError(SectumError):
    """Raised when an adapter is missing, is misconfigured, or fails while running."""
20
+
21
+
22
class ErasureUnsupported(AdapterError):
    """The backend offers no programmatic per-tenant erasure API.

    An adapter's ``delete`` raises this when the backend cannot purge a
    tenant's data through its API, because erasure is governed by a retention
    policy or a manual console action instead. The Class 11 erasure probe
    catches it and itemises the surface as *attestable-with-caveat* (the
    engineering spec, section 7, Class 11, hiding place #8). It does not report
    a false erasure success: the data is presumed retained until proven
    otherwise.
    """
32
+
33
+
34
class EvidenceError(SectumError):
    """Raised when a tamper-evident evidence pack cannot be built or verified."""
36
+
37
+
38
class DetectionError(SectumError):
    """Raised when the leak-detection pipeline is unable to produce a result."""
@@ -0,0 +1,60 @@
1
+ """Canonical serialization and hashing for Sectum AI models.
2
+
3
+ Hashes are computed over a canonical JSON form (sorted keys, no insignificant
4
+ whitespace) so the same logical content always yields the same digest. This is
5
+ the foundation of the reproducibility contract (the engineering spec, section 6.5) and the
6
+ evidence chain (the engineering spec, section 8).
7
+
8
+ Finite floats need no rounding: ``json.dumps`` emits CPython's shortest
9
+ round-tripping ``repr``, which is deterministic, so the same float value
10
+ canonicalizes identically across machines and Python versions. Canonicalization
11
+ is by ``repr``, which distinguishes ``-0.0`` from ``0.0`` (the one case where two
12
+ IEEE-754-equal values serialize differently); no metric carries a signed zero
13
+ whose sign is meaningful, so this is harmless. Rounding would only risk colliding
14
+ genuinely distinct metrics, so it is deliberately not done (ADR-0021). Non-finite
15
+ floats (NaN/Infinity) have no valid, injective JSON form and are refused below.
16
+ """
17
+
18
+ import hashlib
19
+ import json
20
+ from typing import Any
21
+
22
+ from pydantic import BaseModel
23
+
24
+
25
def _reject_coerced_keys(data: Any) -> None:
    """Raise ``TypeError`` if any dict nested inside ``data`` has a non-``str`` key.

    Call this only on data that ``json.dumps`` has already accepted. That call
    rejects circular structures, so this walk is guaranteed to terminate. The
    walk is iterative so deeply nested input cannot exhaust the Python stack.
    """
    pending: list[Any] = [data]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            for key, value in node.items():
                if not isinstance(key, str):
                    raise TypeError(
                        "cannot canonicalize a non-JSON-native value: "
                        f"dict key {key!r} is not a str"
                    )
                pending.append(value)
        elif isinstance(node, (list, tuple)):
            pending.extend(node)


def to_canonical_json(obj: BaseModel | dict[str, Any] | list[Any]) -> bytes:
    """Serialize an object to canonical JSON bytes: sorted keys, UTF-8, compact.

    A ``BaseModel`` is first normalized with ``model_dump(mode="json")``; a raw
    dict or list is serialized as given.

    Raises:
        ValueError: ``json.dumps`` refused the data. The message is worded for
            the non-finite float (NaN/Infinity) case.
        TypeError: the data holds a value JSON cannot represent natively, or a
            dict whose key is not a ``str``.
    """
    data: Any = obj.model_dump(mode="json") if isinstance(obj, BaseModel) else obj
    try:
        text = json.dumps(
            data,
            sort_keys=True,
            separators=(",", ":"),
            ensure_ascii=False,
            allow_nan=False,
        )
    except ValueError as error:
        # A non-finite float (NaN/Infinity) would serialize as a JavaScript
        # literal that is not valid JSON (RFC 8259): a third-party verifier
        # using a strict parser could not reproduce the digest, and every NaN
        # collapses to the same token (a non-injective canonical form). Refuse
        # it so the canonical form stays valid and injective.
        raise ValueError(f"cannot canonicalize a non-finite float: {error}") from error
    except TypeError as error:
        # A raw dict/list (a BaseModel is normalized first via
        # model_dump(mode="json")) can carry a value json cannot serialize - a
        # UUID, datetime, bytes, or an unsupported key type. json raises a bare
        # TypeError; surface it as a clear canonicalization failure so a caller
        # sees why the digest could not be computed.
        raise TypeError(f"cannot canonicalize a non-JSON-native value: {error}") from error
    # json.dumps does NOT reject int/float/bool/None keys: it coerces them to
    # strings, and sort_keys orders the keys *before* that coercion. So
    # {2: .., 10: ..} and {"2": .., "10": ..} describe the same JSON object but
    # serialize in different key order, and {1: ..} collides with {"1": ..}. A
    # verifier that parses the JSON and re-hashes it would get a different
    # digest. Refuse such keys so the canonical form is stable under a JSON
    # round trip.
    _reject_coerced_keys(data)
    return text.encode("utf-8")
51
+
52
+
53
def sha256_hex(data: bytes) -> str:
    """Hash ``data`` with SHA-256 and return the digest as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
56
+
57
+
58
def canonical_hash(obj: BaseModel | dict[str, Any] | list[Any]) -> str:
    """Return the SHA-256 hex digest computed over the canonical JSON bytes of ``obj``."""
    canonical_bytes = to_canonical_json(obj)
    return sha256_hex(canonical_bytes)