PyPI - sectum-ai-spec - Versions diffs - 0.1.1__tar.gz - Mend

sectum-ai-spec 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

sectum_ai_spec-0.1.1/.gitignore ADDED Viewed

@@ -0,0 +1,45 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+.eggs/
+# Builds / distributions
+build/
+dist/
+*.whl
+# mkdocs build output
+site/
+# uv / virtual environments
+.venv/
+venv/
+# Tooling caches
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+.coverage
+.coverage.*
+coverage.xml
+htmlcov/
+# Editors / OS
+.idea/
+.vscode/
+*.swp
+.DS_Store
+# Example run artifacts (generated by examples/*/run.sh, incl. the
+# out-residual/ workdir from the docs/samples regeneration recipe)
+examples/*/out/
+examples/*/out-residual/
+# Sectum CLI default workdir (generated by seed/probe/report; not source)
+.sectum-ai/
+examples/*/.sectum-ai/
+# Project-local engineering spec (not shared)
+CLAUDE.md

sectum_ai_spec-0.1.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,62 @@
+Metadata-Version: 2.4
+Name: sectum-ai-spec
+Version: 0.1.1
+Summary: Sectum AI - Pydantic data models and JSON Schema for the verification spec.
+Project-URL: Homepage, https://sectum.ai
+Project-URL: Repository, https://github.com/sectum-ai/sectum-ai
+Author: Sectum AI
+License-Expression: Apache-2.0
+Keywords: ai-security,multi-tenant,schema,verification
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Security
+Requires-Python: >=3.12
+Requires-Dist: pydantic>=2.9
+Requires-Dist: structlog>=24.4
+Description-Content-Type: text/markdown
+# sectum-ai-spec
+Shared data models and JSON Schema for [Sectum AI](https://github.com/sectum-ai/sectum-ai),
+the multi-tenant AI verification toolkit.
+This distribution holds the Pydantic v2 models and exported JSON Schema that
+every other Sectum package builds on — `Scenario`, `Marker`,
+`GroundTruthManifest`, `ProbeStep`, `Observation`, `Finding`, `RunResult`, and
+`EvidencePack` — plus the typed error hierarchy (`SectumError` and friends).
+It is the lowest layer in the package graph and depends on nothing else in the
+family.
+```sh
+pip install sectum-ai-spec
+```
+Most users install the umbrella package [`sectum-ai`](https://pypi.org/project/sectum-ai/)
+instead, which pulls this in automatically.
+## JSON Schema
+Every model is also published as a standalone JSON Schema document under
+`sectum_ai/spec/schemas/<Model>.schema.json` (shipped in the wheel). Each carries a
+`$schema` dialect (draft 2020-12) and a version-pinned `$id`
+(`https://schemas.sectum.ai/<schema_version>/<Model>.schema.json`), so external
+tooling can validate Sectum artifacts without importing Python:
+```python
+from sectum_ai.spec import json_schemas
+from sectum_ai.spec.schema import SCHEMA_DIR  # the committed, packaged artifacts
+finding_schema = json_schemas()["Finding"]
+```
+The Pydantic models are authoritative; the schemas are generated from them.
+Regenerate after a model change with `uv run python scripts/gen_schemas.py`
+(a test fails if a committed artifact drifts). The `schema_version` field
+versions the models — bump it for a backward-incompatible change, then regenerate.
+- Documentation: <https://docs.sectum.ai>
+- Source: <https://github.com/sectum-ai/sectum-ai>
+Apache-2.0.

sectum_ai_spec-0.1.1/README.md ADDED Viewed

@@ -0,0 +1,43 @@
+# sectum-ai-spec
+Shared data models and JSON Schema for [Sectum AI](https://github.com/sectum-ai/sectum-ai),
+the multi-tenant AI verification toolkit.
+This distribution holds the Pydantic v2 models and exported JSON Schema that
+every other Sectum package builds on — `Scenario`, `Marker`,
+`GroundTruthManifest`, `ProbeStep`, `Observation`, `Finding`, `RunResult`, and
+`EvidencePack` — plus the typed error hierarchy (`SectumError` and friends).
+It is the lowest layer in the package graph and depends on nothing else in the
+family.
+```sh
+pip install sectum-ai-spec
+```
+Most users install the umbrella package [`sectum-ai`](https://pypi.org/project/sectum-ai/)
+instead, which pulls this in automatically.
+## JSON Schema
+Every model is also published as a standalone JSON Schema document under
+`sectum_ai/spec/schemas/<Model>.schema.json` (shipped in the wheel). Each carries a
+`$schema` dialect (draft 2020-12) and a version-pinned `$id`
+(`https://schemas.sectum.ai/<schema_version>/<Model>.schema.json`), so external
+tooling can validate Sectum artifacts without importing Python:
+```python
+from sectum_ai.spec import json_schemas
+from sectum_ai.spec.schema import SCHEMA_DIR  # the committed, packaged artifacts
+finding_schema = json_schemas()["Finding"]
+```
+The Pydantic models are authoritative; the schemas are generated from them.
+Regenerate after a model change with `uv run python scripts/gen_schemas.py`
+(a test fails if a committed artifact drifts). The `schema_version` field
+versions the models — bump it for a backward-incompatible change, then regenerate.
+- Documentation: <https://docs.sectum.ai>
+- Source: <https://github.com/sectum-ai/sectum-ai>
+Apache-2.0.

sectum_ai_spec-0.1.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "sectum-ai-spec"
+version = "0.1.1"
+description = "Sectum AI - Pydantic data models and JSON Schema for the verification spec."
+readme = "README.md"
+requires-python = ">=3.12"
+license = "Apache-2.0"
+authors = [{ name = "Sectum AI" }]
+keywords = ["ai-security", "multi-tenant", "schema", "verification"]
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Security",
+]
+dependencies = [
+    "pydantic>=2.9",
+    "structlog>=24.4",
+]
+[project.urls]
+Homepage = "https://sectum.ai"
+Repository = "https://github.com/sectum-ai/sectum-ai"
+[tool.hatch.build.targets.wheel]
+only-include = ["src/sectum_ai"]
+sources = ["src"]

sectum_ai_spec-0.1.1/src/sectum_ai/spec/__init__.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Sectum AI data models and JSON Schema (the ``sectum_ai.spec`` namespace package)."""
+from sectum_ai.spec._logging import configure_logging, get_logger, redact_sensitive
+from sectum_ai.spec.enums import (
+    AccessOutcome,
+    CoverageVerdict,
+    FindingStatus,
+    MarkerType,
+    PrincipalKind,
+    Severity,
+    Surface,
+)
+from sectum_ai.spec.errors import (
+    AdapterError,
+    ConfigError,
+    DetectionError,
+    ErasureUnsupported,
+    EvidenceError,
+    SectumError,
+)
+from sectum_ai.spec.hashing import canonical_hash, sha256_hex, to_canonical_json
+from sectum_ai.spec.models import (
+    SCHEMA_VERSION,
+    ControlMapping,
+    CorpusDocument,
+    EvidencePack,
+    Finding,
+    GroundTruthManifest,
+    Marker,
+    Observation,
+    PlantedLocation,
+    Principal,
+    ProbeStep,
+    RunMetrics,
+    RunResult,
+    Scenario,
+    SectumModel,
+    SharedEntity,
+    Substrate,
+    SyntheticTenantSpec,
+    SyntheticUserSpec,
+)
+from sectum_ai.spec.schema import json_schemas, write_json_schemas
+from sectum_ai.spec.stats import normal_quantile, wilson_interval
+__all__ = [
+    "SCHEMA_VERSION",
+    "AccessOutcome",
+    "AdapterError",
+    "ConfigError",
+    "ControlMapping",
+    "CorpusDocument",
+    "CoverageVerdict",
+    "DetectionError",
+    "ErasureUnsupported",
+    "EvidenceError",
+    "EvidencePack",
+    "Finding",
+    "FindingStatus",
+    "GroundTruthManifest",
+    "Marker",
+    "MarkerType",
+    "Observation",
+    "PlantedLocation",
+    "Principal",
+    "PrincipalKind",
+    "ProbeStep",
+    "RunMetrics",
+    "RunResult",
+    "Scenario",
+    "SectumError",
+    "SectumModel",
+    "Severity",
+    "SharedEntity",
+    "Substrate",
+    "Surface",
+    "SyntheticTenantSpec",
+    "SyntheticUserSpec",
+    "canonical_hash",
+    "configure_logging",
+    "get_logger",
+    "json_schemas",
+    "normal_quantile",
+    "redact_sensitive",
+    "sha256_hex",
+    "to_canonical_json",
+    "wilson_interval",
+    "write_json_schemas",
+]

sectum_ai_spec-0.1.1/src/sectum_ai/spec/_logging.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""Structured logging for Sectum AI (the engineering spec, section 16).
+Sectum is a security product, so its logs must never leak secrets or raw tenant
+content. This module configures :mod:`structlog` so that:
+- logs render as JSON (machine-readable, SIEM-friendly) to **stderr** — stdout is
+  reserved for a command's own output (for example ``probe --output json``);
+- ``DEBUG`` is suppressed by default (section 16: "DEBUG must be off by default");
+- a redaction processor drops secret-bearing keys and raw tenant content from
+  every event emitted above ``DEBUG``.
+Libraries obtain a logger with :func:`get_logger` and never configure logging
+themselves; the application entry point (the CLI) calls :func:`configure_logging`
+once at start-up. The test suite configures it through a fixture so output is
+deterministic and stdout stays clean.
+"""
+from __future__ import annotations
+import logging
+import re
+import sys
+from typing import cast
+import structlog
+from structlog.types import EventDict, FilteringBoundLogger, Processor, WrappedLogger
+# Keys whose values are secrets or canary plaintext: never emitted above DEBUG.
+# A key matches only when its lower-cased name is exactly one of these entries
+# (``_redact_value`` tests ``key.lower() in _SENSITIVE_KEYS``), so a compound
+# name such as ``access_token`` is NOT covered by the ``token`` entry.
+# NOTE(review): confirm whether compound key names should be listed as well.
+_SECRET_KEYS = frozenset(
+    {
+        "secret",
+        "token",
+        "password",
+        "passphrase",
+        "api_key",
+        "apikey",
+        "authorization",
+        "credential",
+        "credentials",
+        "plaintext",
+        "canary",
+        "marker_plaintext",
+    }
+)
+# Keys carrying raw tenant content: never emitted above DEBUG (section 16).
+_TENANT_CONTENT_KEYS = frozenset(
+    {"content", "raw_response", "answer", "query", "prompt", "text", "evidence_span"}
+)
+# Union of both sets: the single lookup table consulted by the key-name pass.
+_SENSITIVE_KEYS = _SECRET_KEYS | _TENANT_CONTENT_KEYS
+# Placeholder written in place of a sensitive value or a secret-shaped token.
+_REDACTED = "<redacted>"
+# Canary/secret value shapes, scrubbed wherever they appear (nested under a benign
+# key, inside the event message, or in an exception's text) so a future careless
+# call site cannot leak one even without using a sensitive key name. The shapes
+# mirror ``sectum_ai.substrate.markers``: the HARD canary's branded prefix and the
+# three secret-canary credential formats. Entity canaries are ordinary-looking
+# text, so they are covered by the key-name pass (``plaintext``/``canary``/…), not
+# value-scrubbed, to avoid false positives on legitimate prose. The ``sk-``/``AKIA``
+# shapes are anchored at a non-alphanumeric boundary so they match a standalone
+# token, not a substring of a benign identifier (``task-``/``disk-`` + a long id,
+# ``…AKIA…`` mid-word); an underscore still counts as a boundary so ``api_sk-…``
+# is caught.
+_SECRET_VALUE_RE = re.compile(
+    r"SECTUM-CANARY-[A-Z2-7]+"  # HARD canary (branded high-entropy token)
+    r"|(?<![A-Za-z0-9])sk-[A-Za-z0-9]{20,}"  # OpenAI-style API key
+    r"|(?<![A-Za-z0-9])AKIA[A-Z0-9]{16}"  # AWS access-key id
+    r"|\b9\d{2}-\d{2}-\d{4}\b"  # non-issuable 9xx US SSN shape
+)
+def _scrub_text(text: str) -> str:
+    """Replace any embedded canary/secret-shaped token with the redaction marker."""
+    return _SECRET_VALUE_RE.sub(_REDACTED, text)
+def _redact_value(value: object) -> object:
+    """Recursively redact sensitive keys and scrub secret-shaped strings.
+    Walks nested mappings/sequences so a sensitive key or a secret-shaped value is
+    caught at any depth, not just at the top level. A sensitive key is dropped
+    wholesale; strings (including the event message) and the text of exception
+    values are scrubbed for embedded secret shapes. Other scalars pass through.
+    """
+    if isinstance(value, dict):
+        return {
+            key: (
+                _REDACTED
+                if isinstance(key, str) and key.lower() in _SENSITIVE_KEYS
+                else _redact_value(val)
+            )
+            for key, val in value.items()
+        }
+    if isinstance(value, list):
+        return [_redact_value(item) for item in value]
+    if isinstance(value, tuple):
+        return tuple(_redact_value(item) for item in value)
+    if isinstance(value, str):
+        return _scrub_text(value)
+    if isinstance(value, BaseException):
+        return _scrub_text(f"{type(value).__name__}: {value}")
+    return value
+def redact_sensitive(_logger: WrappedLogger, _method: str, event_dict: EventDict) -> EventDict:
+    """Drop secret-bearing keys and raw tenant content from non-DEBUG events.
+    DEBUG is opt-in and off by default, so verbose local troubleshooting may see
+    raw values; everything at INFO and above is redacted (the engineering spec,
+    section 16: "never log secrets or raw tenant content above DEBUG"). Requires
+    ``add_log_level`` to run first so ``level`` is present in the event dict.
+    Redaction is recursive and value-aware (defense in depth): a sensitive key is
+    dropped at any nesting depth, and a canary/secret-shaped token is scrubbed
+    wherever it appears — nested under a benign key, embedded in the event
+    message, or carried in an exception's text — so a careless call site cannot
+    leak one. Today every production call site logs only operational metadata
+    (IDs, counts, model names, digests); this is the backstop for the future.
+    """
+    if event_dict.get("level") == "debug":
+        return event_dict
+    return cast(EventDict, _redact_value(event_dict))
+def configure_logging(*, debug: bool = False, json_output: bool = True) -> None:
+    """Configure process-wide structured logging. Call once, from the entry point.
+    ``debug`` enables DEBUG-level events (off by default, per section 16);
+    ``json_output`` selects the JSON renderer (the default) over a plain console
+    renderer. Logs are always written to stderr so stdout stays reserved for a
+    command's own output.
+    """
+    renderer: Processor = (
+        structlog.processors.JSONRenderer()
+        if json_output
+        else structlog.dev.ConsoleRenderer(colors=False)
+    )
+    structlog.configure(
+        processors=[
+            structlog.contextvars.merge_contextvars,
+            structlog.processors.add_log_level,
+            structlog.processors.TimeStamper(fmt="iso", utc=True),
+            redact_sensitive,
+            renderer,
+        ],
+        wrapper_class=structlog.make_filtering_bound_logger(
+            logging.DEBUG if debug else logging.INFO
+        ),
+        logger_factory=structlog.PrintLoggerFactory(file=sys.stderr),
+        cache_logger_on_first_use=True,
+    )
+def get_logger(name: str | None = None) -> FilteringBoundLogger:
+    """Return a structured logger; the binding name is by convention ``__name__``."""
+    # structlog.get_logger is typed -> Any (it returns a lazy proxy); the bound
+    # logger is a FilteringBoundLogger once configure_logging has run.
+    return cast(FilteringBoundLogger, structlog.get_logger(name))

sectum_ai_spec-0.1.1/src/sectum_ai/spec/enums.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""Enumerations shared across the Sectum AI data models."""
+from enum import StrEnum
+class MarkerType(StrEnum):
+    """The three canary marker types (the engineering spec, section 6.3).
+    Each member's string value is identical to its upper-case member name.
+    """
+    HARD_CANARY = "HARD_CANARY"
+    ENTITY_CANARY = "ENTITY_CANARY"
+    SECRET_CANARY = "SECRET_CANARY"
+class Severity(StrEnum):
+    """Finding severity levels.
+    Values are lower-case strings. Members are ``str`` instances, so ``<``/``>``
+    between two members is a lexicographic string comparison, not a severity
+    ranking.
+    """
+    CRITICAL = "critical"
+    HIGH = "high"
+    MEDIUM = "medium"
+    LOW = "low"
+    INFO = "info"
+class FindingStatus(StrEnum):
+    """Whether a finding is manifest-confirmed or merely a candidate."""
+    # The finding is manifest-confirmed.
+    CONFIRMED = "confirmed"
+    # The finding is only a candidate.
+    UNVERIFIED = "unverified"
+class AccessOutcome(StrEnum):
+    """How an authorization-boundary fetch resolved (the engineering spec, Class 1).
+    A direct cross-tenant fetch should be *denied*. The spec calls out the
+    ambiguity a competitor's scanner misses: a backend that returns ``200`` with
+    an empty body looks like a deny but never enforced one. ``RETURNED`` is the
+    object actually surfacing (a leak if the object is foreign); ``EMPTY`` is the
+    ambiguous empty result; ``DENIED`` is an explicit authorization refusal.
+    """
+    # The object surfaced in the response.
+    RETURNED = "returned"
+    # Empty result: ambiguous, not proof that a deny was enforced.
+    EMPTY = "empty"
+    # Explicit authorization refusal.
+    DENIED = "denied"
+class PrincipalKind(StrEnum):
+    """The kind of isolation boundary a principal represents.
+    Sectum verifies that one principal's data does not reach another. A tenant
+    is the top-level principal; a user is a sub-principal within a tenant. The
+    substrate, detection, and surfaces are identical at either granularity -
+    only the boundary being verified differs (ADR-0006).
+    """
+    # Top-level principal.
+    TENANT = "tenant"
+    # Sub-principal within a tenant.
+    USER = "user"
+class Surface(StrEnum):
+    """A place tenant data can live or leak (the engineering spec, section 23).
+    Each value is the lower-case form of its member name.
+    """
+    API = "api"
+    VECTOR_DB = "vector_db"
+    RAG_PIPELINE = "rag_pipeline"
+    PROMPT_LOGS = "prompt_logs"
+    SEMANTIC_CACHE = "semantic_cache"
+    KV_CACHE = "kv_cache"
+    AGENT_MEMORY = "agent_memory"
+    AGENT_FRAMEWORK = "agent_framework"
+    MCP = "mcp"
+    MODEL_ADAPTER = "model_adapter"
+    EVAL_SET = "eval_set"
+    BACKUP = "backup"
+    SEARCH_INDEX = "search_index"
+    TRACING = "tracing"
+class CoverageVerdict(StrEnum):
+    """The per-surface coverage verdict in a Class 11 erasure attestation.
+    A coverage block (surface -> verdict) makes the attestation honest about
+    *what it actually verified*: an attestation must never imply more coverage
+    than it has. The anti-over-claim guarantee is that a surface which was not
+    scanned can only ever be :data:`NOT_COVERED` - never :data:`ERASED` (the
+    engineering spec, section 7, Class 11).
+    The first three values are the tri-state every surface resolves to;
+    :data:`ATTESTABLE_WITH_CAVEAT` is the fourth, DPO-facing distinction the
+    spec calls out at Class 11 hiding place #8: the surface *was* scanned and a
+    baseline existed, but the backend exposes no programmatic per-tenant erasure
+    API, so the data is presumed retained until it ages out of the backend's
+    retention window. It is a documented backend limitation, never a clean pass
+    and never conflated with a residual-after-erasure *failure*.
+    Each member's string value is identical to its upper-case member name.
+    """
+    ERASED = "ERASED"
+    """Covered and clean: a baseline existed and no marker survived erasure."""
+    RESIDUAL = "RESIDUAL"
+    """Covered and failed: the backend was asked to erase and a marker survived."""
+    ATTESTABLE_WITH_CAVEAT = "ATTESTABLE_WITH_CAVEAT"
+    """Covered, but the backend exposes no per-tenant erasure API (hiding place #8)."""
+    NOT_COVERED = "NOT_COVERED"
+    """Out of scope, not scanned, or no pre-erasure baseline - never ``ERASED``."""

sectum_ai_spec-0.1.1/src/sectum_ai/spec/errors.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""The Sectum AI typed exception hierarchy (the engineering spec, section 16).
+Every error Sectum AI raises for a domain or runtime condition derives from
+``SectumError``, so one ``except`` clause can catch the whole family. The
+hierarchy lives in ``sectum-ai-spec`` - the lowest package in the acyclic
+package graph (ADR-0004) - so every other package raises these without a cycle.
+"""
+class SectumError(Exception):
+    """Base class for every error Sectum AI raises for a domain condition.
+    Root of the hierarchy defined in this module: ``ConfigError``,
+    ``AdapterError`` (and its subclass ``ErasureUnsupported``),
+    ``EvidenceError`` and ``DetectionError`` all derive from it, so
+    ``except SectumError`` catches any of them.
+    """
+class ConfigError(SectumError):
+    """A scenario or configuration value is missing or invalid.
+    Direct subclass of ``SectumError``; it adds no attributes or behaviour.
+    """
+class AdapterError(SectumError):
+    """An adapter is missing, misconfigured, or failed at runtime.
+    Also the parent of ``ErasureUnsupported``, so ``except AdapterError``
+    catches that more specific error too.
+    """
+class ErasureUnsupported(AdapterError):
+    """A backend exposes no programmatic per-tenant erasure API.
+    Raised from an adapter's ``delete`` when the backend cannot purge a
+    tenant's data through its API (erasure is governed by a retention policy
+    or a manual console action instead). The Class 11 erasure probe catches
+    this and itemises the surface as *attestable-with-caveat* (the engineering
+    spec, section 7, Class 11, hiding place #8) rather than reporting a false
+    erasure success - the data is presumed retained until proven otherwise.
+    Because this derives from ``AdapterError``, a handler for ``AdapterError``
+    also catches it; list ``except ErasureUnsupported`` first when the two must
+    be told apart.
+    """
+class EvidenceError(SectumError):
+    """Building or verifying a tamper-evident evidence pack failed.
+    Direct subclass of ``SectumError``; it adds no attributes or behaviour.
+    """
+class DetectionError(SectumError):
+    """The leak-detection pipeline could not produce a result.
+    Direct subclass of ``SectumError``; it adds no attributes or behaviour.
+    """

sectum_ai_spec-0.1.1/src/sectum_ai/spec/hashing.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Canonical serialization and hashing for Sectum AI models.
+Hashes are computed over a canonical JSON form (sorted keys, no insignificant
+whitespace) so the same logical content always yields the same digest. This is
+the foundation of the reproducibility contract (the engineering spec, section 6.5) and the
+evidence chain (the engineering spec, section 8).
+Finite floats need no rounding: ``json.dumps`` emits CPython's shortest
+round-tripping ``repr``, which is deterministic, so the same float value
+canonicalizes identically across machines and Python versions. Canonicalization
+is by ``repr``, which distinguishes ``-0.0`` from ``0.0`` (the one case where two
+IEEE-754-equal values serialize differently); no metric carries a signed zero
+whose sign is meaningful, so this is harmless. Rounding would only risk colliding
+genuinely distinct metrics, so it is deliberately not done (ADR-0021). Non-finite
+floats (NaN/Infinity) have no valid, injective JSON form and are refused below.
+"""
+import hashlib
+import json
+from typing import Any
+from pydantic import BaseModel
+def to_canonical_json(obj: BaseModel | dict[str, Any] | list[Any]) -> bytes:
+    """Serialize an object to canonical JSON bytes: sorted keys, UTF-8, compact."""
+    data: Any = obj.model_dump(mode="json") if isinstance(obj, BaseModel) else obj
+    try:
+        text = json.dumps(
+            data,
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+            allow_nan=False,
+        )
+    except ValueError as error:
+        # A non-finite float (NaN/Infinity) would serialize as a JavaScript
+        # literal that is not valid JSON (RFC 8259): a third-party verifier
+        # using a strict parser could not reproduce the digest, and every NaN
+        # collapses to the same token (a non-injective canonical form). Refuse
+        # it so the canonical form stays valid and injective.
+        raise ValueError(f"cannot canonicalize a non-finite float: {error}") from error
+    except TypeError as error:
+        # A raw dict/list (a BaseModel is normalized first via
+        # model_dump(mode="json")) can carry a value json cannot serialize - a
+        # UUID, datetime, bytes, or a non-str key. json raises a bare TypeError;
+        # surface it as a clear, typed canonicalization failure so a caller sees
+        # why the digest could not be computed instead of an opaque traceback.
+        raise TypeError(f"cannot canonicalize a non-JSON-native value: {error}") from error
+    return text.encode("utf-8")
+def sha256_hex(data: bytes) -> str:
+    """Return the hex-encoded SHA-256 digest of ``data``."""
+    return hashlib.sha256(data).hexdigest()
+def canonical_hash(obj: BaseModel | dict[str, Any] | list[Any]) -> str:
+    """Return the SHA-256 hex digest of an object's canonical JSON form."""
+    return sha256_hex(to_canonical_json(obj))