PyPI - modelstat-sdk - Versions diffs - 0.0.1__py3-none-any.whl - Mend

modelstat-sdk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

modelstat/__init__.py +94 -0
modelstat/_version.py +8 -0
modelstat/capture.py +264 -0
modelstat/client.py +72 -0
modelstat/config.py +135 -0
modelstat/py.typed +0 -0
modelstat/redact.py +150 -0
modelstat/transport.py +97 -0
modelstat/wire.py +344 -0
modelstat/worker.py +183 -0
modelstat_sdk-0.0.1.dist-info/METADATA +158 -0
modelstat_sdk-0.0.1.dist-info/RECORD +13 -0
modelstat_sdk-0.0.1.dist-info/WHEEL +4 -0

modelstat/redact.py ADDED Viewed

@@ -0,0 +1,150 @@
+"""The privacy floor: deterministic, dependency-light redaction that runs
+**in-process before any bytes leave the SDK**.
+This is a Python port of the daemon's ``SECRET_FLOOR``
+(``packages/core/src/redact-floor.ts``) plus the email / absolute-path PII
+rules, and a faithful peer of the Rust SDK's ``redact.rs``. It is the
+irreducible baseline -- even in "raw" remote mode the floor still scrubs live
+credentials; "raw" means *full turns*, not *leaked keys*.
+Placeholder style is **square brackets** (``[REDACTED:name]``), matching the
+Rust SDK.
+Parity note: unlike Rust's ``regex`` crate, Python's :mod:`re` supports
+look-around, so the boundary-sensitive 40-char AWS-secret blob is expressed with
+the original ``(?<!...)`` / ``(?!...)`` look-arounds rather than Rust's explicit
+boundary-capture workaround. The behavior is identical; the unit tests assert
+each credential family is caught.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import List, Pattern, Tuple
+__all__ = ["Redacted", "redact"]
+@dataclass
+class Redacted:
+    """Result of a redaction pass."""
+    text: str
+    # Count of secret-format matches replaced.
+    secrets: int = 0
+    # Count of PII matches replaced (emails, absolute paths).
+    pii: int = 0
+# Ordered specific -> generic. Specific provider keys run before the generic
+# env-secret / blob catchers so a known key is labelled precisely. Each entry is
+# a ``(compiled_pattern, replacement)`` pair; replacements that keep a captured
+# group use the ``\g<1>`` back-reference form.
+_FLOOR: List[Tuple[Pattern[str], str]] = [
+    (re.compile(r"sk-ant-[A-Za-z0-9_-]{20,}"), "[REDACTED:anthropic_key]"),
+    (re.compile(r"sk-(?:proj-)?[A-Za-z0-9_-]{20,}"), "[REDACTED:openai_key]"),
+    (re.compile(r"AIza[0-9A-Za-z_-]{35}"), "[REDACTED:google_api_key]"),
+    (re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"), "[REDACTED:aws_access_key]"),
+    (re.compile(r"ghp_[A-Za-z0-9]{36,}"), "[REDACTED:github_pat]"),
+    (re.compile(r"gho_[A-Za-z0-9]{36,}"), "[REDACTED:github_oauth]"),
+    (re.compile(r"gh[sur]_[A-Za-z0-9]{36,}"), "[REDACTED:github_app]"),
+    (re.compile(r"xox[aboprs]-[A-Za-z0-9-]{10,}"), "[REDACTED:slack_token]"),
+    (
+        re.compile(r"(?:sk|pk|rk)_live_[A-Za-z0-9]{24,}"),
+        "[REDACTED:stripe_live_key]",
+    ),
+    (
+        re.compile(r"(?:sk|pk|rk)_test_[A-Za-z0-9]{24,}"),
+        "[REDACTED:stripe_test_key]",
+    ),
+    (
+        re.compile(r"[MN][A-Za-z\d]{23}\.[\w-]{6}\.[\w-]{27}"),
+        "[REDACTED:discord_token]",
+    ),
+    (
+        re.compile(
+            r"eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}"
+        ),
+        "[REDACTED:jwt]",
+    ),
+    (
+        re.compile(
+            r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----"
+            r"[\s\S]*?"
+            r"-----END (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----"
+        ),
+        "[REDACTED:private_key]",
+    ),
+    (
+        re.compile(r"ds_live_[A-Za-z0-9_-]{32,}"),
+        "[REDACTED:modelstat_device_secret]",
+    ),
+    # Generic env-style KEY=VALUE where KEY names a secret. Keeps the name.
+    (
+        re.compile(
+            r"\b([A-Z][A-Z0-9_]*(?:TOKEN|KEY|SECRET|PASSWORD|PASSWD|API)"
+            r"[A-Z0-9_]*)\s*[:=]\s*['\"]?([^\s'\"]{12,})['\"]?"
+        ),
+        r"\g<1>=[REDACTED:env_secret]",
+    ),
+    (
+        re.compile(r"Bearer\s+[A-Za-z0-9._~+/-]{20,}=*"),
+        "Bearer [REDACTED:bearer]",
+    ),
+    (
+        re.compile(
+            r"(postgres|mysql|mongodb|redis|amqp)(?:\+[a-z]+)?://"
+            r"[^:\s]+:([^@\s]+)@",
+            re.IGNORECASE,
+        ),
+        r"\g<1>://<user>:[REDACTED:db_password]@",
+    ),
+    # Most generic, LAST among secrets: the 40-char base64-ish blob (e.g. a lone
+    # AWS secret access key). Look-arounds leave an embedded blob inside a longer
+    # token alone -- the direct Python equivalent of the TS source.
+    (
+        re.compile(r"(?<![A-Za-z0-9/+=])[A-Za-z0-9/+=]{40}(?![A-Za-z0-9/+=])"),
+        "[REDACTED:aws_secret_key]",
+    ),
+]
+# PII patterns, applied after the secret floor.
+# ``_EMAIL`` matches ``local@domain.tld`` where the TLD is at least two letters.
+_EMAIL: Pattern[str] = re.compile(
+    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}"
+)
+# Absolute home paths on macOS / Linux / Windows -- they leak usernames and
+# machine layout. A match starts at ``/Users/``, ``/home/`` or ``X:\Users\`` and
+# runs up to the next whitespace, quote, backtick or ``)``.
+_ABS_PATH: Pattern[str] = re.compile(
+    r"(?:/Users/|/home/)[^\s\"'`)]+|[A-Za-z]:\\Users\\[^\s\"'`)]+"
+)
+def redact(input_text: str) -> Redacted:
+    """Redact ``input_text`` against the floor.
+    Returns the cleaned text and per-class counts. Each class counts its matches
+    *before* replacing (mirroring the Rust reference), so the counts reflect the
+    number of distinct secrets/PII scrubbed at each stage.
+    """
+    text = input_text
+    secrets = 0
+    pii = 0
+    for pattern, replacement in _FLOOR:
+        matches = len(pattern.findall(text))
+        if matches:
+            text = pattern.sub(replacement, text)
+            secrets += matches
+    matches = len(_EMAIL.findall(text))
+    if matches:
+        text = _EMAIL.sub("[REDACTED:email]", text)
+        pii += matches
+    matches = len(_ABS_PATH.findall(text))
+    if matches:
+        text = _ABS_PATH.sub("[REDACTED:path]", text)
+        pii += matches
+    return Redacted(text=text, secrets=secrets, pii=pii)

modelstat/transport.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""How a built batch leaves the worker.
+The :class:`Transport` protocol lets tests run the whole pipeline in-process
+(via :class:`FakeTransport`) and lets the daemon / server paths share one
+worker. The real transport uses stdlib :mod:`urllib.request` so the runtime
+dependency footprint stays at a single package (``blake3``) -- no HTTP client
+dependency. Sending blocks, which is fine: it only ever runs on the background
+worker thread, never the caller's hot path.
+"""
+from __future__ import annotations
+import json
+import urllib.error
+import urllib.request
+from threading import Lock
+from typing import Any, Dict, List
+from .config import Config
+from .wire import IngestBatch
+__all__ = ["TransportError", "Transport", "FakeTransport", "HttpTransport"]
+class TransportError(Exception):
+    """A transport failure. The worker retries once, then drops the batch (in
+    local-daemon mode the daemon owns durable retry)."""
+    def __init__(self, message: str, status: int | None = None) -> None:
+        super().__init__(message)
+        self.status = status
+class Transport:
+    """Ships a built batch to its destination.
+    A minimal interface (duck-typed): any object with a ``send(batch_dict)``
+    method that returns ``None`` on success and raises :class:`TransportError`
+    on failure works as a transport.
+    """
+    def send(self, batch: Dict[str, Any]) -> None:  # pragma: no cover - interface
+        raise NotImplementedError
+class FakeTransport(Transport):
+    """In-memory transport for tests: records every batch it is handed."""
+    def __init__(self) -> None:
+        self._batches: List[Dict[str, Any]] = []
+        self._lock = Lock()
+    def send(self, batch: Dict[str, Any]) -> None:
+        with self._lock:
+            self._batches.append(batch)
+    def batches(self) -> List[Dict[str, Any]]:
+        """Snapshot of every batch sent so far (as serialized wire dicts)."""
+        with self._lock:
+            return list(self._batches)
+class HttpTransport(Transport):
+    """The real HTTP transport: ``POST <endpoint>`` with a bearer ingest key."""
+    def __init__(self, endpoint: str, bearer: str, timeout: float = 10.0) -> None:
+        self._endpoint = endpoint
+        self._bearer = bearer
+        self._timeout = timeout
+    @classmethod
+    def from_config(cls, cfg: Config) -> "HttpTransport":
+        return cls(endpoint=cfg.mode.endpoint(), bearer=cfg.ingest_key)
+    def send(self, batch: Dict[str, Any]) -> None:
+        body = json.dumps(batch).encode("utf-8")
+        req = urllib.request.Request(
+            self._endpoint,
+            data=body,
+            method="POST",
+            headers={
+                "Authorization": f"Bearer {self._bearer}",
+                "Content-Type": "application/json",
+            },
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=self._timeout) as resp:
+                status = resp.status
+                if not (200 <= status < 300):
+                    raise TransportError(f"http status {status}", status=status)
+        except urllib.error.HTTPError as e:
+            # A non-2xx response surfaces here; preserve the status code.
+            raise TransportError(f"http status {e.code}", status=e.code) from e
+        except urllib.error.URLError as e:
+            raise TransportError(f"transport: {e.reason}") from e
+        except OSError as e:  # connection refused, timeout, DNS, ...
+            raise TransportError(f"transport: {e}") from e

modelstat/wire.py ADDED Viewed

@@ -0,0 +1,344 @@
+"""The ingest wire contract, as a **self-contained** set of dataclasses.
+This package is Apache-2.0 and must not depend on the (BSL-licensed) server
+``modelstat-core``, so the shapes that cross ``POST /v1/ingest`` are re-declared
+here. They mirror ``modelstat-core``'s ``RawEvent`` / ``ToolCallWire`` /
+``IngestBatch`` field-for-field; the golden-vector tests pin the deterministic
+id derivation to the server's algorithm so the two can never silently drift.
+Ids ride the wire as plain strings (the server deserializes them into its typed
+newtypes).
+PRIVACY INVARIANT (mirrors the server contract): tool-call records carry only
+hashes, byte sizes, and allowlisted command verbs -- never raw args, results,
+paths, or command text.
+Serialization rules (must match the server EXACTLY):
+* JSON keys are ``snake_case`` -- no renames.
+* The producing client's version ships as ``daemon_version`` (NOT
+  ``client_version``); the AI-tool label ships as ``agent`` (NOT ``tool``).
+* Optional keys are *omitted* when absent -- we never emit an explicit ``null``,
+  because the wire contract is additive and a stray ``null`` is not the same as
+  an absent key.
+* A missing or misnamed REQUIRED field is an HTTP 400 that rejects the whole
+  batch, so every required field below is always present in the emitted dict.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Dict, List, Optional
+import blake3
+__all__ = [
+    "TokenUsage",
+    "EventKind",
+    "BillingMode",
+    "ToolCallStatus",
+    "GitContext",
+    "RawEvent",
+    "ToolCallWire",
+    "IngestBatch",
+    "content_hash",
+    "source_event_id",
+    "batch_id",
+    "format_rfc3339",
+]
+# ---- RFC3339 timestamp formatting ------------------------------------------
+def format_rfc3339(dt: datetime) -> str:
+    """Format ``dt`` as an RFC3339 UTC string with millisecond precision.
+    Produces e.g. ``"2026-06-19T00:00:00.000Z"`` -- the exact shape the server
+    expects. Naive datetimes are assumed to be UTC; aware datetimes are
+    converted to UTC. Millisecond (not microsecond) precision matches the
+    ``source_ref`` derivation, which uses ``timestamp_millis``.
+    """
+    if dt.tzinfo is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+    dt = dt.astimezone(timezone.utc)
+    millis = dt.microsecond // 1000
+    return f"{dt.strftime('%Y-%m-%dT%H:%M:%S')}.{millis:03d}Z"
+# ---- token usage ------------------------------------------------------------
+@dataclass
+class TokenUsage:
+    """The five token classes (a fixed taxonomy). Counts default to zero.
+    All five keys are always emitted (the server expects the object), so this
+    serializes to ``{input, output, cache_creation, cache_read, reasoning}``
+    even when every count is zero.
+    """
+    input: int = 0
+    output: int = 0
+    cache_creation: int = 0
+    cache_read: int = 0
+    reasoning: int = 0
+    def total(self) -> int:
+        """Sum across all five classes."""
+        return (
+            self.input
+            + self.output
+            + self.cache_creation
+            + self.cache_read
+            + self.reasoning
+        )
+    def to_dict(self) -> Dict[str, int]:
+        return {
+            "input": self.input,
+            "output": self.output,
+            "cache_creation": self.cache_creation,
+            "cache_read": self.cache_read,
+            "reasoning": self.reasoning,
+        }
+# ---- enums (serialize to snake_case wire strings) ---------------------------
+class EventKind(str, Enum):
+    """The structural kind of a source event.
+    A ``str``-mixin enum: each member's value is the ``snake_case`` string that
+    ``RawEvent.to_dict`` emits under the ``kind`` key.
+    """
+    USER_MESSAGE = "user_message"
+    ASSISTANT_MESSAGE = "assistant_message"
+    TOOL_CALL = "tool_call"
+    TOOL_RESULT = "tool_result"
+    SUMMARY = "summary"
+class BillingMode(str, Enum):
+    """How the provider billed the call.
+    A ``str``-mixin enum: the member value is the string that
+    ``RawEvent.to_dict`` emits under the optional ``billing`` key.
+    """
+    SUBSCRIPTION = "subscription"
+    API = "api"
+class ToolCallStatus(str, Enum):
+    """Outcome of a tool invocation.
+    A ``str``-mixin enum: the member value is the string that
+    ``ToolCallWire.to_dict`` emits under the ``status`` key.
+    """
+    SUCCESS = "success"
+    ERROR = "error"
+    DENIED = "denied"
+    TIMEOUT = "timeout"
+    UNKNOWN = "unknown"
+# ---- git context ------------------------------------------------------------
+@dataclass
+class GitContext:
+    """Git context captured at the moment of the call (all optional)."""
+    remote_slug: Optional[str] = None
+    host: Optional[str] = None
+    branch: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {}
+        if self.remote_slug is not None:
+            out["remote_slug"] = self.remote_slug
+        if self.host is not None:
+            out["host"] = self.host
+        if self.branch is not None:
+            out["branch"] = self.branch
+        return out
+# ---- wire records -----------------------------------------------------------
+@dataclass
+class RawEvent:
+    """One LLM call as it crosses the ingest boundary.
+    Small and numeric, with at most a short redacted excerpt of text. The wire
+    key for the AI-tool label is ``agent`` (never ``tool``).
+    """
+    source_event_id: str
+    ts: datetime
+    kind: EventKind
+    # The **agent** -- which AI tool/integration produced the call (e.g.
+    # ``raw_sdk_openai``), not the provider. (The wire key is ``agent``.)
+    agent: str
+    provider: str
+    session_id: str
+    tokens: TokenUsage = field(default_factory=TokenUsage)
+    model: Optional[str] = None
+    cwd: Optional[str] = None
+    git: Optional[GitContext] = None
+    duration_ms: Optional[int] = None
+    billing: Optional[BillingMode] = None
+    # Redacted excerpt used to build summaries downstream. Capped at 320 chars
+    # in the standard (floor-redacted) path; carries the full redacted turns in
+    # remote-raw mode, where the server summarizes.
+    content_excerpt: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {
+            "source_event_id": self.source_event_id,
+            "ts": format_rfc3339(self.ts),
+            "kind": self.kind.value,
+            "agent": self.agent,
+            "provider": self.provider,
+            "session_id": self.session_id,
+            "tokens": self.tokens.to_dict(),
+        }
+        # Optional keys -- omit when absent (never emit null).
+        if self.model is not None:
+            out["model"] = self.model
+        if self.cwd is not None:
+            out["cwd"] = self.cwd
+        if self.git is not None:
+            out["git"] = self.git.to_dict()
+        if self.duration_ms is not None:
+            out["duration_ms"] = self.duration_ms
+        if self.billing is not None:
+            out["billing"] = self.billing.value
+        if self.content_excerpt is not None:
+            out["content_excerpt"] = self.content_excerpt
+        return out
+@dataclass
+class ToolCallWire:
+    """One tool invocation, privacy-reduced. Hashes and sizes only."""
+    external_call_id: str
+    session_id: str
+    source_event_id: str
+    # The **agent** (AI tool) that ran the call -- same space as RawEvent.agent.
+    agent: str
+    # ``builtin`` or ``mcp:<server>``.
+    server: str
+    # Bare tool name (``Bash``, ``create_pr``).
+    name: str
+    call_index: int
+    started_at: datetime
+    status: ToolCallStatus
+    # Hex sha256 of the serialized input; ``""`` when the call had no input.
+    args_hash: str
+    # Sha256 of the sorted top-level arg key names joined by ``,``; the literal
+    # ``none`` when the input is not an object.
+    signature_hash: str
+    args_bytes: int
+    result_bytes: int
+    segment_id: Optional[str] = None
+    turn_index: Optional[int] = None
+    ended_at: Optional[datetime] = None
+    model: Optional[str] = None
+    command_families: List[str] = field(default_factory=list)
+    def to_dict(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {
+            "external_call_id": self.external_call_id,
+            "session_id": self.session_id,
+            "source_event_id": self.source_event_id,
+            "agent": self.agent,
+            "server": self.server,
+            "name": self.name,
+            "call_index": self.call_index,
+            "started_at": format_rfc3339(self.started_at),
+            "status": self.status.value,
+            "args_hash": self.args_hash,
+            "signature_hash": self.signature_hash,
+            "args_bytes": self.args_bytes,
+            "result_bytes": self.result_bytes,
+        }
+        # ``segment_id`` and ``turn_index`` are intentionally never emitted by
+        # the SDK (segmentation is produced downstream), but we honor them if
+        # set for forward-compatibility.
+        if self.segment_id is not None:
+            out["segment_id"] = self.segment_id
+        if self.turn_index is not None:
+            out["turn_index"] = self.turn_index
+        if self.ended_at is not None:
+            out["ended_at"] = format_rfc3339(self.ended_at)
+        if self.model is not None:
+            out["model"] = self.model
+        # Omit ``command_families`` when empty; the server caps it at 3.
+        if self.command_families:
+            out["command_families"] = self.command_families
+        return out
+@dataclass
+class IngestBatch:
+    """The full ingest payload.
+    The SDK only ever emits ``events`` (+ ``tool_calls``); segmentation,
+    summarization, titles, and session-installs are produced downstream by the
+    daemon or server.
+    """
+    batch_id: str
+    device_id: str
+    # This SDK build's version string (<=40 chars). Ships as the wire
+    # ``daemon_version`` field -- the server's name for the producing client's
+    # version; an SDK is just another producer of the ingest contract.
+    daemon_version: str
+    events: List[RawEvent] = field(default_factory=list)
+    tool_calls: List[ToolCallWire] = field(default_factory=list)
+    def to_dict(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {
+            "batch_id": self.batch_id,
+            "device_id": self.device_id,
+            "daemon_version": self.daemon_version,
+            "events": [e.to_dict() for e in self.events],
+        }
+        # Omit ``tool_calls`` entirely when empty (do NOT send an empty list).
+        if self.tool_calls:
+            out["tool_calls"] = [t.to_dict() for t in self.tool_calls]
+        return out
+# ---- deterministic ids (mirror modelstat-core::ids) -------------------------
+# The ASCII unit separator joined between consecutive parts (never before the
+# first or after the last). This exact framing is what makes ``["ab", ""]``
+# differ from ``["a", "b"]``.
+_UNIT_SEPARATOR = b"\x1f"
+def content_hash(parts: List[str]) -> str:
+    """blake3 content hash of ``parts``.
+    The parts' UTF-8 bytes are joined by a single ``0x1F`` byte between
+    consecutive parts (NOT before the first / after the last), then hashed with
+    blake3 and rendered as lowercase hex truncated to the first 32 characters.
+    Identical to the server's ``content_hash`` so client- and server-derived ids
+    agree.
+    """
+    joined = _UNIT_SEPARATOR.join(p.encode("utf-8") for p in parts)
+    return blake3.blake3(joined).hexdigest()[:32]
+def source_event_id(device_id: str, source_ref: str) -> str:
+    """Stable per-source-event dedupe key: ``evt_<content_hash(device, ref)>``.
+    ``source_ref`` must be stable for the same logical call across retries.
+    """
+    return "evt_" + content_hash([device_id, source_ref])
+def batch_id(source_event_ids: List[str]) -> str:
+    """Deterministic batch id over the (sorted) source-event ids it carries.
+    A resend of the same events reuses the id and the server's manifest dedupes
+    it.
+    """
+    return "batch_" + content_hash(sorted(source_event_ids))