modelstat-sdk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
modelstat/redact.py ADDED
@@ -0,0 +1,150 @@
1
+ """The privacy floor: deterministic, dependency-light redaction that runs
2
+ **in-process before any bytes leave the SDK**.
3
+
4
+ This is a Python port of the daemon's ``SECRET_FLOOR``
5
+ (``packages/core/src/redact-floor.ts``) plus the email / absolute-path PII
6
+ rules, and a faithful peer of the Rust SDK's ``redact.rs``. It is the
7
+ irreducible baseline -- even in "raw" remote mode the floor still scrubs live
8
+ credentials; "raw" means *full turns*, not *leaked keys*.
9
+
10
+ Placeholder style is **square brackets** (``[REDACTED:name]``), matching the
11
+ Rust SDK.
12
+
13
+ Parity note: unlike Rust's ``regex`` crate, Python's :mod:`re` supports
14
+ look-around, so the boundary-sensitive 40-char AWS-secret blob is expressed with
15
+ the original ``(?<!...)`` / ``(?!...)`` look-arounds rather than Rust's explicit
16
+ boundary-capture workaround. The behavior is identical; the unit tests assert
17
+ each credential family is caught.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from dataclasses import dataclass
24
+ from typing import List, Pattern, Tuple
25
+
26
+ __all__ = ["Redacted", "redact"]
27
+
28
+
29
@dataclass
class Redacted:
    """Result of a redaction pass: the scrubbed text plus per-class counts."""

    # The input text with every floor match replaced by a placeholder.
    text: str
    # Count of secret-format matches replaced.
    secrets: int = 0
    # Count of PII matches replaced (emails, absolute paths).
    pii: int = 0


# Ordered specific -> generic. Specific provider keys run before the generic
# env-secret / blob catchers so a known key is labelled precisely. Each entry is
# a ``(compiled_pattern, replacement)`` pair; replacements that keep a captured
# group use the ``\g<1>`` back-reference form.
_FLOOR: List[Tuple[Pattern[str], str]] = [
    (re.compile(r"sk-ant-[A-Za-z0-9_-]{20,}"), "[REDACTED:anthropic_key]"),
    (re.compile(r"sk-(?:proj-)?[A-Za-z0-9_-]{20,}"), "[REDACTED:openai_key]"),
    (re.compile(r"AIza[0-9A-Za-z_-]{35}"), "[REDACTED:google_api_key]"),
    (re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b"), "[REDACTED:aws_access_key]"),
    (re.compile(r"ghp_[A-Za-z0-9]{36,}"), "[REDACTED:github_pat]"),
    (re.compile(r"gho_[A-Za-z0-9]{36,}"), "[REDACTED:github_oauth]"),
    (re.compile(r"gh[sur]_[A-Za-z0-9]{36,}"), "[REDACTED:github_app]"),
    (re.compile(r"xox[aboprs]-[A-Za-z0-9-]{10,}"), "[REDACTED:slack_token]"),
    (
        re.compile(r"(?:sk|pk|rk)_live_[A-Za-z0-9]{24,}"),
        "[REDACTED:stripe_live_key]",
    ),
    (
        re.compile(r"(?:sk|pk|rk)_test_[A-Za-z0-9]{24,}"),
        "[REDACTED:stripe_test_key]",
    ),
    (
        re.compile(r"[MN][A-Za-z\d]{23}\.[\w-]{6}\.[\w-]{27}"),
        "[REDACTED:discord_token]",
    ),
    (
        re.compile(
            r"eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}"
        ),
        "[REDACTED:jwt]",
    ),
    (
        re.compile(
            r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----"
            r"[\s\S]*?"
            r"-----END (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----"
        ),
        "[REDACTED:private_key]",
    ),
    (
        re.compile(r"ds_live_[A-Za-z0-9_-]{32,}"),
        "[REDACTED:modelstat_device_secret]",
    ),
    # Generic env-style KEY=VALUE where KEY names a secret. Keeps the name.
    #
    # Two deliberate details (NOTE: both go beyond the original pattern, so
    # the peer implementations need the same change to stay in lock-step):
    #   * The name prefix before the keyword is optional, so a bare
    #     ``PASSWORD=...`` / ``TOKEN=...`` / ``SECRET=...`` is caught too, not
    #     only ``DB_PASSWORD=...``.
    #   * The negative look-ahead skips a value that already starts with a
    #     ``[REDACTED:label]`` placeholder written by an earlier, more
    #     specific rule. Without it ``API_KEY=sk-ant-...`` would be relabelled
    #     ``env_secret`` and counted twice.
    (
        re.compile(
            r"\b((?:[A-Z][A-Z0-9_]*)?(?:TOKEN|KEY|SECRET|PASSWORD|PASSWD|API)"
            r"[A-Z0-9_]*)\s*[:=]\s*['\"]?(?!\[REDACTED:[a-z_]+\])"
            r"([^\s'\"]{12,})['\"]?"
        ),
        r"\g<1>=[REDACTED:env_secret]",
    ),
    (
        re.compile(r"Bearer\s+[A-Za-z0-9._~+/-]{20,}=*"),
        "Bearer [REDACTED:bearer]",
    ),
    # Connection URLs with an inline password. ``postgresql`` (the canonical
    # libpq scheme) and the TLS variants ``rediss`` / ``amqps`` are accepted
    # alongside the short names so their passwords do not slip through.
    (
        re.compile(
            r"(postgres(?:ql)?|mysql|mongodb|rediss?|amqps?)(?:\+[a-z]+)?://"
            r"[^:\s]+:([^@\s]+)@",
            re.IGNORECASE,
        ),
        r"\g<1>://<user>:[REDACTED:db_password]@",
    ),
    # Most generic, LAST among secrets: the 40-char base64-ish blob (e.g. a lone
    # AWS secret access key). Look-arounds leave an embedded blob inside a longer
    # token alone -- the direct Python equivalent of the TS source.
    (
        re.compile(r"(?<![A-Za-z0-9/+=])[A-Za-z0-9/+=]{40}(?![A-Za-z0-9/+=])"),
        "[REDACTED:aws_secret_key]",
    ),
]

# PII patterns, applied after the secret floor.
_EMAIL: Pattern[str] = re.compile(
    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}"
)
# Absolute home paths on macOS / Linux / Windows -- they leak usernames and
# machine layout.
_ABS_PATH: Pattern[str] = re.compile(
    r"(?:/Users/|/home/)[^\s\"'`)]+|[A-Za-z]:\\Users\\[^\s\"'`)]+"
)


def redact(input_text: str) -> Redacted:
    """Redact ``input_text`` against the floor.

    The secret rules in ``_FLOOR`` run first, in list order, followed by the
    email rule and then the absolute-path rule. Every rule is applied to the
    output of the previous one.

    Args:
        input_text: The text to scrub.

    Returns:
        A :class:`Redacted` holding the cleaned text, the number of secret
        replacements (``secrets``) and the number of PII replacements
        (``pii``). Each count is the number of substitutions actually made by
        the rules of that class.
    """
    text = input_text

    # ``subn`` replaces and counts in a single scan of the text, instead of a
    # ``findall`` pass followed by a separate ``sub`` pass per rule.
    secrets = 0
    for pattern, replacement in _FLOOR:
        text, hits = pattern.subn(replacement, text)
        secrets += hits

    pii = 0
    text, hits = _EMAIL.subn("[REDACTED:email]", text)
    pii += hits
    text, hits = _ABS_PATH.subn("[REDACTED:path]", text)
    pii += hits

    return Redacted(text=text, secrets=secrets, pii=pii)
modelstat/transport.py ADDED
@@ -0,0 +1,97 @@
1
+ """How a built batch leaves the worker.
2
+
3
+ The :class:`Transport` protocol lets tests run the whole pipeline in-process
4
+ (via :class:`FakeTransport`) and lets the daemon / server paths share one
5
+ worker. The real transport uses stdlib :mod:`urllib.request` so the runtime
6
+ dependency footprint stays at a single package (``blake3``) -- no HTTP client
7
+ dependency. Sending blocks, which is fine: it only ever runs on the background
8
+ worker thread, never the caller's hot path.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import urllib.error
15
+ import urllib.request
16
+ from threading import Lock
17
+ from typing import Any, Dict, List
18
+
19
+ from .config import Config
20
+ from .wire import IngestBatch
21
+
22
+ __all__ = ["TransportError", "Transport", "FakeTransport", "HttpTransport"]
23
+
24
+
25
class TransportError(Exception):
    """Raised when a batch could not be delivered.

    The worker retries once, then drops the batch (in local-daemon mode the
    daemon owns durable retry).

    Args:
        message: Human-readable description, used as the exception message.
        status: HTTP status code when the failure came from a response;
            ``None`` otherwise.
    """

    def __init__(self, message: str, status: int | None = None) -> None:
        # Keep the status on the instance so callers can branch on it.
        self.status = status
        super().__init__(message)
32
+
33
+
34
class Transport:
    """Base interface for shipping a built batch to its destination.

    The contract is duck-typed: anything exposing ``send(batch_dict)`` that
    returns ``None`` on success and raises :class:`TransportError` on failure
    can act as a transport.
    """

    def send(self, batch: Dict[str, Any]) -> None:  # pragma: no cover - interface
        """Deliver ``batch``; concrete transports must override this."""
        raise NotImplementedError
44
+
45
+
46
class FakeTransport(Transport):
    """Test double that keeps every batch it receives in memory."""

    def __init__(self) -> None:
        # The lock guards ``_batches`` for both ``send`` and ``batches``.
        self._lock = Lock()
        self._batches: List[Dict[str, Any]] = []

    def send(self, batch: Dict[str, Any]) -> None:
        """Record ``batch``; never fails."""
        with self._lock:
            self._batches.append(batch)

    def batches(self) -> List[Dict[str, Any]]:
        """Snapshot of every batch sent so far (as serialized wire dicts)."""
        with self._lock:
            # Shallow copy: the caller gets its own list object.
            return self._batches[:]
61
+
62
+
63
class HttpTransport(Transport):
    """The real HTTP transport: ``POST <endpoint>`` with a bearer ingest key.

    Args:
        endpoint: URL the batch is posted to.
        bearer: Ingest key sent in the ``Authorization: Bearer`` header.
        timeout: Timeout in seconds passed to ``urlopen`` for the request.
    """

    def __init__(self, endpoint: str, bearer: str, timeout: float = 10.0) -> None:
        self._endpoint = endpoint
        self._bearer = bearer
        self._timeout = timeout

    @classmethod
    def from_config(cls, cfg: Config) -> "HttpTransport":
        """Build a transport from ``cfg`` (its mode's endpoint and ingest key)."""
        return cls(endpoint=cfg.mode.endpoint(), bearer=cfg.ingest_key)

    def send(self, batch: Dict[str, Any]) -> None:
        """POST ``batch`` as JSON.

        Returns ``None`` on a 2xx response.

        Raises:
            TransportError: on a non-2xx status (``status`` is set), or on any
                connection-level or HTTP-protocol-level failure (``status`` is
                ``None``).
        """
        # Local import: only this method needs the protocol-error base class.
        import http.client

        body = json.dumps(batch).encode("utf-8")
        req = urllib.request.Request(
            self._endpoint,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {self._bearer}",
                "Content-Type": "application/json",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=self._timeout) as resp:
                status = resp.status
        except urllib.error.HTTPError as e:
            # A non-2xx response surfaces here; preserve the status code.
            # (HTTPError is a URLError subclass, so it must be caught first.)
            raise TransportError(f"http status {e.code}", status=e.code) from e
        except urllib.error.URLError as e:
            raise TransportError(f"transport: {e.reason}") from e
        except (OSError, http.client.HTTPException) as e:
            # OSError: connection refused, timeout, DNS, ...
            # HTTPException: malformed responses (e.g. BadStatusLine). These
            # are not OSError subclasses, so without this clause they would
            # escape unwrapped and bypass the worker's TransportError handling.
            raise TransportError(f"transport: {e}") from e

        # Defensive: reject any non-2xx status that urlopen returned normally.
        if not (200 <= status < 300):
            raise TransportError(f"http status {status}", status=status)
modelstat/wire.py ADDED
@@ -0,0 +1,344 @@
1
+ """The ingest wire contract, as a **self-contained** set of dataclasses.
2
+
3
+ This package is Apache-2.0 and must not depend on the (BSL-licensed) server
4
+ ``modelstat-core``, so the shapes that cross ``POST /v1/ingest`` are re-declared
5
+ here. They mirror ``modelstat-core``'s ``RawEvent`` / ``ToolCallWire`` /
6
+ ``IngestBatch`` field-for-field; the golden-vector tests pin the deterministic
7
+ id derivation to the server's algorithm so the two can never silently drift.
8
+ Ids ride the wire as plain strings (the server deserializes them into its typed
9
+ newtypes).
10
+
11
+ PRIVACY INVARIANT (mirrors the server contract): tool-call records carry only
12
+ hashes, byte sizes, and allowlisted command verbs -- never raw args, results,
13
+ paths, or command text.
14
+
15
+ Serialization rules (must match the server EXACTLY):
16
+
17
+ * JSON keys are ``snake_case`` -- no renames.
18
+ * The producing client's version ships as ``daemon_version`` (NOT
19
+ ``client_version``); the AI-tool label ships as ``agent`` (NOT ``tool``).
20
+ * Optional keys are *omitted* when absent -- we never emit an explicit ``null``,
21
+ because the wire contract is additive and a stray ``null`` is not the same as
22
+ an absent key.
23
+ * A missing or misnamed REQUIRED field is an HTTP 400 that rejects the whole
24
+ batch, so every required field below is always present in the emitted dict.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from dataclasses import dataclass, field
30
+ from datetime import datetime, timezone
31
+ from enum import Enum
32
+ from typing import Any, Dict, List, Optional
33
+
34
+ import blake3
35
+
36
+ __all__ = [
37
+ "TokenUsage",
38
+ "EventKind",
39
+ "BillingMode",
40
+ "ToolCallStatus",
41
+ "GitContext",
42
+ "RawEvent",
43
+ "ToolCallWire",
44
+ "IngestBatch",
45
+ "content_hash",
46
+ "source_event_id",
47
+ "batch_id",
48
+ "format_rfc3339",
49
+ ]
50
+
51
+
52
+ # ---- RFC3339 timestamp formatting ------------------------------------------
53
+
54
+
55
def format_rfc3339(dt: datetime) -> str:
    """Render ``dt`` as an RFC3339 UTC timestamp with millisecond precision.

    Example output: ``"2026-06-19T00:00:00.000Z"``. A naive datetime is taken
    to already be in UTC; an aware datetime is converted to UTC first.
    Sub-millisecond digits are truncated, not rounded.
    """
    aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
    utc = aware.astimezone(timezone.utc)
    seconds_part = utc.strftime("%Y-%m-%dT%H:%M:%S")
    # Integer division drops the microsecond remainder.
    return "{}.{:03d}Z".format(seconds_part, utc.microsecond // 1000)
68
+
69
+
70
+ # ---- token usage ------------------------------------------------------------
71
+
72
+
73
@dataclass
class TokenUsage:
    """Token counts for the five fixed token classes; each defaults to zero.

    ``to_dict`` always emits all five keys -- ``{input, output,
    cache_creation, cache_read, reasoning}`` -- even when every count is zero.
    """

    input: int = 0
    output: int = 0
    cache_creation: int = 0
    cache_read: int = 0
    reasoning: int = 0

    def total(self) -> int:
        """Return the sum of all five counts."""
        counts = (
            self.input,
            self.output,
            self.cache_creation,
            self.cache_read,
            self.reasoning,
        )
        return sum(counts)

    def to_dict(self) -> Dict[str, int]:
        """Return the wire dict, keys in declaration order."""
        names = ("input", "output", "cache_creation", "cache_read", "reasoning")
        return {name: getattr(self, name) for name in names}
106
+
107
+
108
+ # ---- enums (serialize to snake_case wire strings) ---------------------------
109
+
110
+
111
class EventKind(str, Enum):
    """Structural kind of a source event; each value is its wire string."""

    USER_MESSAGE = "user_message"
    ASSISTANT_MESSAGE = "assistant_message"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
    SUMMARY = "summary"
119
+
120
+
121
class BillingMode(str, Enum):
    """How the provider billed the call; each value is its wire string."""

    SUBSCRIPTION = "subscription"
    API = "api"
126
+
127
+
128
class ToolCallStatus(str, Enum):
    """Outcome of a tool invocation; each value is its wire string."""

    SUCCESS = "success"
    ERROR = "error"
    DENIED = "denied"
    TIMEOUT = "timeout"
    UNKNOWN = "unknown"
136
+
137
+
138
+ # ---- git context ------------------------------------------------------------
139
+
140
+
141
@dataclass
class GitContext:
    """Git context captured at the moment of the call (all optional)."""

    remote_slug: Optional[str] = None
    host: Optional[str] = None
    branch: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the wire dict, leaving out every field that is ``None``."""
        candidates = (
            ("remote_slug", self.remote_slug),
            ("host", self.host),
            ("branch", self.branch),
        )
        return {key: value for key, value in candidates if value is not None}
158
+
159
+
160
+ # ---- wire records -----------------------------------------------------------
161
+
162
+
163
@dataclass
class RawEvent:
    """One LLM call as it crosses the ingest boundary.

    Small and numeric, with at most a short redacted excerpt of text. The wire
    key for the AI-tool label is ``agent`` (never ``tool``).
    """

    source_event_id: str
    ts: datetime
    kind: EventKind
    # The **agent** -- which AI tool/integration produced the call (e.g.
    # ``raw_sdk_openai``), not the provider. (The wire key is ``agent``.)
    agent: str
    provider: str
    session_id: str
    tokens: TokenUsage = field(default_factory=TokenUsage)
    model: Optional[str] = None
    cwd: Optional[str] = None
    git: Optional[GitContext] = None
    duration_ms: Optional[int] = None
    billing: Optional[BillingMode] = None
    # Redacted excerpt used to build summaries downstream. Capped at 320 chars
    # in the standard (floor-redacted) path; carries the full redacted turns in
    # remote-raw mode, where the server summarizes.
    content_excerpt: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the wire dict: required keys always, optional keys if set."""
        payload: Dict[str, Any] = dict(
            source_event_id=self.source_event_id,
            ts=format_rfc3339(self.ts),
            kind=self.kind.value,
            agent=self.agent,
            provider=self.provider,
            session_id=self.session_id,
            tokens=self.tokens.to_dict(),
        )
        # Optional keys, in emission order, already in wire form. An absent
        # value is skipped rather than emitted as null.
        optional = (
            ("model", self.model),
            ("cwd", self.cwd),
            ("git", None if self.git is None else self.git.to_dict()),
            ("duration_ms", self.duration_ms),
            ("billing", None if self.billing is None else self.billing.value),
            ("content_excerpt", self.content_excerpt),
        )
        for key, value in optional:
            if value is not None:
                payload[key] = value
        return payload
214
+
215
+
216
@dataclass
class ToolCallWire:
    """One tool invocation, privacy-reduced. Hashes and sizes only."""

    external_call_id: str
    session_id: str
    source_event_id: str
    # The **agent** (AI tool) that ran the call -- same space as RawEvent.agent.
    agent: str
    # ``builtin`` or ``mcp:<server>``.
    server: str
    # Bare tool name (``Bash``, ``create_pr``).
    name: str
    call_index: int
    started_at: datetime
    status: ToolCallStatus
    # Hex sha256 of the serialized input; ``""`` when the call had no input.
    args_hash: str
    # Sha256 of the sorted top-level arg key names joined by ``,``; the literal
    # ``none`` when the input is not an object.
    signature_hash: str
    args_bytes: int
    result_bytes: int
    segment_id: Optional[str] = None
    turn_index: Optional[int] = None
    ended_at: Optional[datetime] = None
    model: Optional[str] = None
    command_families: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the wire dict: required keys always, optional keys if set."""
        wire: Dict[str, Any] = dict(
            external_call_id=self.external_call_id,
            session_id=self.session_id,
            source_event_id=self.source_event_id,
            agent=self.agent,
            server=self.server,
            name=self.name,
            call_index=self.call_index,
            started_at=format_rfc3339(self.started_at),
            status=self.status.value,
            args_hash=self.args_hash,
            signature_hash=self.signature_hash,
            args_bytes=self.args_bytes,
            result_bytes=self.result_bytes,
        )
        # ``segment_id`` and ``turn_index`` are intentionally never emitted by
        # the SDK (segmentation is produced downstream), but they are honored
        # if set for forward-compatibility.
        extras = (
            ("segment_id", self.segment_id),
            ("turn_index", self.turn_index),
            (
                "ended_at",
                None if self.ended_at is None else format_rfc3339(self.ended_at),
            ),
            ("model", self.model),
        )
        for key, value in extras:
            if value is not None:
                wire[key] = value
        # Omit ``command_families`` when empty; the server caps it at 3.
        if self.command_families:
            wire["command_families"] = self.command_families
        return wire
276
+
277
+
278
@dataclass
class IngestBatch:
    """The full ingest payload.

    The SDK only ever emits ``events`` (+ ``tool_calls``); segmentation,
    summarization, titles, and session-installs are produced downstream by the
    daemon or server.
    """

    batch_id: str
    device_id: str
    # This SDK build's version string (<=40 chars). Ships as the wire
    # ``daemon_version`` field -- the server's name for the producing client's
    # version; an SDK is just another producer of the ingest contract.
    daemon_version: str
    events: List[RawEvent] = field(default_factory=list)
    tool_calls: List[ToolCallWire] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the wire dict; ``events`` is always present, even if empty."""
        payload: Dict[str, Any] = dict(
            batch_id=self.batch_id,
            device_id=self.device_id,
            daemon_version=self.daemon_version,
            events=[event.to_dict() for event in self.events],
        )
        # Omit ``tool_calls`` entirely when empty (do NOT send an empty list).
        if not self.tool_calls:
            return payload
        payload["tool_calls"] = [call.to_dict() for call in self.tool_calls]
        return payload
307
+
308
+
309
+ # ---- deterministic ids (mirror modelstat-core::ids) -------------------------
310
+
311
# The ASCII unit separator joined between consecutive parts (never before the
# first or after the last). This exact framing is what makes ``["ab", ""]``
# differ from ``["a", "b"]``.
_UNIT_SEPARATOR = b"\x1f"


def content_hash(parts: List[str]) -> str:
    """Return the truncated blake3 hex digest of ``parts``.

    Each part is encoded as UTF-8 and the encoded parts are joined with a
    single ``0x1F`` byte between neighbours (none leading, none trailing).
    The joined bytes are hashed with blake3 and the first 32 characters of
    the hex digest are returned. This matches the server's ``content_hash``,
    so client- and server-derived ids agree.
    """
    encoded_parts = [part.encode("utf-8") for part in parts]
    framed = _UNIT_SEPARATOR.join(encoded_parts)
    hex_digest = blake3.blake3(framed).hexdigest()
    return hex_digest[:32]
328
+
329
+
330
def source_event_id(device_id: str, source_ref: str) -> str:
    """Return the per-source-event dedupe key ``evt_<content_hash>``.

    The hash is taken over ``[device_id, source_ref]``. ``source_ref`` must be
    stable for the same logical call across retries.
    """
    digest = content_hash([device_id, source_ref])
    return f"evt_{digest}"
336
+
337
+
338
def batch_id(source_event_ids: List[str]) -> str:
    """Return the deterministic batch id ``batch_<content_hash>``.

    The ids are sorted before hashing, so the result does not depend on their
    order. A resend of the same events reuses the id and the server's
    manifest dedupes it.
    """
    ordered_ids = sorted(source_event_ids)
    return f"batch_{content_hash(ordered_ids)}"