cap-sdk-python 2.5.2__tar.gz → 2.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/PKG-INFO +110 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/README.md +107 -0
- cap_sdk_python-2.5.4/cap/__init__.py +145 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/bus.py +13 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/client.py +9 -0
- cap_sdk_python-2.5.4/cap/errors.py +116 -0
- cap_sdk_python-2.5.4/cap/heartbeat.py +153 -0
- cap_sdk_python-2.5.4/cap/metrics.py +33 -0
- cap_sdk_python-2.5.4/cap/middleware.py +50 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/alert_pb2_grpc.py +1 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/buspacket_pb2_grpc.py +1 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/handshake_pb2_grpc.py +1 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/heartbeat_pb2_grpc.py +1 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/job_pb2_grpc.py +1 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/safety_pb2_grpc.py +1 -1
- cap_sdk_python-2.5.4/cap/progress.py +100 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/runtime.py +193 -81
- cap_sdk_python-2.5.4/cap/testing.py +125 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/worker.py +51 -7
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap_sdk_python.egg-info/PKG-INFO +110 -1
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap_sdk_python.egg-info/SOURCES.txt +12 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap_sdk_python.egg-info/requires.txt +3 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/pyproject.toml +4 -1
- cap_sdk_python-2.5.4/tests/test_errors.py +46 -0
- cap_sdk_python-2.5.4/tests/test_heartbeat.py +311 -0
- cap_sdk_python-2.5.4/tests/test_metrics.py +117 -0
- cap_sdk_python-2.5.4/tests/test_middleware.py +162 -0
- cap_sdk_python-2.5.4/tests/test_progress.py +260 -0
- cap_sdk_python-2.5.4/tests/test_testing.py +45 -0
- cap_sdk_python-2.5.2/cap/__init__.py +0 -70
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/__init__.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/__init__.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/__init__.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/__init__.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/alert_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/buspacket_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/handshake_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/heartbeat_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/job_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/pb/cordum/agent/v1/safety_pb2.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/subjects.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap/validate.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap_sdk_python.egg-info/dependency_links.txt +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/cap_sdk_python.egg-info/top_level.txt +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/setup.cfg +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/tests/test_conformance.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/tests/test_runtime.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/tests/test_sdk.py +0 -0
- {cap_sdk_python-2.5.2 → cap_sdk_python-2.5.4}/tests/test_validate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cap-sdk-python
|
|
3
|
-
Version: 2.5.2
|
|
3
|
+
Version: 2.5.4
|
|
4
4
|
Summary: CAP (Cordum Agent Protocol) Python SDK
|
|
5
5
|
Author-email: Cordum <eng@cordum.io>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -16,6 +16,8 @@ Requires-Dist: nats-py>=2.6.0
|
|
|
16
16
|
Requires-Dist: cryptography>=41.0.0
|
|
17
17
|
Requires-Dist: pydantic>=2.6.0
|
|
18
18
|
Requires-Dist: redis>=5.0.0
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pdoc>=14.0; extra == "dev"
|
|
19
21
|
|
|
20
22
|
# CAP Python SDK
|
|
21
23
|
|
|
@@ -97,6 +99,27 @@ Asyncio-first SDK with NATS helpers for CAP workers and clients.
|
|
|
97
99
|
|
|
98
100
|
Swap out `cap.bus` if you need a different transport.
|
|
99
101
|
|
|
102
|
+
## Testing
|
|
103
|
+
|
|
104
|
+
The `cap.testing` module lets you test handlers without running NATS or Redis.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from cap.testing import run_handler
|
|
108
|
+
from cap.pb.cordum.agent.v1 import job_pb2
|
|
109
|
+
|
|
110
|
+
async def test_echo():
|
|
111
|
+
result = await run_handler(
|
|
112
|
+
lambda ctx, data: {"echo": data["prompt"]},
|
|
113
|
+
{"prompt": "hello"},
|
|
114
|
+
topic="job.echo",
|
|
115
|
+
)
|
|
116
|
+
assert result.status == job_pb2.JOB_STATUS_SUCCEEDED
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
- `run_handler(handler, input, **options)` — runs a single handler invocation and returns the `JobResult`.
|
|
120
|
+
- `create_test_agent(**options)` — returns `(agent, mock_nats, store)` pre-wired with `MockNATS` + `InMemoryBlobStore`.
|
|
121
|
+
- `MockNATS` — in-memory NATS mock for custom test setups.
|
|
122
|
+
|
|
100
123
|
## Runtime (High-Level SDK)
|
|
101
124
|
The runtime hides NATS/Redis plumbing and gives you typed handlers.
|
|
102
125
|
|
|
@@ -120,6 +143,92 @@ async def summarize(ctx: Context, data: Input) -> Output:
|
|
|
120
143
|
asyncio.run(agent.run())
|
|
121
144
|
```
|
|
122
145
|
|
|
146
|
+
### Middleware
|
|
147
|
+
|
|
148
|
+
Add cross-cutting concerns (logging, auth, metrics) without modifying handlers:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from cap.middleware import logging_middleware
|
|
152
|
+
|
|
153
|
+
# Built-in logging middleware
|
|
154
|
+
agent.use(logging_middleware())
|
|
155
|
+
|
|
156
|
+
# Custom middleware
|
|
157
|
+
async def timing(ctx, data, next_fn):
|
|
158
|
+
import time
|
|
159
|
+
start = time.monotonic()
|
|
160
|
+
result = await next_fn(ctx, data)
|
|
161
|
+
elapsed = time.monotonic() - start
|
|
162
|
+
print(f"job {ctx.job_id} took {elapsed:.3f}s")
|
|
163
|
+
return result
|
|
164
|
+
|
|
165
|
+
agent.use(timing)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Middleware executes in registration order (FIFO). Each can inspect context,
|
|
169
|
+
measure timing, or short-circuit by returning without calling `next_fn`.
|
|
170
|
+
|
|
123
171
|
### Environment
|
|
124
172
|
- `NATS_URL` (default `nats://127.0.0.1:4222`)
|
|
125
173
|
- `REDIS_URL` (default `redis://127.0.0.1:6379/0`)
|
|
174
|
+
|
|
175
|
+
## Generating API Docs
|
|
176
|
+
|
|
177
|
+
Generate HTML API reference locally using [pdoc](https://pdoc.dev/):
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
pip install cap-sdk-python[dev]
|
|
181
|
+
pdoc ./cap --output-dir docs
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Output is written to `docs/` (gitignored). Open `docs/index.html` to browse.
|
|
185
|
+
|
|
186
|
+
## Observability
|
|
187
|
+
|
|
188
|
+
### Structured Logging
|
|
189
|
+
The runtime Agent and Worker use `logging.Logger` (stdlib) for structured logging. All log calls include contextual fields (`job_id`, `trace_id`, `topic`, `sender_id`). Pass a custom logger or leave as default:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
import logging
|
|
193
|
+
from cap.runtime import Agent
|
|
194
|
+
|
|
195
|
+
logger = logging.getLogger("my-agent")
|
|
196
|
+
logger.setLevel(logging.DEBUG)
|
|
197
|
+
agent = Agent(logger=logger)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### MetricsHook
|
|
201
|
+
Implement the `MetricsHook` protocol to integrate with Prometheus, OpenTelemetry, or any metrics system:
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from cap.metrics import MetricsHook
|
|
205
|
+
|
|
206
|
+
class MetricsHook(Protocol):
|
|
207
|
+
def on_job_received(self, job_id: str, topic: str) -> None: ...
|
|
208
|
+
def on_job_completed(self, job_id: str, duration_ms: int, status: str) -> None: ...
|
|
209
|
+
def on_job_failed(self, job_id: str, error_msg: str) -> None: ...
|
|
210
|
+
def on_heartbeat_sent(self, worker_id: str) -> None: ...
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
The default is `NoopMetrics` (zero overhead). Example Prometheus integration:
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from cap.runtime import Agent
|
|
217
|
+
|
|
218
|
+
class PromMetrics:
|
|
219
|
+
def on_job_received(self, job_id, topic):
|
|
220
|
+
jobs_received.labels(topic=topic).inc()
|
|
221
|
+
|
|
222
|
+
def on_job_completed(self, job_id, duration_ms, status):
|
|
223
|
+
job_duration.labels(status=status).observe(duration_ms)
|
|
224
|
+
|
|
225
|
+
def on_job_failed(self, job_id, error_msg):
|
|
226
|
+
jobs_failed.inc()
|
|
227
|
+
|
|
228
|
+
def on_heartbeat_sent(self, worker_id):
|
|
229
|
+
pass
|
|
230
|
+
|
|
231
|
+
agent = Agent(metrics=PromMetrics())
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
The `trace_id` is propagated through all log and metrics calls for distributed tracing correlation.
|
|
@@ -78,6 +78,27 @@ Asyncio-first SDK with NATS helpers for CAP workers and clients.
|
|
|
78
78
|
|
|
79
79
|
Swap out `cap.bus` if you need a different transport.
|
|
80
80
|
|
|
81
|
+
## Testing
|
|
82
|
+
|
|
83
|
+
The `cap.testing` module lets you test handlers without running NATS or Redis.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from cap.testing import run_handler
|
|
87
|
+
from cap.pb.cordum.agent.v1 import job_pb2
|
|
88
|
+
|
|
89
|
+
async def test_echo():
|
|
90
|
+
result = await run_handler(
|
|
91
|
+
lambda ctx, data: {"echo": data["prompt"]},
|
|
92
|
+
{"prompt": "hello"},
|
|
93
|
+
topic="job.echo",
|
|
94
|
+
)
|
|
95
|
+
assert result.status == job_pb2.JOB_STATUS_SUCCEEDED
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
- `run_handler(handler, input, **options)` — runs a single handler invocation and returns the `JobResult`.
|
|
99
|
+
- `create_test_agent(**options)` — returns `(agent, mock_nats, store)` pre-wired with `MockNATS` + `InMemoryBlobStore`.
|
|
100
|
+
- `MockNATS` — in-memory NATS mock for custom test setups.
|
|
101
|
+
|
|
81
102
|
## Runtime (High-Level SDK)
|
|
82
103
|
The runtime hides NATS/Redis plumbing and gives you typed handlers.
|
|
83
104
|
|
|
@@ -101,6 +122,92 @@ async def summarize(ctx: Context, data: Input) -> Output:
|
|
|
101
122
|
asyncio.run(agent.run())
|
|
102
123
|
```
|
|
103
124
|
|
|
125
|
+
### Middleware
|
|
126
|
+
|
|
127
|
+
Add cross-cutting concerns (logging, auth, metrics) without modifying handlers:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from cap.middleware import logging_middleware
|
|
131
|
+
|
|
132
|
+
# Built-in logging middleware
|
|
133
|
+
agent.use(logging_middleware())
|
|
134
|
+
|
|
135
|
+
# Custom middleware
|
|
136
|
+
async def timing(ctx, data, next_fn):
|
|
137
|
+
import time
|
|
138
|
+
start = time.monotonic()
|
|
139
|
+
result = await next_fn(ctx, data)
|
|
140
|
+
elapsed = time.monotonic() - start
|
|
141
|
+
print(f"job {ctx.job_id} took {elapsed:.3f}s")
|
|
142
|
+
return result
|
|
143
|
+
|
|
144
|
+
agent.use(timing)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Middleware executes in registration order (FIFO). Each can inspect context,
|
|
148
|
+
measure timing, or short-circuit by returning without calling `next_fn`.
|
|
149
|
+
|
|
104
150
|
### Environment
|
|
105
151
|
- `NATS_URL` (default `nats://127.0.0.1:4222`)
|
|
106
152
|
- `REDIS_URL` (default `redis://127.0.0.1:6379/0`)
|
|
153
|
+
|
|
154
|
+
## Generating API Docs
|
|
155
|
+
|
|
156
|
+
Generate HTML API reference locally using [pdoc](https://pdoc.dev/):
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
pip install cap-sdk-python[dev]
|
|
160
|
+
pdoc ./cap --output-dir docs
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Output is written to `docs/` (gitignored). Open `docs/index.html` to browse.
|
|
164
|
+
|
|
165
|
+
## Observability
|
|
166
|
+
|
|
167
|
+
### Structured Logging
|
|
168
|
+
The runtime Agent and Worker use `logging.Logger` (stdlib) for structured logging. All log calls include contextual fields (`job_id`, `trace_id`, `topic`, `sender_id`). Pass a custom logger or leave as default:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import logging
|
|
172
|
+
from cap.runtime import Agent
|
|
173
|
+
|
|
174
|
+
logger = logging.getLogger("my-agent")
|
|
175
|
+
logger.setLevel(logging.DEBUG)
|
|
176
|
+
agent = Agent(logger=logger)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### MetricsHook
|
|
180
|
+
Implement the `MetricsHook` protocol to integrate with Prometheus, OpenTelemetry, or any metrics system:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from cap.metrics import MetricsHook
|
|
184
|
+
|
|
185
|
+
class MetricsHook(Protocol):
|
|
186
|
+
def on_job_received(self, job_id: str, topic: str) -> None: ...
|
|
187
|
+
def on_job_completed(self, job_id: str, duration_ms: int, status: str) -> None: ...
|
|
188
|
+
def on_job_failed(self, job_id: str, error_msg: str) -> None: ...
|
|
189
|
+
def on_heartbeat_sent(self, worker_id: str) -> None: ...
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The default is `NoopMetrics` (zero overhead). Example Prometheus integration:
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from cap.runtime import Agent
|
|
196
|
+
|
|
197
|
+
class PromMetrics:
|
|
198
|
+
def on_job_received(self, job_id, topic):
|
|
199
|
+
jobs_received.labels(topic=topic).inc()
|
|
200
|
+
|
|
201
|
+
def on_job_completed(self, job_id, duration_ms, status):
|
|
202
|
+
job_duration.labels(status=status).observe(duration_ms)
|
|
203
|
+
|
|
204
|
+
def on_job_failed(self, job_id, error_msg):
|
|
205
|
+
jobs_failed.inc()
|
|
206
|
+
|
|
207
|
+
def on_heartbeat_sent(self, worker_id):
|
|
208
|
+
pass
|
|
209
|
+
|
|
210
|
+
agent = Agent(metrics=PromMetrics())
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
The `trace_id` is propagated through all log and metrics calls for distributed tracing correlation.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""CAP (Cordum Agent Protocol) SDK for Python.
|
|
2
|
+
|
|
3
|
+
Provides helpers for submitting jobs, running workers, and building
|
|
4
|
+
high-level agents on the CAP bus.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import types
|
|
9
|
+
|
|
10
|
+
try:
    from google.protobuf import runtime_version as _runtime_version  # noqa: F401
except Exception:
    # ``google.protobuf.runtime_version`` could not be imported. Install a
    # minimal stand-in exposing the two names defined below.
    # NOTE(review): presumably this keeps code that imports
    # ``runtime_version`` loadable on protobuf runtimes that lack it — confirm.
    try:
        import google.protobuf as _protobuf
    except Exception:
        # protobuf itself is unavailable; only ``sys.modules`` is patched.
        _protobuf = None

    class _Domain:
        PUBLIC = 0

    def _validate(*_args, **_kwargs):
        # Accept any arguments and perform no check.
        return None

    _shim = types.SimpleNamespace(
        Domain=_Domain,
        ValidateProtobufRuntimeVersion=_validate,
    )
    sys.modules["google.protobuf.runtime_version"] = _shim
    if _protobuf is not None:
        # Also make ``google.protobuf.runtime_version`` resolve by attribute.
        _protobuf.runtime_version = _shim
|
|
31
|
+
|
|
32
|
+
from .client import submit_job
|
|
33
|
+
from .worker import run_worker
|
|
34
|
+
from .bus import connect_nats
|
|
35
|
+
from .runtime import Agent, Context, BlobStore, RedisBlobStore, InMemoryBlobStore
|
|
36
|
+
from .middleware import Middleware, NextFn, logging_middleware
|
|
37
|
+
from .metrics import MetricsHook, NoopMetrics
|
|
38
|
+
from .heartbeat import (
|
|
39
|
+
heartbeat_payload,
|
|
40
|
+
heartbeat_payload_with_memory,
|
|
41
|
+
heartbeat_payload_with_progress,
|
|
42
|
+
emit_heartbeat,
|
|
43
|
+
heartbeat_loop,
|
|
44
|
+
)
|
|
45
|
+
from .progress import (
|
|
46
|
+
progress_payload,
|
|
47
|
+
cancel_payload,
|
|
48
|
+
emit_progress,
|
|
49
|
+
emit_cancel,
|
|
50
|
+
)
|
|
51
|
+
from .validate import (
|
|
52
|
+
ValidationError,
|
|
53
|
+
validate_job_request,
|
|
54
|
+
validate_job_result,
|
|
55
|
+
validate_bus_packet,
|
|
56
|
+
)
|
|
57
|
+
from .errors import (
|
|
58
|
+
CAPError,
|
|
59
|
+
VersionMismatchError,
|
|
60
|
+
MalformedPacketError,
|
|
61
|
+
UnknownPayloadError,
|
|
62
|
+
SignatureInvalidError,
|
|
63
|
+
SignatureMissingError,
|
|
64
|
+
JobTimeoutError,
|
|
65
|
+
ResourceExhaustedError,
|
|
66
|
+
PermissionDeniedError,
|
|
67
|
+
InvalidInputError,
|
|
68
|
+
JobNotFoundError,
|
|
69
|
+
DuplicateJobError,
|
|
70
|
+
WorkerUnavailableError,
|
|
71
|
+
SafetyDeniedError,
|
|
72
|
+
PolicyViolationError,
|
|
73
|
+
RiskTagBlockedError,
|
|
74
|
+
PublishFailedError,
|
|
75
|
+
SubscribeFailedError,
|
|
76
|
+
ConnectionLostError,
|
|
77
|
+
)
|
|
78
|
+
from .subjects import (
|
|
79
|
+
SUBJECT_SUBMIT,
|
|
80
|
+
SUBJECT_RESULT,
|
|
81
|
+
SUBJECT_HEARTBEAT,
|
|
82
|
+
SUBJECT_ALERT,
|
|
83
|
+
SUBJECT_PROGRESS,
|
|
84
|
+
SUBJECT_CANCEL,
|
|
85
|
+
SUBJECT_DLQ,
|
|
86
|
+
SUBJECT_WORKFLOW_EVENT,
|
|
87
|
+
SUBJECT_HANDSHAKE,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
__all__ = [
|
|
91
|
+
"submit_job",
|
|
92
|
+
"run_worker",
|
|
93
|
+
"connect_nats",
|
|
94
|
+
"Agent",
|
|
95
|
+
"Context",
|
|
96
|
+
"BlobStore",
|
|
97
|
+
"RedisBlobStore",
|
|
98
|
+
"InMemoryBlobStore",
|
|
99
|
+
"Middleware",
|
|
100
|
+
"NextFn",
|
|
101
|
+
"logging_middleware",
|
|
102
|
+
"MetricsHook",
|
|
103
|
+
"NoopMetrics",
|
|
104
|
+
"heartbeat_payload",
|
|
105
|
+
"heartbeat_payload_with_memory",
|
|
106
|
+
"heartbeat_payload_with_progress",
|
|
107
|
+
"emit_heartbeat",
|
|
108
|
+
"heartbeat_loop",
|
|
109
|
+
"progress_payload",
|
|
110
|
+
"cancel_payload",
|
|
111
|
+
"emit_progress",
|
|
112
|
+
"emit_cancel",
|
|
113
|
+
"ValidationError",
|
|
114
|
+
"validate_job_request",
|
|
115
|
+
"validate_job_result",
|
|
116
|
+
"validate_bus_packet",
|
|
117
|
+
"SUBJECT_SUBMIT",
|
|
118
|
+
"SUBJECT_RESULT",
|
|
119
|
+
"SUBJECT_HEARTBEAT",
|
|
120
|
+
"SUBJECT_ALERT",
|
|
121
|
+
"SUBJECT_PROGRESS",
|
|
122
|
+
"SUBJECT_CANCEL",
|
|
123
|
+
"SUBJECT_DLQ",
|
|
124
|
+
"SUBJECT_WORKFLOW_EVENT",
|
|
125
|
+
"SUBJECT_HANDSHAKE",
|
|
126
|
+
"CAPError",
|
|
127
|
+
"VersionMismatchError",
|
|
128
|
+
"MalformedPacketError",
|
|
129
|
+
"UnknownPayloadError",
|
|
130
|
+
"SignatureInvalidError",
|
|
131
|
+
"SignatureMissingError",
|
|
132
|
+
"JobTimeoutError",
|
|
133
|
+
"ResourceExhaustedError",
|
|
134
|
+
"PermissionDeniedError",
|
|
135
|
+
"InvalidInputError",
|
|
136
|
+
"JobNotFoundError",
|
|
137
|
+
"DuplicateJobError",
|
|
138
|
+
"WorkerUnavailableError",
|
|
139
|
+
"SafetyDeniedError",
|
|
140
|
+
"PolicyViolationError",
|
|
141
|
+
"RiskTagBlockedError",
|
|
142
|
+
"PublishFailedError",
|
|
143
|
+
"SubscribeFailedError",
|
|
144
|
+
"ConnectionLostError",
|
|
145
|
+
]
|
|
@@ -3,6 +3,8 @@ from typing import Optional
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
class NATSConfig:
|
|
6
|
+
"""NATS connection configuration."""
|
|
7
|
+
|
|
6
8
|
def __init__(
|
|
7
9
|
self,
|
|
8
10
|
url: str,
|
|
@@ -19,6 +21,17 @@ class NATSConfig:
|
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
async def connect_nats(cfg: NATSConfig):
|
|
24
|
+
"""Open a NATS connection using the provided configuration.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
cfg: Connection settings.
|
|
28
|
+
|
|
29
|
+
Returns:
|
|
30
|
+
A connected NATS client.
|
|
31
|
+
|
|
32
|
+
Raises:
|
|
33
|
+
RuntimeError: If the ``nats-py`` package is not installed.
|
|
34
|
+
"""
|
|
22
35
|
try:
|
|
23
36
|
import nats # type: ignore
|
|
24
37
|
except ImportError as exc:
|
|
@@ -17,6 +17,15 @@ async def submit_job(
|
|
|
17
17
|
sender_id: str,
|
|
18
18
|
private_key: Optional[ec.EllipticCurvePrivateKey] = None,
|
|
19
19
|
):
|
|
20
|
+
"""Publish a JobRequest onto the CAP submit subject.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
nc: An active NATS connection.
|
|
24
|
+
job_request: A protobuf JobRequest message.
|
|
25
|
+
trace_id: Distributed trace identifier propagated through the bus.
|
|
26
|
+
sender_id: Identity of the sender (used in the BusPacket envelope).
|
|
27
|
+
private_key: Optional ECDSA private key for signing the packet.
|
|
28
|
+
"""
|
|
20
29
|
ts = timestamp_pb2.Timestamp()
|
|
21
30
|
ts.GetCurrentTime()
|
|
22
31
|
packet = buspacket_pb2.BusPacket()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Typed error classes matching the CAP ErrorCode registry.
|
|
2
|
+
|
|
3
|
+
See spec/13-error-codes.md for the full taxonomy.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CAPError(Exception):
    """Root of the CAP exception hierarchy.

    Each subclass overrides ``code`` (the symbolic error-code name) and
    ``numeric_code`` (its integer counterpart); the base class carries the
    "unspecified" defaults.

    Args:
        message: Human-readable description, stored as the exception message.
    """

    # Symbolic error-code name.
    code: str = "ERROR_CODE_UNSPECIFIED"
    # Integer error code paired with ``code``.
    numeric_code: int = 0

    def __init__(self, message: str) -> None:
        super().__init__(message)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Protocol errors (100-199)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class VersionMismatchError(CAPError):
    """Protocol error 100: protocol version mismatch."""

    code: str = "ERROR_CODE_PROTOCOL_VERSION_MISMATCH"
    numeric_code: int = 100
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MalformedPacketError(CAPError):
    """Protocol error 101: malformed packet."""

    code: str = "ERROR_CODE_PROTOCOL_MALFORMED_PACKET"
    numeric_code: int = 101
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class UnknownPayloadError(CAPError):
    """Protocol error 102: unknown payload."""

    code: str = "ERROR_CODE_PROTOCOL_UNKNOWN_PAYLOAD"
    numeric_code: int = 102
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SignatureInvalidError(CAPError):
    """Protocol error 103: invalid signature."""

    code: str = "ERROR_CODE_PROTOCOL_SIGNATURE_INVALID"
    numeric_code: int = 103
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class SignatureMissingError(CAPError):
    """Protocol error 104: missing signature."""

    code: str = "ERROR_CODE_PROTOCOL_SIGNATURE_MISSING"
    numeric_code: int = 104
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Job errors (200-299)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class JobTimeoutError(CAPError):
    """Job error 200: job timeout."""

    code: str = "ERROR_CODE_JOB_TIMEOUT"
    numeric_code: int = 200
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ResourceExhaustedError(CAPError):
    """Job error 201: resource exhausted."""

    code: str = "ERROR_CODE_JOB_RESOURCE_EXHAUSTED"
    numeric_code: int = 201
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class PermissionDeniedError(CAPError):
    """Job error 202: permission denied."""

    code: str = "ERROR_CODE_JOB_PERMISSION_DENIED"
    numeric_code: int = 202
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class InvalidInputError(CAPError):
    """Job error 203: invalid input."""

    code: str = "ERROR_CODE_JOB_INVALID_INPUT"
    numeric_code: int = 203
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class JobNotFoundError(CAPError):
    """Job error 204: job not found."""

    code: str = "ERROR_CODE_JOB_NOT_FOUND"
    numeric_code: int = 204
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class DuplicateJobError(CAPError):
    """Job error 205: duplicate job."""

    code: str = "ERROR_CODE_JOB_DUPLICATE"
    numeric_code: int = 205
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class WorkerUnavailableError(CAPError):
    """Job error 206: worker unavailable."""

    code: str = "ERROR_CODE_JOB_WORKER_UNAVAILABLE"
    numeric_code: int = 206
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Safety errors (300-399)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class SafetyDeniedError(CAPError):
    """Safety error 300: safety denied."""

    code: str = "ERROR_CODE_SAFETY_DENIED"
    numeric_code: int = 300
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class PolicyViolationError(CAPError):
    """Safety error 301: policy violation."""

    code: str = "ERROR_CODE_SAFETY_POLICY_VIOLATION"
    numeric_code: int = 301
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class RiskTagBlockedError(CAPError):
    """Safety error 302: risk tag blocked."""

    code: str = "ERROR_CODE_SAFETY_RISK_TAG_BLOCKED"
    numeric_code: int = 302
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Transport errors (400-499)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class PublishFailedError(CAPError):
    """Transport error 400: publish failed."""

    code: str = "ERROR_CODE_TRANSPORT_PUBLISH_FAILED"
    numeric_code: int = 400
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class SubscribeFailedError(CAPError):
    """Transport error 401: subscribe failed."""

    code: str = "ERROR_CODE_TRANSPORT_SUBSCRIBE_FAILED"
    numeric_code: int = 401
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class ConnectionLostError(CAPError):
    """Transport error 402: connection lost."""

    code: str = "ERROR_CODE_TRANSPORT_CONNECTION_LOST"
    numeric_code: int = 402
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Heartbeat helpers for CAP Python SDK.
|
|
2
|
+
|
|
3
|
+
These helpers build and publish heartbeat BusPacket envelopes.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import logging
|
|
8
|
+
from typing import Callable, Optional
|
|
9
|
+
|
|
10
|
+
from cryptography.hazmat.primitives import hashes
|
|
11
|
+
from cryptography.hazmat.primitives.asymmetric import ec
|
|
12
|
+
from google.protobuf import timestamp_pb2
|
|
13
|
+
|
|
14
|
+
from cap.client import DEFAULT_PROTOCOL_VERSION
|
|
15
|
+
from cap.metrics import MetricsHook
|
|
16
|
+
from cap.pb.cordum.agent.v1 import buspacket_pb2, heartbeat_pb2
|
|
17
|
+
from cap.subjects import SUBJECT_HEARTBEAT
|
|
18
|
+
|
|
19
|
+
_logger = logging.getLogger("cap.heartbeat")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def heartbeat_payload(
    worker_id: str,
    pool: str,
    active_jobs: int,
    max_parallel: int,
    cpu_load: float,
) -> bytes:
    """Return serialized heartbeat bytes carrying CPU load only.

    Delegates to ``heartbeat_payload_with_progress``; the memory and
    progress arguments are left at that function's defaults.
    """
    base_fields = {
        "worker_id": worker_id,
        "pool": pool,
        "active_jobs": active_jobs,
        "max_parallel": max_parallel,
        "cpu_load": cpu_load,
    }
    return heartbeat_payload_with_progress(**base_fields)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def heartbeat_payload_with_memory(
    worker_id: str,
    pool: str,
    active_jobs: int,
    max_parallel: int,
    cpu_load: float,
    memory_load: float,
) -> bytes:
    """Return serialized heartbeat bytes carrying CPU and memory load.

    Delegates to ``heartbeat_payload_with_progress``; the progress
    arguments are left at that function's defaults.
    """
    fields = {
        "worker_id": worker_id,
        "pool": pool,
        "active_jobs": active_jobs,
        "max_parallel": max_parallel,
        "cpu_load": cpu_load,
        "memory_load": memory_load,
    }
    return heartbeat_payload_with_progress(**fields)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def heartbeat_payload_with_progress(
    worker_id: str,
    pool: str,
    active_jobs: int,
    max_parallel: int,
    cpu_load: float,
    memory_load: float = 0.0,
    progress_pct: int = 0,
    last_memo: str = "",
) -> bytes:
    """Serialize a heartbeat ``BusPacket`` with memory and progress fields.

    The envelope uses ``worker_id`` as its ``sender_id``, carries
    ``DEFAULT_PROTOCOL_VERSION`` and is stamped with the current time.

    Returns:
        The packet serialized with ``deterministic=True``.
    """
    now = timestamp_pb2.Timestamp()
    now.GetCurrentTime()

    beat = heartbeat_pb2.Heartbeat(
        worker_id=worker_id,
        pool=pool,
        active_jobs=active_jobs,
        max_parallel_jobs=max_parallel,
        cpu_load=cpu_load,
        memory_load=memory_load,
        progress_pct=progress_pct,
        last_memo=last_memo,
    )

    envelope = buspacket_pb2.BusPacket()
    envelope.sender_id = worker_id
    envelope.protocol_version = DEFAULT_PROTOCOL_VERSION
    envelope.created_at.CopyFrom(now)
    envelope.heartbeat.CopyFrom(beat)
    return envelope.SerializeToString(deterministic=True)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def emit_heartbeat(
    nc,
    payload: bytes,
    private_key: Optional[ec.EllipticCurvePrivateKey] = None,
) -> None:
    """Publish a single heartbeat packet on ``SUBJECT_HEARTBEAT``.

    Args:
        nc: Object exposing an awaitable ``publish(subject, data)``.
        payload: Serialized ``BusPacket`` bytes.
        private_key: When given, the packet is re-parsed, signed with
            ECDSA/SHA-256 and re-serialized before publishing; otherwise
            ``payload`` is published unchanged.
    """
    if private_key is None:
        await nc.publish(SUBJECT_HEARTBEAT, payload)
        return

    envelope = buspacket_pb2.BusPacket()
    envelope.ParseFromString(payload)
    # Sign the packet with the signature field cleared, so any signature
    # already present in ``payload`` is not part of the signed bytes.
    envelope.ClearField("signature")
    to_sign = envelope.SerializeToString(deterministic=True)
    envelope.signature = private_key.sign(to_sign, ec.ECDSA(hashes.SHA256()))
    signed = envelope.SerializeToString(deterministic=True)

    await nc.publish(SUBJECT_HEARTBEAT, signed)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
async def heartbeat_loop(
    nc,
    payload_fn: Callable[[], bytes],
    interval: float = 5.0,
    private_key: Optional[ec.EllipticCurvePrivateKey] = None,
    metrics: Optional[MetricsHook] = None,
    cancel_event: Optional[asyncio.Event] = None,
) -> None:
    """Emit heartbeat packets periodically until cancelled.

    Each iteration waits ``interval`` seconds first and then publishes one
    heartbeat, so the first packet goes out after one full interval.

    Args:
        nc: Connection object passed through to ``emit_heartbeat``.
        payload_fn: Called once per tick to build the serialized packet.
        interval: Seconds between heartbeats; negative values are treated
            as ``0``.
        private_key: Optional signing key forwarded to ``emit_heartbeat``.
        metrics: Optional hook; ``on_heartbeat_sent`` is called after each
            successful publish with the worker id read from the payload.
        cancel_event: Optional event; once it is set the loop returns,
            interrupting the wait between heartbeats.

    Raises:
        asyncio.CancelledError: Propagated when the surrounding task is
            cancelled. Any other exception raised while building or
            publishing a heartbeat is logged and the loop continues.
    """
    sleep_interval = max(0.0, interval)

    while True:
        if cancel_event is not None and cancel_event.is_set():
            return

        if cancel_event is None:
            await asyncio.sleep(sleep_interval)
        else:
            # ``wait_for`` cancels the inner ``wait()`` both on timeout and
            # when this coroutine is cancelled, so no helper task is left
            # pending on the event after the loop goes away.
            try:
                await asyncio.wait_for(cancel_event.wait(), timeout=sleep_interval)
            except asyncio.TimeoutError:
                # Interval elapsed without cancellation: emit a heartbeat.
                pass
            else:
                if cancel_event.is_set():
                    return

        try:
            payload = payload_fn()
            await emit_heartbeat(nc=nc, payload=payload, private_key=private_key)
            if metrics is not None:
                # Recover the worker id from the packet itself; fall back to
                # the envelope sender when the heartbeat field is empty.
                packet = buspacket_pb2.BusPacket()
                packet.ParseFromString(payload)
                worker_id = packet.heartbeat.worker_id or packet.sender_id
                metrics.on_heartbeat_sent(worker_id)
        except asyncio.CancelledError:
            raise
        except Exception:
            # Best effort: one failed heartbeat must not stop the loop.
            _logger.exception("heartbeat emission failed")
|