agentmark-sdk 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/.gitignore +3 -1
  2. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/CHANGELOG.md +43 -0
  3. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/PKG-INFO +1 -1
  4. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/pyproject.toml +1 -1
  5. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/__init__.py +2 -0
  6. agentmark_sdk-0.3.0/src/agentmark_sdk/span_hooks.py +80 -0
  7. agentmark_sdk-0.3.0/tests/test_span_hooks.py +46 -0
  8. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/package.json +0 -0
  9. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/config.py +0 -0
  10. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/decorator.py +0 -0
  11. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/masking_processor.py +0 -0
  12. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/otlp_json_exporter.py +0 -0
  13. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/pii_masker.py +0 -0
  14. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/py.typed +0 -0
  15. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/sampler.py +0 -0
  16. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/sdk.py +0 -0
  17. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/serialize.py +0 -0
  18. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/src/agentmark_sdk/trace.py +0 -0
  19. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/__init__.py +0 -0
  20. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_decorator.py +0 -0
  21. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_masking_processor.py +0 -0
  22. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_otlp_json_exporter.py +0 -0
  23. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_pii_masker.py +0 -0
  24. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_sampler.py +0 -0
  25. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_sdk.py +0 -0
  26. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_serialize.py +0 -0
  27. {agentmark_sdk-0.2.1 → agentmark_sdk-0.3.0}/tests/test_trace.py +0 -0
@@ -9,7 +9,9 @@ yalc.lock
9
9
  .env
10
10
  *storybook.log
11
11
  storybook-static
12
- tmp-dev*/
12
+ # CLI test fixtures (test/dev.test.ts et al. create tmp-* dirs in packages/cli;
13
+ # crashed runs can leave them behind)
14
+ tmp-*/
13
15
  .claude
14
16
 
15
17
  # Nx
@@ -1,3 +1,46 @@
1
+ ## 0.3.0 (2026-06-09)
2
+
3
+ ### 🚀 Features
4
+
5
+ - feat(webhook): the runner owns dispatch; evals reach the cloud on every path ([#717](https://github.com/agentmark-ai/agentmark/pull/717))
6
+
7
+ The New Experiment dialog showed *"No evals available"* for deployed apps even
8
+ when they registered evals. Root cause: no single object owned "what the
9
+ deployed app exposes," so the eval registry had to travel a hand-assembled chain
10
+ (client → executor → runner → handler → dispatch → transport) that every entry
11
+ path re-wired — and any path could drop it. The Python managed handler hand-rolled
12
+ dispatch and 400'd on `get-evals`; the TS managed server forwarded the dispatch
13
+ envelope raw; the BYO `createWebhookRunner` built a client with no `evals` input
14
+ at all. This makes the chain non-assemblable.
15
+
16
+ - **Dispatch lives on the runner.** `WebhookRunner.dispatch(event)` (TS + Python)
17
+ routes prompt-run / dataset-run / get-evals, sourcing evals from its OWN
18
+ client — no passable, omittable client argument. The canonical managed handler
19
+ is `handler = runner.dispatch` (or `adapterHandler.dispatch`). `runner.client`
20
+ / `getEvalNames()` are public so a runner satisfies the control-plane contract.
21
+
22
+ - **`evals` is threaded through every builder.** TS `createWebhookRunner({ evals })`
23
+ and the new Python `create_webhook_runner(executor, evals=…)` register evals
24
+ once → they both run in experiments and list in the dialog. Adapter factories
25
+ already threaded evals; now the BYO path does too.
26
+
27
+ - **Adapters delegate, don't reimplement.** Pydantic / claude / ai-sdk-v4 / v5
28
+ webhook handlers expose `.dispatch` + `.client` by delegating to the shared
29
+ runner (both span hooks bundled at construction); no per-adapter dispatch code.
30
+
31
+ - **Anti-drift.** `conformance-vectors/protocol-catalog.json` gains a normative
32
+ `webhookJobs` section; both languages assert their REAL dispatch's job-type set
33
+ (`WEBHOOK_JOB_TYPES` / `WebhookRequest['type']`) is exhaustive over it, and the
34
+ get-evals payload stays pinned to `control-plane.json` on the dev AND managed
35
+ surfaces. Adding a job to one language without the other fails the other's CI.
36
+
37
+ New public API (minor) across prompt-core (TS + Python), the SDK
38
+ (`createWebhookRunner` `evals` option), and the adapters (`dispatch`/`client`).
39
+ Back-compat: `handleWebhookRequest(event, handler, client?)` still works; the
40
+ managed servers still accept legacy flat results. The managed Node server now
41
+ unwraps the dispatch envelope (the TS half of the empty dialog) — see
42
+ `apps/builder` machine-execute-contract test (monorepo, not released here).
43
+
1
44
  ## 0.2.1 (2026-04-16)
2
45
 
3
46
  ### 🩹 Fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentmark-sdk
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: AgentMark SDK for Python - Tracing and Observability
5
5
  Author-email: AgentMark <support@agentmark.co>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agentmark-sdk"
3
- version = "0.2.1"
3
+ version = "0.3.0"
4
4
  description = "AgentMark SDK for Python - Tracing and Observability"
5
5
  requires-python = ">=3.10"
6
6
  keywords = [
@@ -46,6 +46,7 @@ from .pii_masker import CustomPattern, PiiMaskerConfig, create_pii_masker
46
46
  from .sampler import AgentmarkSampler
47
47
  from .sdk import AgentMarkSDK
48
48
  from .serialize import serialize_value
49
+ from .span_hooks import create_agentmark_span_hooks
49
50
  from .trace import SpanContext, SpanOptions, SpanResult, span, span_context, span_context_sync
50
51
 
51
52
  __all__ = [
@@ -57,6 +58,7 @@ __all__ = [
57
58
  "span",
58
59
  "span_context",
59
60
  "span_context_sync",
61
+ "create_agentmark_span_hooks",
60
62
  "observe",
61
63
  "SpanOptions",
62
64
  "SpanContext",
@@ -0,0 +1,80 @@
1
+ """AgentMark span hooks for the shared ``WebhookRunner``.
2
+
3
+ ``create_agentmark_span_hooks()`` is the Python counterpart of the TypeScript
4
+ ``createAgentmarkSpanHooks()`` (``@agentmark-ai/sdk``): the one call that wires a
5
+ runner so every prompt run and every experiment item is traced to AgentMark. A
6
+ bring-your-own-SDK app passes it to ``create_webhook_runner`` (which also
7
+ defaults to it when this SDK is installed), so Python BYO tracing is as
8
+ turn-key as TypeScript.
9
+
10
+ The hooks map the runner's per-call params (``ExperimentItemParams`` /
11
+ ``PromptSpanParams`` from ``agentmark.prompt_core``, duck-typed here so this SDK
12
+ stays prompt-core-free) onto ``span_context``. They are intentionally identical
13
+ to the per-adapter hooks the pydantic / claude adapters define inline today —
14
+ those should adopt this single source in a follow-up so the mapping lives once.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ from contextlib import asynccontextmanager, suppress
21
+ from dataclasses import dataclass
22
+ from typing import Any
23
+
24
+ from .trace import SpanOptions, span_context
25
+
26
+
27
+ @dataclass
28
+ class _AgentMarkSpanCtx:
29
+ """Adapts a span context to the hook Protocol the shared runner expects:
30
+ ``trace_id`` + ``set_attribute(key, value)``."""
31
+
32
+ _inner: Any
33
+ trace_id: str = ""
34
+
35
+ def set_attribute(self, key: str, value: str) -> None:
36
+ with suppress(Exception):
37
+ self._inner.set_attribute(key, value)
38
+
39
+
40
+ @asynccontextmanager
41
+ async def _item_span(params: Any) -> Any:
42
+ """Per-item experiment span: maps the runner's params → SpanOptions."""
43
+ dataset_expected = (
44
+ json.dumps(params.dataset_expected_output)
45
+ if params.dataset_expected_output is not None
46
+ else None
47
+ )
48
+ dataset_input_json = (
49
+ json.dumps(params.dataset_input, default=str)
50
+ if params.dataset_input is not None
51
+ else None
52
+ )
53
+ options = SpanOptions(
54
+ name=f"experiment-{params.dataset_run_name}-{params.index}",
55
+ prompt_name=params.prompt_name,
56
+ dataset_run_id=params.experiment_run_id,
57
+ dataset_run_name=params.dataset_run_name,
58
+ dataset_item_name=params.dataset_item_name,
59
+ dataset_expected_output=dataset_expected,
60
+ dataset_input=dataset_input_json,
61
+ dataset_path=params.dataset_path,
62
+ metadata={"commit_sha": params.commit_sha} if params.commit_sha else None,
63
+ )
64
+ async with span_context(options) as ctx:
65
+ yield _AgentMarkSpanCtx(_inner=ctx, trace_id=ctx.trace_id)
66
+
67
+
68
+ @asynccontextmanager
69
+ async def _prompt_span(params: Any) -> Any:
70
+ """Prompt-level span for a single run."""
71
+ options = SpanOptions(name=params.name, prompt_name=params.prompt_name)
72
+ async with span_context(options) as ctx:
73
+ yield _AgentMarkSpanCtx(_inner=ctx, trace_id=ctx.trace_id)
74
+
75
+
76
+ def create_agentmark_span_hooks() -> dict[str, Any]:
77
+ """Return ``{"prompt_span_hook", "item_span_hook"}`` for a ``WebhookRunner`` —
78
+ every run and experiment item traced to AgentMark. Mirrors the TS
79
+ ``createAgentmarkSpanHooks()``."""
80
+ return {"prompt_span_hook": _prompt_span, "item_span_hook": _item_span}
@@ -0,0 +1,46 @@
1
+ """create_agentmark_span_hooks — the shared WebhookRunner span hooks.
2
+
3
+ Python counterpart of the TS ``createAgentmarkSpanHooks()``: the one call that
4
+ makes ``create_webhook_runner`` (and the adapters) trace every run and
5
+ experiment item. The hooks take the runner's per-call params duck-typed, so a
6
+ plain namespace stands in for ``PromptSpanParams`` / ``ExperimentItemParams``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from types import SimpleNamespace
12
+
13
+ from agentmark_sdk import create_agentmark_span_hooks
14
+
15
+
16
+ def test_returns_both_runner_hooks() -> None:
17
+ hooks = create_agentmark_span_hooks()
18
+ assert set(hooks) == {"prompt_span_hook", "item_span_hook"}
19
+
20
+
21
+ async def test_prompt_span_yields_an_annotatable_ctx() -> None:
22
+ hooks = create_agentmark_span_hooks()
23
+ params = SimpleNamespace(name="run", prompt_name="greeting")
24
+ async with hooks["prompt_span_hook"](params) as ctx:
25
+ assert hasattr(ctx, "trace_id")
26
+ ctx.set_attribute("k", "v") # must not raise
27
+
28
+
29
+ async def test_item_span_maps_dataset_params_without_error() -> None:
30
+ # Exercises the dataset-field mapping (json.dumps of expected/input + the
31
+ # commit_sha metadata branch), the part most likely to drift.
32
+ hooks = create_agentmark_span_hooks()
33
+ params = SimpleNamespace(
34
+ dataset_run_name="exp",
35
+ index=0,
36
+ prompt_name="greeting",
37
+ experiment_run_id="run-1",
38
+ dataset_item_name="item-0",
39
+ dataset_expected_output={"ok": True},
40
+ dataset_input={"q": "hi"},
41
+ dataset_path="data/x.jsonl",
42
+ commit_sha="abc",
43
+ )
44
+ async with hooks["item_span_hook"](params) as ctx:
45
+ assert hasattr(ctx, "trace_id")
46
+ ctx.set_attribute("k", "v")