assay-it 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ /target
2
+ archive/
3
+ **/*.rs.bk
4
+ Cargo.lock
5
+ .eval
6
+ .DS_Store
7
+ *.log
8
+ junit.xml
9
+ sarif.json
10
+ run.json
11
+ otel.jsonl
12
+ out/
13
+ venv/
14
+ .venv/
15
+ __pycache__/
16
+ *.pyc
17
+ .pytest_cache/
18
+ *.egg-info/
19
+ dist/
20
+ build/
21
+ .env
22
+ .coverage
23
+ htmlcov/
24
+ .mypy_cache/
25
+ .ruff_cache/
26
+
27
+ # Release & Smoke Test
28
+ .venv_smoke/
29
+ smoke_test_ws/
30
+ verdict_sdk.egg-info/
31
+ assay_python_sdk.egg-info/
assay_it-0.8.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 verdict-dev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: assay-it
3
+ Version: 0.8.0
4
+ Summary: High-performance evaluation framework for LLM agents
5
+ Project-URL: Homepage, https://assay.dev
6
+ Project-URL: Repository, https://github.com/Rul1an/assay
7
+ Project-URL: Issues, https://github.com/Rul1an/assay/issues
8
+ Author-email: Assay <hello@assay.dev>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2025 verdict-dev
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Classifier: Development Status :: 4 - Beta
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Programming Language :: Python :: 3.8
36
+ Classifier: Programming Language :: Python :: 3.9
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Topic :: Software Development :: Quality Assurance
41
+ Requires-Python: >=3.9
42
+ Requires-Dist: pyyaml>=6.0
43
+ Provides-Extra: dev
44
+ Requires-Dist: black; extra == 'dev'
45
+ Requires-Dist: isort; extra == 'dev'
46
+ Requires-Dist: mypy; extra == 'dev'
47
+ Requires-Dist: pytest; extra == 'dev'
48
+ Requires-Dist: pytest-asyncio; extra == 'dev'
49
+ Provides-Extra: openai
50
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
51
+ Description-Content-Type: text/markdown
52
+
53
+ # Assay Python SDK
54
+
55
+ **Record deterministic traces from your Python agents for regression gating.**
56
+
57
+ ## 🚀 Golden Quickstart
58
+
59
+ The fastest way to regression test your AI agent.
60
+
61
+ ### 1. Installation
62
+ ```bash
63
+ pip install assay-it
64
+ ```
65
+
66
+ ### 2. Record (`record.py`)
67
+ Run your agent through the SDK to capture a trace. Pass your tool functions to `tool_executors` so Assay can record their inputs and outputs.
68
+
69
+ ```python
70
+ import os
71
+ import openai
72
+ from assay_sdk import TraceWriter, record_chat_completions_with_tools
73
+
74
+ # 1. Setup Client & Tools
75
+ client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "mock"))
76
+ TOOLS = [{
77
+ "type": "function",
78
+ "function": {
79
+ "name": "GetWeather",
80
+ "parameters": {"type": "object", "properties": {"location": {"type": "string"}}}
81
+ }
82
+ }]
83
+
84
+ # 2. Define Execution Logic (The "Real" Code)
85
+ def get_weather(args):
86
+ return {"temp": 22, "location": args.get("location")}
87
+
88
+ # 3. Record the Loop
89
+ writer = TraceWriter("traces/quickstart.jsonl")
90
+ result = record_chat_completions_with_tools(
91
+ writer=writer,
92
+ client=client,
93
+ model="gpt-4o",
94
+ messages=[{"role": "user", "content": "Weather in Tokyo?"}],
95
+ tools=TOOLS,
96
+ tool_executors={"GetWeather": get_weather}, # Link schema -> function
97
+ episode_id="weather_demo",
98
+ test_id="weather_check"
99
+ )
100
+ print(f"Agent Final Answer: {result['content']}")
101
+ ```
102
+
103
+ ### 3. Configure (`assay.yaml`)
104
+ Tell Assay what to check.
105
+
106
+ ```yaml
107
+ version: 1
108
+ model: "trace"
109
+ tests:
110
+ - id: weather_check
111
+ input:
112
+ prompt: "Weather in Tokyo?" # Matches the recorded prompt
113
+ expected:
114
+ type: regex_match
115
+ pattern: ".*" # Pass if any content returned (baseline check)
116
+ ```
117
+
118
+ ### 4. Verify
119
+ Run the regression gate. This replays your trace against the recorded tool outputs to ensure determinism.
120
+
121
+ ```bash
122
+ # Verify strictly (fails if any tool call arg changed even slightly)
123
+ assay ci --config assay.yaml --trace-file traces/quickstart.jsonl --replay-strict --db :memory:
124
+ ```
125
+
126
+ ---
127
+
128
+ ## 🌊 Advanced: Streaming support
129
+ Capture streaming responses while maintaining tool call execution.
130
+
131
+ ```python
132
+ from assay_sdk import record_chat_completions_stream_with_tools
133
+
134
+ # ... setup client & writer ...
135
+
136
+ result = record_chat_completions_stream_with_tools(
137
+ writer=writer,
138
+ # ... args ...
139
+ stream=True # SDK handles chunk aggregation automatically
140
+ # tool_executors={...} # Required if tools are used
141
+ )
142
+ ```
143
+ *Note: The hybrid wrapper (`record_chat_completions_stream_with_tools`) streams the thinking tokens to the user, executes tools, and then performs a standard follow-up call.*
144
+
145
+ ## 🛡️ Advanced: Privacy & Redaction
146
+ Protect sensitive data (PII, API keys) from ever hitting the trace file.
147
+
148
+ ```python
149
+ from assay_sdk import TraceWriter, make_redactor
150
+
151
+ # Create a redactor that scrubs keys and regex patterns
152
+ redactor = make_redactor(
153
+ key_denylist={"authorization", "password", "api_key"},
154
+ patterns=[r"sk-[a-zA-Z0-9]{20,}"] # Mask OpenAI keys
155
+ )
156
+
157
+ # Attach to writer - happens automatically on write
158
+ writer = TraceWriter("traces/secure.jsonl", redact_fn=redactor)
159
+ ```
160
+
161
+ ## ⚡ Async Support
162
+ Native `async` support for high-throughput applications (FastAPI, etc.) is available via the `assay_sdk.async_openai` submodule. It provides full parity with the sync API, including loop and streaming support.
163
+
164
+ ## ❓ Troubleshooting
165
+
166
+ ### `E_TRACE_EPISODE_MISSING`
167
+ **Cause**: The `test_id` or `episode_id` in your trace doesn't match what `assay ci` expected from its config (or implicit default).
168
+ **Fix**: Ensure your `assay.yaml` test IDs match the `test_id` passed to `record_chat_completions...`.
169
+
170
+ ### "Duplicate prompt in strict replay"
171
+ **Cause**: You ran `record.py` twice without cleaning the trace file, so it contains two identical episodes. `assay ci` in strict mode doesn't know which one to replay.
172
+ **Fix**:
173
+ 1. Truncate the file before recording: `trace_path = "traces/my_trace.jsonl"; open(trace_path, 'w').close()`.
174
+ 2. Use unique `episode_id`s (e.g. UUIDs) for every run.
@@ -0,0 +1,122 @@
1
+ # Assay Python SDK
2
+
3
+ **Record deterministic traces from your Python agents for regression gating.**
4
+
5
+ ## 🚀 Golden Quickstart
6
+
7
+ The fastest way to regression test your AI agent.
8
+
9
+ ### 1. Installation
10
+ ```bash
11
+ pip install assay-it
12
+ ```
13
+
14
+ ### 2. Record (`record.py`)
15
+ Run your agent through the SDK to capture a trace. Pass your tool functions to `tool_executors` so Assay can record their inputs and outputs.
16
+
17
+ ```python
18
+ import os
19
+ import openai
20
+ from assay_sdk import TraceWriter, record_chat_completions_with_tools
21
+
22
+ # 1. Setup Client & Tools
23
+ client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "mock"))
24
+ TOOLS = [{
25
+ "type": "function",
26
+ "function": {
27
+ "name": "GetWeather",
28
+ "parameters": {"type": "object", "properties": {"location": {"type": "string"}}}
29
+ }
30
+ }]
31
+
32
+ # 2. Define Execution Logic (The "Real" Code)
33
+ def get_weather(args):
34
+ return {"temp": 22, "location": args.get("location")}
35
+
36
+ # 3. Record the Loop
37
+ writer = TraceWriter("traces/quickstart.jsonl")
38
+ result = record_chat_completions_with_tools(
39
+ writer=writer,
40
+ client=client,
41
+ model="gpt-4o",
42
+ messages=[{"role": "user", "content": "Weather in Tokyo?"}],
43
+ tools=TOOLS,
44
+ tool_executors={"GetWeather": get_weather}, # Link schema -> function
45
+ episode_id="weather_demo",
46
+ test_id="weather_check"
47
+ )
48
+ print(f"Agent Final Answer: {result['content']}")
49
+ ```
50
+
51
+ ### 3. Configure (`assay.yaml`)
52
+ Tell Assay what to check.
53
+
54
+ ```yaml
55
+ version: 1
56
+ model: "trace"
57
+ tests:
58
+ - id: weather_check
59
+ input:
60
+ prompt: "Weather in Tokyo?" # Matches the recorded prompt
61
+ expected:
62
+ type: regex_match
63
+ pattern: ".*" # Pass if any content returned (baseline check)
64
+ ```
65
+
66
+ ### 4. Verify
67
+ Run the regression gate. This replays your trace against the recorded tool outputs to ensure determinism.
68
+
69
+ ```bash
70
+ # Verify strictly (fails if any tool call arg changed even slightly)
71
+ assay ci --config assay.yaml --trace-file traces/quickstart.jsonl --replay-strict --db :memory:
72
+ ```
73
+
74
+ ---
75
+
76
+ ## 🌊 Advanced: Streaming support
77
+ Capture streaming responses while maintaining tool call execution.
78
+
79
+ ```python
80
+ from assay_sdk import record_chat_completions_stream_with_tools
81
+
82
+ # ... setup client & writer ...
83
+
84
+ result = record_chat_completions_stream_with_tools(
85
+ writer=writer,
86
+ # ... args ...
87
+ stream=True # SDK handles chunk aggregation automatically
88
+ # tool_executors={...} # Required if tools are used
89
+ )
90
+ ```
91
+ *Note: The hybrid wrapper (`record_chat_completions_stream_with_tools`) streams the thinking tokens to the user, executes tools, and then performs a standard follow-up call.*
92
+
93
+ ## 🛡️ Advanced: Privacy & Redaction
94
+ Protect sensitive data (PII, API keys) from ever hitting the trace file.
95
+
96
+ ```python
97
+ from assay_sdk import TraceWriter, make_redactor
98
+
99
+ # Create a redactor that scrubs keys and regex patterns
100
+ redactor = make_redactor(
101
+ key_denylist={"authorization", "password", "api_key"},
102
+ patterns=[r"sk-[a-zA-Z0-9]{20,}"] # Mask OpenAI keys
103
+ )
104
+
105
+ # Attach to writer - happens automatically on write
106
+ writer = TraceWriter("traces/secure.jsonl", redact_fn=redactor)
107
+ ```
108
+
109
+ ## ⚡ Async Support
110
+ Native `async` support for high-throughput applications (FastAPI, etc.) is available via the `assay_sdk.async_openai` submodule. It provides full parity with the sync API, including loop and streaming support.
111
+
112
+ ## ❓ Troubleshooting
113
+
114
+ ### `E_TRACE_EPISODE_MISSING`
115
+ **Cause**: The `test_id` or `episode_id` in your trace doesn't match what `assay ci` expected from its config (or implicit default).
116
+ **Fix**: Ensure your `assay.yaml` test IDs match the `test_id` passed to `record_chat_completions...`.
117
+
118
+ ### "Duplicate prompt in strict replay"
119
+ **Cause**: You ran `record.py` twice without cleaning the trace file, so it contains two identical episodes. `assay ci` in strict mode doesn't know which one to replay.
120
+ **Fix**:
121
+ 1. Truncate the file before recording: `trace_path = "traces/my_trace.jsonl"; open(trace_path, 'w').close()`.
122
+ 2. Use unique `episode_id`s (e.g. UUIDs) for every run.
@@ -0,0 +1,24 @@
1
+ """
2
+ Assay Python SDK: deterministic trace recording for regression gating.
3
+ """
4
+
5
+ from .clock import FrozenClock, SystemClock
6
+ from .openai_instrumentor import (record_chat_completions,
7
+ record_chat_completions_with_tools)
8
+ from .openai_stream_wrapper import (record_chat_completions_stream,
9
+ record_chat_completions_stream_with_tools)
10
+ from .recorder import EpisodeRecorder
11
+ from .redaction import make_redactor
12
+ from .writer import TraceWriter
13
+
14
+ __all__ = [
15
+ "TraceWriter",
16
+ "EpisodeRecorder",
17
+ "SystemClock",
18
+ "FrozenClock",
19
+ "record_chat_completions",
20
+ "record_chat_completions_with_tools",
21
+ "make_redactor",
22
+ "record_chat_completions_stream",
23
+ "record_chat_completions_stream_with_tools",
24
+ ]
@@ -0,0 +1,25 @@
1
+ import sys
2
+
3
+ from .doctor import run_doctor
4
+
5
+
6
def main(argv=None):
    """Dispatch the ``assay`` command line to the matching sub-command.

    Args:
        argv: Command-line arguments *without* the program name. When
            ``None`` (the default, and what the console entry point uses),
            ``sys.argv[1:]`` is read instead.

    Raises:
        SystemExit: With status 1 when no command is given or the command
            is not recognised.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args:
        # Usage on a failed invocation is a diagnostic, so it goes to stderr
        # and leaves stdout clean for real command output.
        print("Usage: assay <command>", file=sys.stderr)
        print("Commands:", file=sys.stderr)
        print("  doctor    Run health checks", file=sys.stderr)
        sys.exit(1)

    cmd = args[0]
    if cmd == "doctor":
        run_doctor()
    elif cmd == "enforce":
        # Placeholder for future Phase 3.1; intentionally still exits 0.
        print("Not implemented yet.")
    else:
        print(f"Unknown command: {cmd}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Any, Dict, Optional
7
+
8
+ from .result import CompareResult, EvalRun, ResultArtifacts
9
+
10
+
11
class RunWriter:
    """Persist ``EvalRun`` and ``CompareResult`` objects under ``<workdir>/runs``.

    Every run gets its own directory, ``<workdir>/runs/<run_id>/``, which
    holds the serialized run, the per-test results, a Markdown summary and an
    ``artifacts.json`` index listing the absolute paths of those files.
    """

    def __init__(self, workdir: Path) -> None:
        """Remember *workdir* and create its ``runs`` sub-directory if needed."""
        self.workdir = workdir
        self.runs_dir = workdir / "runs"
        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def write_run(self, run: EvalRun) -> ResultArtifacts:
        """Persist *run* to ``<workdir>/runs/<run_id>/``.

        Writes ``run.json``, ``results.jsonl`` (one JSON object per test),
        ``summary.md`` and the ``artifacts.json`` index. ``junit.xml`` is
        written only when ``run.artifacts.junit_xml`` is truthy.

        Returns:
            A ``ResultArtifacts`` pointing at the files inside the run
            directory (``sarif`` and ``diff_json`` are ``None``).
        """
        run_dir = self.runs_dir / run.run_id
        run_dir.mkdir(parents=True, exist_ok=True)

        run_json = run_dir / "run.json"
        results_jsonl = run_dir / "results.jsonl"
        junit_xml = run_dir / "junit.xml"
        summary_md = run_dir / "summary.md"
        artifacts_json = run_dir / "artifacts.json"

        self._write_json(run_json, run.to_dict())

        with open(results_jsonl, "w", encoding="utf-8") as f:
            for t in run.tests:
                f.write(json.dumps(t.to_dict()) + "\n")

        # NOTE(review): the junit path is recorded in the index and in the
        # returned artifacts even when this branch is skipped, so it may
        # point at a file that was never written -- confirm with consumers.
        if run.artifacts.junit_xml:
            with open(junit_xml, "w", encoding="utf-8") as f:
                f.write(run.to_junit_xml())

        with open(summary_md, "w", encoding="utf-8") as f:
            f.write(run.to_github_summary())

        index_data = {
            "run_id": run.run_id,
            "created_at_ms": run.created_at_ms,
            "paths": {
                "run_json": str(run_json.absolute()),
                "results_jsonl": str(results_jsonl.absolute()),
                "junit_xml": str(junit_xml.absolute()),
                "summary_md": str(summary_md.absolute()),
                "sarif": None,
                "diff_json": None,
                "trace_path": str(run.trace_path.absolute()),
            },
        }
        self._write_json(artifacts_json, index_data)

        # The EvalRun passed in is left untouched; callers get the on-disk
        # locations through the returned object instead.
        return ResultArtifacts(
            run_json=run_json,
            results_jsonl=results_jsonl,
            junit_xml=junit_xml,
            sarif=None,
            diff_json=None,
            trace_path=run.trace_path,
        )

    def write_diff(self, result: CompareResult) -> None:
        """Persist *result* into the directory of its current run.

        Writes ``diff.json``, overwrites ``summary.md`` with the comparison
        summary, and records the diff path in ``artifacts.json``. Does
        nothing when ``result.current_run_id`` is empty.
        """
        if not result.current_run_id:
            return
        run_dir = self.runs_dir / result.current_run_id
        run_dir.mkdir(parents=True, exist_ok=True)

        # 1. diff.json
        diff_file = run_dir / "diff.json"
        self._write_json(diff_file, result.to_dict())

        # 2. Replace summary.md with the comparison details
        summary_path = run_dir / "summary.md"
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(result.to_github_summary())

        # 3. Update artifacts.json
        index_path = run_dir / "artifacts.json"
        if index_path.exists():
            with open(index_path, "r", encoding="utf-8") as f:
                index_data = json.load(f)
        else:
            # Should exist, but fall back to a minimal index.
            index_data = {"run_id": result.current_run_id, "paths": {}}

        # An index written by something else may lack a usable "paths"
        # mapping; repair it instead of failing with KeyError/TypeError.
        paths = index_data.get("paths")
        if not isinstance(paths, dict):
            paths = {}
            index_data["paths"] = paths

        paths["diff_json"] = str(diff_file.absolute())
        self._write_json(index_path, index_data)

    def _write_json(self, path: Path, data: Dict[str, Any]) -> None:
        """Write *data* to *path* as indented UTF-8 JSON, keeping non-ASCII text."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)