assay-it 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- assay_it-0.8.0/.gitignore +31 -0
- assay_it-0.8.0/LICENSE +21 -0
- assay_it-0.8.0/PKG-INFO +174 -0
- assay_it-0.8.0/README.md +122 -0
- assay_it-0.8.0/assay/__init__.py +24 -0
- assay_it-0.8.0/assay/__main__.py +25 -0
- assay_it-0.8.0/assay/artifacts.py +100 -0
- assay_it-0.8.0/assay/async_openai.py +400 -0
- assay_it-0.8.0/assay/baseline.py +93 -0
- assay_it-0.8.0/assay/clock.py +25 -0
- assay_it-0.8.0/assay/config.py +67 -0
- assay_it-0.8.0/assay/config_loader.py +237 -0
- assay_it-0.8.0/assay/context.py +30 -0
- assay_it-0.8.0/assay/doctor.py +91 -0
- assay_it-0.8.0/assay/errors.py +59 -0
- assay_it-0.8.0/assay/evaluator.py +385 -0
- assay_it-0.8.0/assay/judge/__init__.py +0 -0
- assay_it-0.8.0/assay/judge/cache.py +60 -0
- assay_it-0.8.0/assay/judge/client.py +10 -0
- assay_it-0.8.0/assay/judge/openai_judge.py +120 -0
- assay_it-0.8.0/assay/judge/types.py +25 -0
- assay_it-0.8.0/assay/metrics/builtin.py +79 -0
- assay_it-0.8.0/assay/metrics/ops.py +11 -0
- assay_it-0.8.0/assay/mocks/openai_stream_mock.py +136 -0
- assay_it-0.8.0/assay/openai_instrumentor.py +331 -0
- assay_it-0.8.0/assay/openai_stream_wrapper.py +264 -0
- assay_it-0.8.0/assay/openai_streaming.py +140 -0
- assay_it-0.8.0/assay/recorder.py +165 -0
- assay_it-0.8.0/assay/redaction.py +57 -0
- assay_it-0.8.0/assay/result.py +276 -0
- assay_it-0.8.0/assay/trace_reader.py +69 -0
- assay_it-0.8.0/assay/writer.py +21 -0
- assay_it-0.8.0/examples/openai-demo/README.md +57 -0
- assay_it-0.8.0/pyproject.toml +61 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/target
|
|
2
|
+
archive/
|
|
3
|
+
**/*.rs.bk
|
|
4
|
+
Cargo.lock
|
|
5
|
+
.eval
|
|
6
|
+
.DS_Store
|
|
7
|
+
*.log
|
|
8
|
+
junit.xml
|
|
9
|
+
sarif.json
|
|
10
|
+
run.json
|
|
11
|
+
otel.jsonl
|
|
12
|
+
out/
|
|
13
|
+
venv/
|
|
14
|
+
.venv/
|
|
15
|
+
__pycache__/
|
|
16
|
+
*.pyc
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
*.egg-info/
|
|
19
|
+
dist/
|
|
20
|
+
build/
|
|
21
|
+
.env
|
|
22
|
+
.coverage
|
|
23
|
+
htmlcov/
|
|
24
|
+
.mypy_cache/
|
|
25
|
+
.ruff_cache/
|
|
26
|
+
|
|
27
|
+
# Release & Smoke Test
|
|
28
|
+
.venv_smoke/
|
|
29
|
+
smoke_test_ws/
|
|
30
|
+
verdict_sdk.egg-info/
|
|
31
|
+
assay_python_sdk.egg-info/
|
assay_it-0.8.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 verdict-dev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
assay_it-0.8.0/PKG-INFO
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: assay-it
|
|
3
|
+
Version: 0.8.0
|
|
4
|
+
Summary: High-performance evaluation framework for LLM agents
|
|
5
|
+
Project-URL: Homepage, https://assay.dev
|
|
6
|
+
Project-URL: Repository, https://github.com/Rul1an/assay
|
|
7
|
+
Project-URL: Issues, https://github.com/Rul1an/assay/issues
|
|
8
|
+
Author-email: Assay <hello@assay.dev>
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2025 verdict-dev
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Classifier: Development Status :: 4 - Beta
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Programming Language :: Python :: 3
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
41
|
+
Requires-Python: >=3.9
|
|
42
|
+
Requires-Dist: pyyaml>=6.0
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: black; extra == 'dev'
|
|
45
|
+
Requires-Dist: isort; extra == 'dev'
|
|
46
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
47
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
48
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
49
|
+
Provides-Extra: openai
|
|
50
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
# Assay Python SDK
|
|
54
|
+
|
|
55
|
+
**Record deterministic traces from your Python agents for regression gating.**
|
|
56
|
+
|
|
57
|
+
## 🚀 Golden Quickstart
|
|
58
|
+
|
|
59
|
+
The fastest way to regression test your AI agent.
|
|
60
|
+
|
|
61
|
+
### 1. Installation
|
|
62
|
+
```bash
|
|
63
|
+
pip install assay-it
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Record (`record.py`)
|
|
67
|
+
Run your agent through the SDK to capture a trace. Pass your tool functions to `tool_executors` so Assay can record their inputs and outputs.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import os
|
|
71
|
+
import openai
|
|
72
|
+
from assay import TraceWriter, record_chat_completions_with_tools
|
|
73
|
+
|
|
74
|
+
# 1. Setup Client & Tools
|
|
75
|
+
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "mock"))
|
|
76
|
+
TOOLS = [{
|
|
77
|
+
"type": "function",
|
|
78
|
+
"function": {
|
|
79
|
+
"name": "GetWeather",
|
|
80
|
+
"parameters": {"type": "object", "properties": {"location": {"type": "string"}}}
|
|
81
|
+
}
|
|
82
|
+
}]
|
|
83
|
+
|
|
84
|
+
# 2. Define Execution Logic (The "Real" Code)
|
|
85
|
+
def get_weather(args):
|
|
86
|
+
return {"temp": 22, "location": args.get("location")}
|
|
87
|
+
|
|
88
|
+
# 3. Record the Loop
|
|
89
|
+
writer = TraceWriter("traces/quickstart.jsonl")
|
|
90
|
+
result = record_chat_completions_with_tools(
|
|
91
|
+
writer=writer,
|
|
92
|
+
client=client,
|
|
93
|
+
model="gpt-4o",
|
|
94
|
+
messages=[{"role": "user", "content": "Weather in Tokyo?"}],
|
|
95
|
+
tools=TOOLS,
|
|
96
|
+
tool_executors={"GetWeather": get_weather}, # Link schema -> function
|
|
97
|
+
episode_id="weather_demo",
|
|
98
|
+
test_id="weather_check"
|
|
99
|
+
)
|
|
100
|
+
print(f"Agent Final Answer: {result['content']}")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 3. Configure (`assay.yaml`)
|
|
104
|
+
Tell Assay what to check.
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
version: 1
|
|
108
|
+
model: "trace"
|
|
109
|
+
tests:
|
|
110
|
+
- id: weather_check
|
|
111
|
+
input:
|
|
112
|
+
prompt: "Weather in Tokyo?" # Matches the recorded prompt
|
|
113
|
+
expected:
|
|
114
|
+
type: regex_match
|
|
115
|
+
pattern: ".*" # Pass if any content returned (baseline check)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 4. Verify
|
|
119
|
+
Run the regression gate. This replays your trace against the recorded tool outputs to ensure determinism.
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# Verify strictly (fails if any tool call arg changed even slightly)
|
|
123
|
+
assay ci --config assay.yaml --trace-file traces/quickstart.jsonl --replay-strict --db :memory:
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## 🌊 Advanced: Streaming support
|
|
129
|
+
Capture streaming responses while maintaining tool call execution.
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from assay import record_chat_completions_stream_with_tools
|
|
133
|
+
|
|
134
|
+
# ... setup client & writer ...
|
|
135
|
+
|
|
136
|
+
result = record_chat_completions_stream_with_tools(
|
|
137
|
+
writer=writer,
|
|
138
|
+
# ... args ...
|
|
139
|
+
stream=True # SDK handles chunk aggregation automatically
|
|
140
|
+
# tool_executors={...} # Required if tools are used
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
*Note: The hybrid wrapper (`record_chat_completions_stream_with_tools`) streams the thinking tokens to the user, executes tools, and then performs a standard follow-up call.*
|
|
144
|
+
|
|
145
|
+
## 🛡️ Advanced: Privacy & Redaction
|
|
146
|
+
Protect sensitive data (PII, API keys) from ever hitting the trace file.
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from assay import TraceWriter, make_redactor
|
|
150
|
+
|
|
151
|
+
# Create a redactor that scrubs keys and regex patterns
|
|
152
|
+
redactor = make_redactor(
|
|
153
|
+
key_denylist={"authorization", "password", "api_key"},
|
|
154
|
+
patterns=[r"sk-[a-zA-Z0-9]{20,}"] # Mask OpenAI keys
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Attach to writer - happens automatically on write
|
|
158
|
+
writer = TraceWriter("traces/secure.jsonl", redact_fn=redactor)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## ⚡ Async Support
|
|
162
|
+
Native `async` support for high-throughput applications (FastAPI, etc.) is available via the `assay.async_openai` submodule. It provides full parity with the sync API, including loop and streaming support.
|
|
163
|
+
|
|
164
|
+
## ❓ Troubleshooting
|
|
165
|
+
|
|
166
|
+
### `E_TRACE_EPISODE_MISSING`
|
|
167
|
+
**Cause**: The `test_id` or `episode_id` in your trace doesn't match what `assay ci` expected from its config (or implicit default).
|
|
168
|
+
**Fix**: Ensure your `assay.yaml` test IDs match the `test_id` passed to `record_chat_completions...`.
|
|
169
|
+
|
|
170
|
+
### "Duplicate prompt in strict replay"
|
|
171
|
+
**Cause**: You ran `record.py` twice without cleaning the trace file, so it contains two identical episodes. `assay ci` in strict mode doesn't know which one to replay.
|
|
172
|
+
**Fix**:
|
|
173
|
+
1. Truncate the file before recording: `trace_path = "traces/my_trace.jsonl"; open(trace_path, 'w').close()`.
|
|
174
|
+
2. Use unique `episode_id`s (e.g. UUIDs) for every run.
|
assay_it-0.8.0/README.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# Assay Python SDK
|
|
2
|
+
|
|
3
|
+
**Record deterministic traces from your Python agents for regression gating.**
|
|
4
|
+
|
|
5
|
+
## 🚀 Golden Quickstart
|
|
6
|
+
|
|
7
|
+
The fastest way to regression test your AI agent.
|
|
8
|
+
|
|
9
|
+
### 1. Installation
|
|
10
|
+
```bash
|
|
11
|
+
pip install assay-it
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
### 2. Record (`record.py`)
|
|
15
|
+
Run your agent through the SDK to capture a trace. Pass your tool functions to `tool_executors` so Assay can record their inputs and outputs.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import os
|
|
19
|
+
import openai
|
|
20
|
+
from assay import TraceWriter, record_chat_completions_with_tools
|
|
21
|
+
|
|
22
|
+
# 1. Setup Client & Tools
|
|
23
|
+
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "mock"))
|
|
24
|
+
TOOLS = [{
|
|
25
|
+
"type": "function",
|
|
26
|
+
"function": {
|
|
27
|
+
"name": "GetWeather",
|
|
28
|
+
"parameters": {"type": "object", "properties": {"location": {"type": "string"}}}
|
|
29
|
+
}
|
|
30
|
+
}]
|
|
31
|
+
|
|
32
|
+
# 2. Define Execution Logic (The "Real" Code)
|
|
33
|
+
def get_weather(args):
|
|
34
|
+
return {"temp": 22, "location": args.get("location")}
|
|
35
|
+
|
|
36
|
+
# 3. Record the Loop
|
|
37
|
+
writer = TraceWriter("traces/quickstart.jsonl")
|
|
38
|
+
result = record_chat_completions_with_tools(
|
|
39
|
+
writer=writer,
|
|
40
|
+
client=client,
|
|
41
|
+
model="gpt-4o",
|
|
42
|
+
messages=[{"role": "user", "content": "Weather in Tokyo?"}],
|
|
43
|
+
tools=TOOLS,
|
|
44
|
+
tool_executors={"GetWeather": get_weather}, # Link schema -> function
|
|
45
|
+
episode_id="weather_demo",
|
|
46
|
+
test_id="weather_check"
|
|
47
|
+
)
|
|
48
|
+
print(f"Agent Final Answer: {result['content']}")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### 3. Configure (`assay.yaml`)
|
|
52
|
+
Tell Assay what to check.
|
|
53
|
+
|
|
54
|
+
```yaml
|
|
55
|
+
version: 1
|
|
56
|
+
model: "trace"
|
|
57
|
+
tests:
|
|
58
|
+
- id: weather_check
|
|
59
|
+
input:
|
|
60
|
+
prompt: "Weather in Tokyo?" # Matches the recorded prompt
|
|
61
|
+
expected:
|
|
62
|
+
type: regex_match
|
|
63
|
+
pattern: ".*" # Pass if any content returned (baseline check)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 4. Verify
|
|
67
|
+
Run the regression gate. This replays your trace against the recorded tool outputs to ensure determinism.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Verify strictly (fails if any tool call arg changed even slightly)
|
|
71
|
+
assay ci --config assay.yaml --trace-file traces/quickstart.jsonl --replay-strict --db :memory:
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## 🌊 Advanced: Streaming support
|
|
77
|
+
Capture streaming responses while maintaining tool call execution.
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from assay import record_chat_completions_stream_with_tools
|
|
81
|
+
|
|
82
|
+
# ... setup client & writer ...
|
|
83
|
+
|
|
84
|
+
result = record_chat_completions_stream_with_tools(
|
|
85
|
+
writer=writer,
|
|
86
|
+
# ... args ...
|
|
87
|
+
stream=True # SDK handles chunk aggregation automatically
|
|
88
|
+
# tool_executors={...} # Required if tools are used
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
*Note: The hybrid wrapper (`record_chat_completions_stream_with_tools`) streams the thinking tokens to the user, executes tools, and then performs a standard follow-up call.*
|
|
92
|
+
|
|
93
|
+
## 🛡️ Advanced: Privacy & Redaction
|
|
94
|
+
Protect sensitive data (PII, API keys) from ever hitting the trace file.
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from assay import TraceWriter, make_redactor
|
|
98
|
+
|
|
99
|
+
# Create a redactor that scrubs keys and regex patterns
|
|
100
|
+
redactor = make_redactor(
|
|
101
|
+
key_denylist={"authorization", "password", "api_key"},
|
|
102
|
+
patterns=[r"sk-[a-zA-Z0-9]{20,}"] # Mask OpenAI keys
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Attach to writer - happens automatically on write
|
|
106
|
+
writer = TraceWriter("traces/secure.jsonl", redact_fn=redactor)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## ⚡ Async Support
|
|
110
|
+
Native `async` support for high-throughput applications (FastAPI, etc.) is available via the `assay.async_openai` submodule. It provides full parity with the sync API, including loop and streaming support.
|
|
111
|
+
|
|
112
|
+
## ❓ Troubleshooting
|
|
113
|
+
|
|
114
|
+
### `E_TRACE_EPISODE_MISSING`
|
|
115
|
+
**Cause**: The `test_id` or `episode_id` in your trace doesn't match what `assay ci` expected from its config (or implicit default).
|
|
116
|
+
**Fix**: Ensure your `assay.yaml` test IDs match the `test_id` passed to `record_chat_completions...`.
|
|
117
|
+
|
|
118
|
+
### "Duplicate prompt in strict replay"
|
|
119
|
+
**Cause**: You ran `record.py` twice without cleaning the trace file, so it contains two identical episodes. `assay ci` in strict mode doesn't know which one to replay.
|
|
120
|
+
**Fix**:
|
|
121
|
+
1. Truncate the file before recording: `trace_path = "traces/my_trace.jsonl"; open(trace_path, 'w').close()`.
|
|
122
|
+
2. Use unique `episode_id`s (e.g. UUIDs) for every run.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Assay Python SDK: deterministic trace recording for regression gating.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .clock import FrozenClock, SystemClock
|
|
6
|
+
from .openai_instrumentor import (record_chat_completions,
|
|
7
|
+
record_chat_completions_with_tools)
|
|
8
|
+
from .openai_stream_wrapper import (record_chat_completions_stream,
|
|
9
|
+
record_chat_completions_stream_with_tools)
|
|
10
|
+
from .recorder import EpisodeRecorder
|
|
11
|
+
from .redaction import make_redactor
|
|
12
|
+
from .writer import TraceWriter
|
|
13
|
+
|
|
14
|
+
# Public API re-exported by the package; the order matches the original
# declaration so ``from <package> import *`` binds the same names.
__all__ = [
    # Core recording primitives and clocks.
    "TraceWriter", "EpisodeRecorder", "SystemClock", "FrozenClock",
    # Non-streaming OpenAI chat-completion recorders.
    "record_chat_completions", "record_chat_completions_with_tools",
    # Redaction helper.
    "make_redactor",
    # Streaming OpenAI chat-completion recorders.
    "record_chat_completions_stream", "record_chat_completions_stream_with_tools",
]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
|
|
3
|
+
from .doctor import run_doctor
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
    """Dispatch the command line to the requested sub-command.

    Reads the command name from ``sys.argv[1]``:

    * ``doctor``  -- calls ``run_doctor()``.
    * ``enforce`` -- placeholder; prints a "not implemented" notice.

    With no command, a short usage text is printed and the process exits
    with status 1. An unrecognised command is reported and also exits with
    status 1.
    """
    arguments = sys.argv[1:]

    # Guard: no sub-command given -> show usage and fail.
    if not arguments:
        for usage_line in (
            "Usage: assay <command>",
            "Commands:",
            " doctor Run health checks",
        ):
            print(usage_line)
        sys.exit(1)

    command = arguments[0]

    # Guard: anything outside the known command set is an error.
    if command not in ("doctor", "enforce"):
        print(f"Unknown command: {command}")
        sys.exit(1)

    if command == "doctor":
        run_doctor()
        return

    # Only "enforce" remains: placeholder for future Phase 3.1.
    print("Not implemented yet.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
|
+
|
|
8
|
+
from .result import CompareResult, EvalRun, ResultArtifacts
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RunWriter:
    """Manages persistence of EvalRuns and CompareResults to disk.

    Every run is stored in its own ``<workdir>/runs/<run_id>/`` directory,
    which may contain ``run.json``, ``results.jsonl``, ``junit.xml``,
    ``summary.md``, ``diff.json`` and an ``artifacts.json`` index that maps
    artifact names to absolute paths.
    """

    def __init__(self, workdir: Path) -> None:
        """Remember *workdir* and make sure ``<workdir>/runs`` exists."""
        self.workdir = workdir
        self.runs_dir = workdir / "runs"
        self.runs_dir.mkdir(parents=True, exist_ok=True)

    def write_run(self, run: EvalRun) -> ResultArtifacts:
        """Persist *run* to ``<workdir>/runs/<run_id>/``.

        Writes ``run.json``, ``results.jsonl`` (one JSON object per test),
        ``summary.md`` and the ``artifacts.json`` index. ``junit.xml`` is
        written only when ``run.artifacts.junit_xml`` is truthy.

        Returns:
            A ``ResultArtifacts`` pointing at the files actually written.
            ``junit_xml`` is ``None`` when no JUnit report was produced, so
            callers never receive a path to a file that does not exist.
        """
        run_dir = self.runs_dir / run.run_id
        run_dir.mkdir(parents=True, exist_ok=True)

        run_json = run_dir / "run.json"
        results_jsonl = run_dir / "results.jsonl"
        summary_md = run_dir / "summary.md"
        artifacts_json = run_dir / "artifacts.json"

        self._write_json(run_json, run.to_dict())

        with open(results_jsonl, "w", encoding="utf-8") as f:
            for t in run.tests:
                f.write(json.dumps(t.to_dict()) + "\n")

        # Only advertise junit.xml when it is really written; previously the
        # index and the returned artifacts always referenced it.
        junit_xml: Optional[Path] = None
        if run.artifacts.junit_xml:
            junit_xml = run_dir / "junit.xml"
            with open(junit_xml, "w", encoding="utf-8") as f:
                f.write(run.to_junit_xml())

        with open(summary_md, "w", encoding="utf-8") as f:
            f.write(run.to_github_summary())

        index_data = {
            "run_id": run.run_id,
            "created_at_ms": run.created_at_ms,
            "paths": {
                "run_json": str(run_json.absolute()),
                "results_jsonl": str(results_jsonl.absolute()),
                "junit_xml": (
                    str(junit_xml.absolute()) if junit_xml is not None else None
                ),
                "summary_md": str(summary_md.absolute()),
                "sarif": None,
                "diff_json": None,
                "trace_path": str(run.trace_path.absolute()),
            },
        }
        self._write_json(artifacts_json, index_data)

        # Return a fresh artifacts object reflecting the true on-disk paths;
        # the EvalRun passed in is left untouched.
        return ResultArtifacts(
            run_json=run_json,
            results_jsonl=results_jsonl,
            junit_xml=junit_xml,
            sarif=None,
            diff_json=None,
            trace_path=run.trace_path,
        )

    def write_diff(self, result: CompareResult) -> None:
        """Persist ``diff.json`` for *result* in the current run's directory.

        Does nothing when ``result.current_run_id`` is falsy. Otherwise it
        writes ``diff.json``, overwrites ``summary.md`` with the comparison
        summary, and records the diff path in ``artifacts.json`` (creating a
        minimal index if none exists).
        """
        if not result.current_run_id:
            return
        run_dir = self.runs_dir / result.current_run_id
        # exist_ok makes a separate existence check unnecessary.
        run_dir.mkdir(parents=True, exist_ok=True)

        # 1. diff.json
        diff_file = run_dir / "diff.json"
        self._write_json(diff_file, result.to_dict())

        # 2. Replace summary.md with the comparison details
        summary_path = run_dir / "summary.md"
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(result.to_github_summary())

        # 3. Update artifacts.json
        index_path = run_dir / "artifacts.json"
        if index_path.exists():
            with open(index_path, "r", encoding="utf-8") as f:
                index_data = json.load(f)
        else:
            # Should exist, but fall back to a minimal index.
            index_data = {"run_id": result.current_run_id, "paths": {}}

        # Tolerate an index that lacks the "paths" section instead of
        # raising KeyError.
        index_data.setdefault("paths", {})["diff_json"] = str(diff_file.absolute())
        self._write_json(index_path, index_data)

    def _write_json(self, path: Path, data: Dict[str, Any]) -> None:
        """Write *data* to *path* as indented UTF-8 JSON (non-ASCII kept as-is)."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
|