llm-diff 1.2.0__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_diff-1.2.0 → llm_diff-1.2.2}/PKG-INFO +30 -16
- {llm_diff-1.2.0 → llm_diff-1.2.2}/README.md +24 -11
- llm_diff-1.2.2/llm_diff/__init__.py +51 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/api.py +112 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/cache.py +30 -1
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/diff.py +36 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/judge.py +49 -1
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/pricing.py +14 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/providers.py +29 -1
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/report.py +17 -0
- llm_diff-1.2.2/llm_diff/schema_events.py +622 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/pyproject.toml +6 -5
- llm_diff-1.2.0/llm_diff/__init__.py +0 -26
- {llm_diff-1.2.0 → llm_diff-1.2.2}/.gitignore +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/LICENSE +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/batch.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/cli.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/config.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/metrics.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/multi.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/renderer.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/semantic.py +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/templates/batch_report.html.j2 +0 -0
- {llm_diff-1.2.0 → llm_diff-1.2.2}/llm_diff/templates/report.html.j2 +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llm-diff
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: A CLI tool for comparing LLM outputs — semantically, visually, and at scale
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Repository, https://github.com/
|
|
7
|
-
Project-URL: Bug Tracker, https://github.com/
|
|
8
|
-
Project-URL: Documentation, https://github.com/
|
|
5
|
+
Project-URL: Homepage, https://github.com/veerarag1973/llmdiff
|
|
6
|
+
Project-URL: Repository, https://github.com/veerarag1973/llmdiff
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/veerarag1973/llmdiff/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/veerarag1973/llmdiff/tree/main/docs
|
|
9
9
|
License: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: claude,cli,diff,llm,openai,prompt-testing
|
|
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Testing
|
|
|
24
24
|
Requires-Python: >=3.9
|
|
25
25
|
Requires-Dist: click>=8.1
|
|
26
26
|
Requires-Dist: jinja2>=3.1
|
|
27
|
+
Requires-Dist: llm-toolkit-schema>=1.1.0
|
|
27
28
|
Requires-Dist: openai>=1.14
|
|
28
29
|
Requires-Dist: python-dotenv>=1.0
|
|
29
30
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -46,9 +47,9 @@ Description-Content-Type: text/markdown
|
|
|
46
47
|
|
|
47
48
|
**A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
|
|
48
49
|
|
|
49
|
-
[](https://
|
|
50
|
+
[](https://pypi.org/project/llm-diff/1.2.2/)
|
|
51
|
+
[](https://pypi.org/project/llm-diff/)
|
|
52
|
+
[](https://pypi.org/project/llm-diff/)
|
|
52
53
|
[](https://pypi.org/project/llm-diff/)
|
|
53
54
|
[](LICENSE)
|
|
54
55
|
[](CHANGELOG.md)
|
|
@@ -58,7 +59,9 @@ Description-Content-Type: text/markdown
|
|
|
58
59
|
`llm-diff` calls two LLM models in parallel, diffs their responses word-by-word,
|
|
59
60
|
scores them semantically, and renders results in the terminal or as a
|
|
60
61
|
self-contained HTML report. It scales to batch workloads, caches API responses,
|
|
61
|
-
|
|
62
|
+
gates CI pipelines via `--fail-under`, and emits structured
|
|
63
|
+
[llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/) events for
|
|
64
|
+
observability tooling.
|
|
62
65
|
|
|
63
66
|
## What is llm-diff?
|
|
64
67
|
|
|
@@ -76,13 +79,20 @@ threshold — making it a first-class citizen in CI/CD pipelines.
|
|
|
76
79
|
Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
|
|
77
80
|
multi-model (3–4 model) comparison, and structured JSON diff.
|
|
78
81
|
|
|
82
|
+
Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
|
|
83
|
+
as a built-in observability layer: every comparison, model call, cache lookup,
|
|
84
|
+
cost record, and judge evaluation now emits a validated schema event that can be
|
|
85
|
+
collected in memory, exported to JSONL, or forwarded to any custom backend.
|
|
86
|
+
|
|
79
87
|
## Documentation
|
|
80
88
|
|
|
81
89
|
| Guide | Description |
|
|
82
90
|
|-------|-------------|
|
|
83
91
|
| [Getting Started](docs/getting-started.md) | Installation, API keys, first diff |
|
|
92
|
+
| [Tutorials](docs/tutorials/README.md) | Step-by-step learning path from first run to Python API (12 tutorials) |
|
|
84
93
|
| [CLI Reference](docs/cli-reference.md) | All flags, option groups, exit codes, YAML format |
|
|
85
94
|
| [Python API](docs/api.md) | All public functions, dataclasses, and field descriptions |
|
|
95
|
+
| [Schema Events](docs/schema-events.md) | Observability integration with llm-toolkit-schema |
|
|
86
96
|
| [Configuration](docs/configuration.md) | `.llmdiff` TOML schema, env vars, config priority |
|
|
87
97
|
| [Provider Setup](docs/providers.md) | OpenAI, Groq, Mistral, Ollama, LM Studio, Anthropic |
|
|
88
98
|
| [HTML Reports](docs/html-reports.md) | Report anatomy, batch reports, judge card, cost table |
|
|
@@ -94,6 +104,9 @@ multi-model (3–4 model) comparison, and structured JSON diff.
|
|
|
94
104
|
# Install with semantic scoring support
|
|
95
105
|
pip install "llm-diff[semantic]"
|
|
96
106
|
|
|
107
|
+
# Install with schema-events observability
|
|
108
|
+
pip install "llm-diff[semantic]" llm-toolkit-schema
|
|
109
|
+
|
|
97
110
|
# Set an API key
|
|
98
111
|
export OPENAI_API_KEY="sk-..."
|
|
99
112
|
|
|
@@ -107,18 +120,19 @@ llm-diff "Explain recursion." -a gpt-4o -b gpt-4o-mini --semantic --out report.h
|
|
|
107
120
|
llm-diff --batch prompts.yml -a gpt-4o -b gpt-4o-mini --semantic --fail-under 0.85
|
|
108
121
|
```
|
|
109
122
|
|
|
110
|
-
See [Getting Started](docs/getting-started.md) for
|
|
111
|
-
|
|
112
|
-
|
|
123
|
+
See [Getting Started](docs/getting-started.md) for quick examples, or work through the
|
|
124
|
+
[Tutorials](docs/tutorials/README.md) for a guided learning path covering prompt engineering,
|
|
125
|
+
batch evaluation, CI/CD gating, LLM-as-a-Judge, cost tracking, and the Python API.
|
|
113
126
|
|
|
114
127
|
## Getting Help
|
|
115
128
|
|
|
116
129
|
| | |
|
|
117
130
|
|---|---|
|
|
118
|
-
| **Bug reports** | [Open an issue](https://github.com/
|
|
119
|
-
| **Feature requests** | [Open a feature request](https://github.com/
|
|
120
|
-
| **Questions & discussion** | [GitHub Discussions](https://github.com/
|
|
121
|
-
| **Open issues** | [github.com/
|
|
131
|
+
| **Bug reports** | [Open an issue](https://github.com/veerarag1973/llmdiff/issues/new?labels=bug&template=bug_report.md) |
|
|
132
|
+
| **Feature requests** | [Open a feature request](https://github.com/veerarag1973/llmdiff/issues/new?labels=enhancement&template=feature_request.md) |
|
|
133
|
+
| **Questions & discussion** | [GitHub Discussions](https://github.com/veerarag1973/llmdiff/discussions) |
|
|
134
|
+
| **Open issues** | [github.com/veerarag1973/llmdiff/issues](https://github.com/veerarag1973/llmdiff/issues) |
|
|
135
|
+
| **PyPI project page** | [pypi.org/project/llm-diff](https://pypi.org/project/llm-diff/) |
|
|
122
136
|
| **Roadmap** | [IMPLEMENTATION_PLAN.md](IMPLEMENTATION_PLAN.md) |
|
|
123
137
|
| **Changelog** | [CHANGELOG.md](CHANGELOG.md) |
|
|
124
138
|
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
**A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
|
|
4
4
|
|
|
5
|
-
[](https://
|
|
5
|
+
[](https://pypi.org/project/llm-diff/1.2.2/)
|
|
6
|
+
[](https://pypi.org/project/llm-diff/)
|
|
7
|
+
[](https://pypi.org/project/llm-diff/)
|
|
8
8
|
[](https://pypi.org/project/llm-diff/)
|
|
9
9
|
[](LICENSE)
|
|
10
10
|
[](CHANGELOG.md)
|
|
@@ -14,7 +14,9 @@
|
|
|
14
14
|
`llm-diff` calls two LLM models in parallel, diffs their responses word-by-word,
|
|
15
15
|
scores them semantically, and renders results in the terminal or as a
|
|
16
16
|
self-contained HTML report. It scales to batch workloads, caches API responses,
|
|
17
|
-
|
|
17
|
+
gates CI pipelines via `--fail-under`, and emits structured
|
|
18
|
+
[llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/) events for
|
|
19
|
+
observability tooling.
|
|
18
20
|
|
|
19
21
|
## What is llm-diff?
|
|
20
22
|
|
|
@@ -32,13 +34,20 @@ threshold — making it a first-class citizen in CI/CD pipelines.
|
|
|
32
34
|
Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
|
|
33
35
|
multi-model (3–4 model) comparison, and structured JSON diff.
|
|
34
36
|
|
|
37
|
+
Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
|
|
38
|
+
as a built-in observability layer: every comparison, model call, cache lookup,
|
|
39
|
+
cost record, and judge evaluation now emits a validated schema event that can be
|
|
40
|
+
collected in memory, exported to JSONL, or forwarded to any custom backend.
|
|
41
|
+
|
|
35
42
|
## Documentation
|
|
36
43
|
|
|
37
44
|
| Guide | Description |
|
|
38
45
|
|-------|-------------|
|
|
39
46
|
| [Getting Started](docs/getting-started.md) | Installation, API keys, first diff |
|
|
47
|
+
| [Tutorials](docs/tutorials/README.md) | Step-by-step learning path from first run to Python API (12 tutorials) |
|
|
40
48
|
| [CLI Reference](docs/cli-reference.md) | All flags, option groups, exit codes, YAML format |
|
|
41
49
|
| [Python API](docs/api.md) | All public functions, dataclasses, and field descriptions |
|
|
50
|
+
| [Schema Events](docs/schema-events.md) | Observability integration with llm-toolkit-schema |
|
|
42
51
|
| [Configuration](docs/configuration.md) | `.llmdiff` TOML schema, env vars, config priority |
|
|
43
52
|
| [Provider Setup](docs/providers.md) | OpenAI, Groq, Mistral, Ollama, LM Studio, Anthropic |
|
|
44
53
|
| [HTML Reports](docs/html-reports.md) | Report anatomy, batch reports, judge card, cost table |
|
|
@@ -50,6 +59,9 @@ multi-model (3–4 model) comparison, and structured JSON diff.
|
|
|
50
59
|
# Install with semantic scoring support
|
|
51
60
|
pip install "llm-diff[semantic]"
|
|
52
61
|
|
|
62
|
+
# Install with schema-events observability
|
|
63
|
+
pip install "llm-diff[semantic]" llm-toolkit-schema
|
|
64
|
+
|
|
53
65
|
# Set an API key
|
|
54
66
|
export OPENAI_API_KEY="sk-..."
|
|
55
67
|
|
|
@@ -63,18 +75,19 @@ llm-diff "Explain recursion." -a gpt-4o -b gpt-4o-mini --semantic --out report.h
|
|
|
63
75
|
llm-diff --batch prompts.yml -a gpt-4o -b gpt-4o-mini --semantic --fail-under 0.85
|
|
64
76
|
```
|
|
65
77
|
|
|
66
|
-
See [Getting Started](docs/getting-started.md) for
|
|
67
|
-
|
|
68
|
-
|
|
78
|
+
See [Getting Started](docs/getting-started.md) for quick examples, or work through the
|
|
79
|
+
[Tutorials](docs/tutorials/README.md) for a guided learning path covering prompt engineering,
|
|
80
|
+
batch evaluation, CI/CD gating, LLM-as-a-Judge, cost tracking, and the Python API.
|
|
69
81
|
|
|
70
82
|
## Getting Help
|
|
71
83
|
|
|
72
84
|
| | |
|
|
73
85
|
|---|---|
|
|
74
|
-
| **Bug reports** | [Open an issue](https://github.com/
|
|
75
|
-
| **Feature requests** | [Open a feature request](https://github.com/
|
|
76
|
-
| **Questions & discussion** | [GitHub Discussions](https://github.com/
|
|
77
|
-
| **Open issues** | [github.com/
|
|
86
|
+
| **Bug reports** | [Open an issue](https://github.com/veerarag1973/llmdiff/issues/new?labels=bug&template=bug_report.md) |
|
|
87
|
+
| **Feature requests** | [Open a feature request](https://github.com/veerarag1973/llmdiff/issues/new?labels=enhancement&template=feature_request.md) |
|
|
88
|
+
| **Questions & discussion** | [GitHub Discussions](https://github.com/veerarag1973/llmdiff/discussions) |
|
|
89
|
+
| **Open issues** | [github.com/veerarag1973/llmdiff/issues](https://github.com/veerarag1973/llmdiff/issues) |
|
|
90
|
+
| **PyPI project page** | [pypi.org/project/llm-diff](https://pypi.org/project/llm-diff/) |
|
|
78
91
|
| **Roadmap** | [IMPLEMENTATION_PLAN.md](IMPLEMENTATION_PLAN.md) |
|
|
79
92
|
| **Changelog** | [CHANGELOG.md](CHANGELOG.md) |
|
|
80
93
|
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""llm-diff — CLI tool for comparing LLM outputs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "1.2.2"
|
|
6
|
+
|
|
7
|
+
from llm_diff.api import ComparisonReport, compare, compare_batch, compare_prompts
|
|
8
|
+
from llm_diff.diff import JsonStructDiffResult, json_struct_diff
|
|
9
|
+
from llm_diff.judge import JudgeResult
|
|
10
|
+
from llm_diff.multi import MultiModelReport, PairScore, run_multi_model
|
|
11
|
+
from llm_diff.pricing import CostEstimate
|
|
12
|
+
from llm_diff.schema_events import (
|
|
13
|
+
EventEmitter,
|
|
14
|
+
configure_emitter,
|
|
15
|
+
emit,
|
|
16
|
+
get_emitter,
|
|
17
|
+
make_cache_event,
|
|
18
|
+
make_comparison_completed_event,
|
|
19
|
+
make_comparison_started_event,
|
|
20
|
+
make_cost_recorded_event,
|
|
21
|
+
make_eval_scenario_event,
|
|
22
|
+
make_report_exported_event,
|
|
23
|
+
make_trace_span_event,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"__version__",
|
|
28
|
+
"ComparisonReport",
|
|
29
|
+
"compare",
|
|
30
|
+
"compare_batch",
|
|
31
|
+
"compare_prompts",
|
|
32
|
+
"CostEstimate",
|
|
33
|
+
"JudgeResult",
|
|
34
|
+
"json_struct_diff",
|
|
35
|
+
"JsonStructDiffResult",
|
|
36
|
+
"MultiModelReport",
|
|
37
|
+
"PairScore",
|
|
38
|
+
"run_multi_model",
|
|
39
|
+
# Schema events
|
|
40
|
+
"EventEmitter",
|
|
41
|
+
"configure_emitter",
|
|
42
|
+
"emit",
|
|
43
|
+
"get_emitter",
|
|
44
|
+
"make_cache_event",
|
|
45
|
+
"make_comparison_completed_event",
|
|
46
|
+
"make_comparison_started_event",
|
|
47
|
+
"make_cost_recorded_event",
|
|
48
|
+
"make_eval_scenario_event",
|
|
49
|
+
"make_report_exported_event",
|
|
50
|
+
"make_trace_span_event",
|
|
51
|
+
]
|
|
@@ -196,6 +196,24 @@ async def compare(
|
|
|
196
196
|
"""
|
|
197
197
|
cfg = _resolve_config(config, temperature=temperature, max_tokens=max_tokens, timeout=timeout)
|
|
198
198
|
|
|
199
|
+
# Emit comparison started event (best-effort)
|
|
200
|
+
_started_event_id: str = ""
|
|
201
|
+
try:
|
|
202
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
203
|
+
emit as schema_emit,
|
|
204
|
+
make_comparison_started_event,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
started_evt = make_comparison_started_event(
|
|
208
|
+
model_a=model_a,
|
|
209
|
+
model_b=model_b,
|
|
210
|
+
prompt=prompt,
|
|
211
|
+
)
|
|
212
|
+
schema_emit(started_evt)
|
|
213
|
+
_started_event_id = started_evt.event_id
|
|
214
|
+
except Exception: # noqa: BLE001
|
|
215
|
+
pass
|
|
216
|
+
|
|
199
217
|
comparison = await compare_models(
|
|
200
218
|
prompt_a=prompt,
|
|
201
219
|
prompt_b=prompt,
|
|
@@ -222,6 +240,42 @@ async def compare(
|
|
|
222
240
|
|
|
223
241
|
cost_a, cost_b = _compute_cost(comparison, show_cost=show_cost)
|
|
224
242
|
|
|
243
|
+
# Emit cost recorded events for each model call (best-effort)
|
|
244
|
+
if cost_a is not None:
|
|
245
|
+
try:
|
|
246
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
247
|
+
emit as schema_emit,
|
|
248
|
+
make_cost_recorded_event,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
schema_emit(
|
|
252
|
+
make_cost_recorded_event(
|
|
253
|
+
input_cost=cost_a.prompt_usd,
|
|
254
|
+
output_cost=cost_a.completion_usd,
|
|
255
|
+
total_cost=cost_a.total_usd,
|
|
256
|
+
model=cost_a.model,
|
|
257
|
+
)
|
|
258
|
+
)
|
|
259
|
+
except Exception: # noqa: BLE001
|
|
260
|
+
pass
|
|
261
|
+
if cost_b is not None:
|
|
262
|
+
try:
|
|
263
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
264
|
+
emit as schema_emit,
|
|
265
|
+
make_cost_recorded_event,
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
schema_emit(
|
|
269
|
+
make_cost_recorded_event(
|
|
270
|
+
input_cost=cost_b.prompt_usd,
|
|
271
|
+
output_cost=cost_b.completion_usd,
|
|
272
|
+
total_cost=cost_b.total_usd,
|
|
273
|
+
model=cost_b.model,
|
|
274
|
+
)
|
|
275
|
+
)
|
|
276
|
+
except Exception: # noqa: BLE001
|
|
277
|
+
pass
|
|
278
|
+
|
|
225
279
|
html_report: str | None = None
|
|
226
280
|
if build_html:
|
|
227
281
|
from llm_diff.report import build_report # noqa: PLC0415
|
|
@@ -239,6 +293,26 @@ async def compare(
|
|
|
239
293
|
cost_b=cost_b,
|
|
240
294
|
)
|
|
241
295
|
|
|
296
|
+
# Emit comparison completed event (best-effort)
|
|
297
|
+
try:
|
|
298
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
299
|
+
emit as schema_emit,
|
|
300
|
+
make_comparison_completed_event,
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
schema_emit(
|
|
304
|
+
make_comparison_completed_event(
|
|
305
|
+
model_a=model_a,
|
|
306
|
+
model_b=model_b,
|
|
307
|
+
diff_type="completion",
|
|
308
|
+
completion_diff=diff_result.as_unified_diff() or None,
|
|
309
|
+
similarity_score=diff_result.similarity,
|
|
310
|
+
base_event_id=_started_event_id,
|
|
311
|
+
)
|
|
312
|
+
)
|
|
313
|
+
except Exception: # noqa: BLE001
|
|
314
|
+
pass
|
|
315
|
+
|
|
242
316
|
return ComparisonReport(
|
|
243
317
|
prompt_a=prompt,
|
|
244
318
|
prompt_b=prompt,
|
|
@@ -295,6 +369,24 @@ async def compare_prompts(
|
|
|
295
369
|
"""
|
|
296
370
|
cfg = _resolve_config(config, temperature=temperature, max_tokens=max_tokens, timeout=timeout)
|
|
297
371
|
|
|
372
|
+
# Emit comparison started event (best-effort); the matching completed event below uses diff_type="prompt"
|
|
373
|
+
_started_event_id_p: str = ""
|
|
374
|
+
try:
|
|
375
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
376
|
+
emit as schema_emit,
|
|
377
|
+
make_comparison_started_event,
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
started_evt = make_comparison_started_event(
|
|
381
|
+
model_a=model,
|
|
382
|
+
model_b=model,
|
|
383
|
+
prompt=prompt_a,
|
|
384
|
+
)
|
|
385
|
+
schema_emit(started_evt)
|
|
386
|
+
_started_event_id_p = started_evt.event_id
|
|
387
|
+
except Exception: # noqa: BLE001
|
|
388
|
+
pass
|
|
389
|
+
|
|
298
390
|
comparison = await compare_models(
|
|
299
391
|
prompt_a=prompt_a,
|
|
300
392
|
prompt_b=prompt_b,
|
|
@@ -342,6 +434,26 @@ async def compare_prompts(
|
|
|
342
434
|
cost_b=cost_b,
|
|
343
435
|
)
|
|
344
436
|
|
|
437
|
+
# Emit comparison completed event (prompt diff, best-effort)
|
|
438
|
+
try:
|
|
439
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
440
|
+
emit as schema_emit,
|
|
441
|
+
make_comparison_completed_event,
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
schema_emit(
|
|
445
|
+
make_comparison_completed_event(
|
|
446
|
+
model_a=model,
|
|
447
|
+
model_b=model,
|
|
448
|
+
diff_type="prompt",
|
|
449
|
+
completion_diff=diff_result.as_unified_diff() or None,
|
|
450
|
+
similarity_score=diff_result.similarity,
|
|
451
|
+
base_event_id=_started_event_id_p,
|
|
452
|
+
)
|
|
453
|
+
)
|
|
454
|
+
except Exception: # noqa: BLE001
|
|
455
|
+
pass
|
|
456
|
+
|
|
345
457
|
return ComparisonReport(
|
|
346
458
|
prompt_a=prompt_a,
|
|
347
459
|
prompt_b=prompt_b,
|
|
@@ -146,13 +146,42 @@ class ResultCache:
|
|
|
146
146
|
|
|
147
147
|
path = self._entry_path(key)
|
|
148
148
|
if not path.is_file():
|
|
149
|
+
# Emit cache miss event
|
|
150
|
+
try:
|
|
151
|
+
from llm_diff.schema_events import emit as schema_emit, make_cache_event # noqa: PLC0415
|
|
152
|
+
|
|
153
|
+
schema_emit(
|
|
154
|
+
make_cache_event(
|
|
155
|
+
hit=False,
|
|
156
|
+
cache_key=key[:16],
|
|
157
|
+
backend="disk",
|
|
158
|
+
)
|
|
159
|
+
)
|
|
160
|
+
except Exception: # noqa: BLE001
|
|
161
|
+
pass
|
|
149
162
|
return None
|
|
150
163
|
|
|
151
164
|
try:
|
|
152
165
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
153
166
|
from llm_diff.providers import ModelResponse # noqa: PLC0415
|
|
154
167
|
|
|
155
|
-
|
|
168
|
+
cached_response = ModelResponse(**data)
|
|
169
|
+
|
|
170
|
+
# Emit cache hit event
|
|
171
|
+
try:
|
|
172
|
+
from llm_diff.schema_events import emit as schema_emit, make_cache_event # noqa: PLC0415
|
|
173
|
+
|
|
174
|
+
schema_emit(
|
|
175
|
+
make_cache_event(
|
|
176
|
+
hit=True,
|
|
177
|
+
cache_key=key[:16],
|
|
178
|
+
backend="disk",
|
|
179
|
+
)
|
|
180
|
+
)
|
|
181
|
+
except Exception: # noqa: BLE001
|
|
182
|
+
pass
|
|
183
|
+
|
|
184
|
+
return cached_response
|
|
156
185
|
except Exception: # noqa: BLE001
|
|
157
186
|
logger.warning("Cache entry for key %s is corrupt — ignoring.", key[:8])
|
|
158
187
|
return None
|
|
@@ -61,6 +61,42 @@ class DiffResult:
|
|
|
61
61
|
"word_count_b": self.word_count_b,
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
+
def as_unified_diff(self) -> str:
|
|
65
|
+
"""Return a compact unified-diff string from the chunks.
|
|
66
|
+
|
|
67
|
+
The output is a lightweight diff that summarises the DELETE and INSERT
|
|
68
|
+
segments (an empty string when there are none). It is suitable for embedding in a schema
|
|
69
|
+
:class:`~llm_toolkit_schema.namespaces.diff.DiffPayload`.
|
|
70
|
+
"""
|
|
71
|
+
lines: list[str] = ["--- model_a", "+++ model_b"]
|
|
72
|
+
for chunk in self.chunks:
|
|
73
|
+
if chunk.type == DiffType.DELETE:
|
|
74
|
+
for line in chunk.text.splitlines(keepends=True):
|
|
75
|
+
lines.append(f"-{line}" if line.endswith("\n") else f"-{line}\n")
|
|
76
|
+
elif chunk.type == DiffType.INSERT:
|
|
77
|
+
for line in chunk.text.splitlines(keepends=True):
|
|
78
|
+
lines.append(f"+{line}" if line.endswith("\n") else f"+{line}\n")
|
|
79
|
+
return "".join(lines) if len(lines) > 2 else ""
|
|
80
|
+
|
|
81
|
+
def to_schema_payload(self, base_event_id: str = "") -> dict:
|
|
82
|
+
"""Return a dict conforming to the ``llm.diff.*`` namespace payload.
|
|
83
|
+
|
|
84
|
+
Compatible with
|
|
85
|
+
:class:`~llm_toolkit_schema.namespaces.diff.DiffPayload` field names.
|
|
86
|
+
|
|
87
|
+
Parameters
|
|
88
|
+
----------
|
|
89
|
+
base_event_id:
|
|
90
|
+
ULID of the ``comparison.started`` event this result belongs to.
|
|
91
|
+
"""
|
|
92
|
+
return {
|
|
93
|
+
"base_event_id": base_event_id,
|
|
94
|
+
"diff_type": "completion",
|
|
95
|
+
"prompt_diff": None,
|
|
96
|
+
"completion_diff": self.as_unified_diff() or None,
|
|
97
|
+
"similarity_score": round(self.similarity, 4),
|
|
98
|
+
}
|
|
99
|
+
|
|
64
100
|
|
|
65
101
|
# ---------------------------------------------------------------------------
|
|
66
102
|
# Tokenisation
|
|
@@ -114,6 +114,34 @@ class JudgeResult:
|
|
|
114
114
|
"judge_model": self.judge_model,
|
|
115
115
|
}
|
|
116
116
|
|
|
117
|
+
def to_schema_payload(self) -> dict:
|
|
118
|
+
"""Return a dict conforming to the ``llm.eval.*`` namespace payload.
|
|
119
|
+
|
|
120
|
+
Compatible with
|
|
121
|
+
:class:`~llm_toolkit_schema.namespaces.eval.EvalPayload` field names.
|
|
122
|
+
The ``score`` is reported on the judge prompt's native ``1-10`` scale (the
|
|
123
|
+
mean of ``score_a`` and ``score_b`` when both are set); it is not rescaled.
|
|
124
|
+
"""
|
|
125
|
+
# No normalisation is applied: the judge returns scores on a 1-10 scale, and
|
|
126
|
+
# the raw value is exposed together with ``scale`` so consumers can rescale.
|
|
127
|
+
avg_score: float = 0.0
|
|
128
|
+
scale = "1-10"
|
|
129
|
+
if self.score_a is not None and self.score_b is not None:
|
|
130
|
+
avg_score = (self.score_a + self.score_b) / 2.0
|
|
131
|
+
elif self.score_a is not None:
|
|
132
|
+
avg_score = self.score_a
|
|
133
|
+
elif self.score_b is not None:
|
|
134
|
+
avg_score = self.score_b
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
"evaluator": self.judge_model or "unknown",
|
|
138
|
+
"score": avg_score,
|
|
139
|
+
"scale": scale,
|
|
140
|
+
"label": self.winner,
|
|
141
|
+
"rationale": self.reasoning,
|
|
142
|
+
"criteria": ["accuracy", "completeness", "clarity", "conciseness"],
|
|
143
|
+
}
|
|
144
|
+
|
|
117
145
|
|
|
118
146
|
# ---------------------------------------------------------------------------
|
|
119
147
|
# Parsing helpers
|
|
@@ -263,7 +291,7 @@ async def run_judge(
|
|
|
263
291
|
except (TypeError, ValueError):
|
|
264
292
|
pass
|
|
265
293
|
|
|
266
|
-
|
|
294
|
+
result = JudgeResult(
|
|
267
295
|
winner=winner,
|
|
268
296
|
reasoning=reasoning,
|
|
269
297
|
score_a=score_a,
|
|
@@ -271,3 +299,23 @@ async def run_judge(
|
|
|
271
299
|
judge_model=judge_model,
|
|
272
300
|
raw_response=raw,
|
|
273
301
|
)
|
|
302
|
+
|
|
303
|
+
# Emit schema event for the evaluation
|
|
304
|
+
try:
|
|
305
|
+
from llm_diff.schema_events import make_eval_scenario_event, emit as schema_emit # noqa: PLC0415
|
|
306
|
+
|
|
307
|
+
schema_emit(
|
|
308
|
+
make_eval_scenario_event(
|
|
309
|
+
evaluator=judge_model,
|
|
310
|
+
score=((score_a or 0.0) + (score_b or 0.0)) / 2.0 if (score_a or score_b) else None,
|
|
311
|
+
scale="1-10",
|
|
312
|
+
label=winner,
|
|
313
|
+
rationale=reasoning,
|
|
314
|
+
criteria=["accuracy", "completeness", "clarity", "conciseness"],
|
|
315
|
+
status="passed",
|
|
316
|
+
)
|
|
317
|
+
)
|
|
318
|
+
except Exception: # noqa: BLE001
|
|
319
|
+
pass # schema events are best-effort
|
|
320
|
+
|
|
321
|
+
return result
|
|
@@ -158,6 +158,20 @@ class CostEstimate:
|
|
|
158
158
|
"known_model": self.known_model,
|
|
159
159
|
}
|
|
160
160
|
|
|
161
|
+
def to_schema_payload(self) -> dict:
|
|
162
|
+
"""Return a dict conforming to the ``llm.cost.*`` namespace payload.
|
|
163
|
+
|
|
164
|
+
Compatible with
|
|
165
|
+
:class:`~llm_toolkit_schema.namespaces.cost.CostPayload` field names.
|
|
166
|
+
"""
|
|
167
|
+
return {
|
|
168
|
+
"input_cost": round(self.prompt_usd, 6),
|
|
169
|
+
"output_cost": round(self.completion_usd, 6),
|
|
170
|
+
"total_cost": round(self.total_usd, 6),
|
|
171
|
+
"currency": "USD",
|
|
172
|
+
"pricing_tier": None,
|
|
173
|
+
}
|
|
174
|
+
|
|
161
175
|
@property
|
|
162
176
|
def total_usd_str(self) -> str:
|
|
163
177
|
"""Human-readable cost string (e.g. ``'$0.000250'``)."""
|
|
@@ -146,7 +146,7 @@ async def _call_model(
|
|
|
146
146
|
model,
|
|
147
147
|
)
|
|
148
148
|
|
|
149
|
-
|
|
149
|
+
response_obj = ModelResponse(
|
|
150
150
|
model=model,
|
|
151
151
|
text=text,
|
|
152
152
|
prompt_tokens=usage.prompt_tokens if usage else 0,
|
|
@@ -156,6 +156,34 @@ async def _call_model(
|
|
|
156
156
|
provider=provider_name,
|
|
157
157
|
)
|
|
158
158
|
|
|
159
|
+
# Emit schema trace span event (best-effort — never fails the call)
|
|
160
|
+
try:
|
|
161
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
162
|
+
emit as schema_emit,
|
|
163
|
+
make_trace_span_event,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
schema_emit(
|
|
167
|
+
make_trace_span_event(
|
|
168
|
+
model=model,
|
|
169
|
+
prompt_tokens=response_obj.prompt_tokens,
|
|
170
|
+
completion_tokens=response_obj.completion_tokens,
|
|
171
|
+
total_tokens=response_obj.total_tokens,
|
|
172
|
+
latency_ms=response_obj.latency_ms,
|
|
173
|
+
finish_reason=(
|
|
174
|
+
response.choices[0].finish_reason
|
|
175
|
+
if response.choices
|
|
176
|
+
else None
|
|
177
|
+
),
|
|
178
|
+
stream=False,
|
|
179
|
+
provider=provider_name,
|
|
180
|
+
)
|
|
181
|
+
)
|
|
182
|
+
except Exception: # noqa: BLE001
|
|
183
|
+
pass # schema events are best-effort
|
|
184
|
+
|
|
185
|
+
return response_obj
|
|
186
|
+
|
|
159
187
|
except asyncio.TimeoutError as exc:
|
|
160
188
|
last_exc = exc
|
|
161
189
|
logger.warning(
|
|
@@ -218,6 +218,23 @@ def save_report(html: str, path: Path) -> Path:
|
|
|
218
218
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
219
219
|
path.write_text(html, encoding="utf-8")
|
|
220
220
|
logger.info("Report saved to %s (%d bytes)", path, len(html))
|
|
221
|
+
|
|
222
|
+
# Emit report exported schema event (best-effort)
|
|
223
|
+
try:
|
|
224
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
225
|
+
emit as schema_emit,
|
|
226
|
+
make_report_exported_event,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
schema_emit(
|
|
230
|
+
make_report_exported_event(
|
|
231
|
+
output_path=str(path),
|
|
232
|
+
format="html",
|
|
233
|
+
)
|
|
234
|
+
)
|
|
235
|
+
except Exception: # noqa: BLE001
|
|
236
|
+
pass
|
|
237
|
+
|
|
221
238
|
return path
|
|
222
239
|
|
|
223
240
|
|
|
@@ -0,0 +1,622 @@
|
|
|
1
|
+
"""llm-toolkit-schema integration for llm-diff.
|
|
2
|
+
|
|
3
|
+
This module provides a thin, zero-configuration integration between llm-diff
|
|
4
|
+
and the ``llm-toolkit-schema`` event envelope. Every major operation in the
|
|
5
|
+
diff pipeline — comparison started/completed, model trace spans, cache
|
|
6
|
+
lookups, cost recording, and judge evaluations — now emits a structured,
|
|
7
|
+
schema-validated :class:`~llm_toolkit_schema.Event`.
|
|
8
|
+
|
|
9
|
+
Architecture
|
|
10
|
+
------------
|
|
11
|
+
A module-level :class:`EventEmitter` singleton collects events. By default
|
|
12
|
+
it operates in *sink* mode (events are built and validated but discarded).
|
|
13
|
+
Call :func:`configure_emitter` once at startup to attach an exporter, e.g.::
|
|
14
|
+
|
|
15
|
+
from llm_diff.schema_events import configure_emitter
|
|
16
|
+
from llm_toolkit_schema.export.jsonl import JSONLExporter
|
|
17
|
+
|
|
18
|
+
configure_emitter(exporter=JSONLExporter("events.jsonl"))
|
|
19
|
+
|
|
20
|
+
After that every comparison automatically appends schema-valid events to
|
|
21
|
+
``events.jsonl``.
|
|
22
|
+
|
|
23
|
+
Usage (library)
|
|
24
|
+
---------------
|
|
25
|
+
.. code-block:: python
|
|
26
|
+
|
|
27
|
+
import asyncio
|
|
28
|
+
from llm_diff import compare
|
|
29
|
+
from llm_diff.schema_events import configure_emitter, get_emitter
|
|
30
|
+
from llm_toolkit_schema.export.jsonl import JSONLExporter
|
|
31
|
+
|
|
32
|
+
configure_emitter(exporter=JSONLExporter("events.jsonl"))
|
|
33
|
+
asyncio.run(compare("Explain recursion", model_a="gpt-4o", model_b="claude-3-5-sonnet"))
|
|
34
|
+
events = get_emitter().events # list of Event objects collected in memory
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from __future__ import annotations
|
|
38
|
+
|
|
39
|
+
import dataclasses
|
|
40
|
+
import logging
|
|
41
|
+
import uuid
|
|
42
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
43
|
+
|
|
44
|
+
from llm_diff import __version__
|
|
45
|
+
|
|
46
|
+
logger = logging.getLogger(__name__)
|
|
47
|
+
|
|
48
|
+
# Source string embedded in every emitted event.
|
|
49
|
+
_SOURCE = f"llm-diff@{__version__}"
|
|
50
|
+
|
|
51
|
+
if TYPE_CHECKING:
|
|
52
|
+
from llm_toolkit_schema import Event
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Lazy import helpers — keep startup cost low
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _llm_toolkit() -> Any:
|
|
61
|
+
"""Return the top-level ``llm_toolkit_schema`` module."""
|
|
62
|
+
import llm_toolkit_schema # noqa: PLC0415
|
|
63
|
+
|
|
64
|
+
return llm_toolkit_schema
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _event_cls() -> type:
|
|
68
|
+
return _llm_toolkit().Event
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _tags_cls() -> type:
|
|
72
|
+
return _llm_toolkit().Tags
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _event_type() -> Any:
|
|
76
|
+
return _llm_toolkit().EventType
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _diff_ns() -> Any:
|
|
80
|
+
from llm_toolkit_schema.namespaces import diff as _diff # noqa: PLC0415
|
|
81
|
+
|
|
82
|
+
return _diff
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _trace_ns() -> Any:
|
|
86
|
+
from llm_toolkit_schema.namespaces import trace as _trace # noqa: PLC0415
|
|
87
|
+
|
|
88
|
+
return _trace
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _cache_ns() -> Any:
    """Lazily import and return ``llm_toolkit_schema.namespaces.cache``."""
    from llm_toolkit_schema.namespaces import cache as cache_namespace  # noqa: PLC0415

    return cache_namespace
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _cost_ns() -> Any:
    """Lazily import and return ``llm_toolkit_schema.namespaces.cost``."""
    from llm_toolkit_schema.namespaces import cost as cost_namespace  # noqa: PLC0415

    return cost_namespace
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _eval_ns() -> Any:
    """Lazily import and return ``llm_toolkit_schema.namespaces.eval_``."""
    from llm_toolkit_schema.namespaces import eval_ as eval_namespace  # noqa: PLC0415

    return eval_namespace
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _ulid_or_empty() -> str:
|
|
110
|
+
return str(uuid.uuid4()).replace("-", "")[:26]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# EventEmitter
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class EventEmitter:
    """Collects and optionally exports llm-toolkit-schema :class:`Event` objects.

    Parameters
    ----------
    exporter:
        Either a plain callable taking a single event, or an object exposing
        an ``export(event)`` method (for example a ``JSONLExporter``).  The
        call may return an awaitable; see :meth:`emit` for how it is driven.
        When ``None`` (default) events are only collected in memory (see
        :attr:`events`).
    collect:
        When ``True`` (default), validated events are appended to the
        in-memory :attr:`events` list.  Disable when memory overhead matters
        in long-running processes.
    """

    def __init__(
        self,
        exporter: Callable[[Any], Any] | None = None,
        *,
        collect: bool = True,
    ) -> None:
        self._exporter = exporter
        self._collect = collect
        self._events: list[Any] = []
        # Strong references to in-flight async exports.  The event loop only
        # keeps weak references to tasks, so without this set a scheduled
        # export could be garbage-collected before it finishes.
        self._pending_exports: set[Any] = set()

    @property
    def events(self) -> list[Any]:
        """Snapshot copy of every event collected so far.

        Mutating the returned list does not affect the emitter.
        """
        return list(self._events)

    def emit(self, event: Any) -> None:  # noqa: ANN401
        """Validate and emit *event*.

        The event's ``validate()`` method is called first; if it raises, a
        warning is logged and the event is dropped (neither collected nor
        exported).

        If ``collect=True``, the event is appended to :attr:`events`.
        If an *exporter* is configured, it is called with the event.  When the
        exporter returns an awaitable it is scheduled on the running event
        loop if there is one, otherwise it is run to completion on a
        short-lived private loop.

        Errors during export are logged as warnings and do not propagate.
        """
        # Read the label defensively so the warning paths below cannot raise
        # for objects that lack an ``event_type`` attribute.
        event_type = getattr(event, "event_type", "<unknown>")

        try:
            event.validate()
        except Exception as exc:  # noqa: BLE001
            logger.warning("Schema validation failed for event %s: %s", event_type, exc)
            return

        if self._collect:
            self._events.append(event)

        if self._exporter is None:
            return

        try:
            # Support both object exporters with .export() and plain callables.
            if hasattr(self._exporter, "export"):
                result = self._exporter.export(event)
            else:
                result = self._exporter(event)
            if hasattr(result, "__await__"):
                self._drive_async_export(result, event_type)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Event export failed for %s: %s", event_type, exc)

    def _drive_async_export(self, awaitable: Any, event_type: Any) -> None:
        """Make sure the awaitable returned by an async exporter actually runs."""
        import asyncio  # noqa: PLC0415

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # Synchronous caller: run the export to completion on a private
            # loop so it is neither dropped nor left un-awaited.
            private_loop = asyncio.new_event_loop()
            try:
                private_loop.run_until_complete(awaitable)
            finally:
                private_loop.close()
            return

        # Called from inside a running loop: we cannot block, so schedule the
        # export and report any failure once it completes.
        task = asyncio.ensure_future(awaitable)
        self._pending_exports.add(task)

        def _on_done(finished: Any) -> None:
            self._pending_exports.discard(finished)
            if finished.cancelled():
                return
            failure = finished.exception()
            if failure is not None:
                logger.warning("Event export failed for %s: %s", event_type, failure)

        task.add_done_callback(_on_done)

    def clear(self) -> None:
        """Remove all collected events from memory."""
        self._events.clear()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# ---------------------------------------------------------------------------
|
|
194
|
+
# Global emitter singleton
|
|
195
|
+
# ---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
_emitter: EventEmitter = EventEmitter()
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def get_emitter() -> EventEmitter:
    """Return the module-level :class:`EventEmitter` currently installed."""
    current = _emitter
    return current
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def configure_emitter(
    exporter: Callable[[Any], Any] | None = None,
    *,
    collect: bool = True,
) -> EventEmitter:
    """Install a freshly built :class:`EventEmitter` as the global emitter.

    Intended to be called once at application startup, before any
    comparisons run.  The previous global emitter is replaced by the new
    instance.

    Parameters
    ----------
    exporter:
        Any callable, or object with an ``export`` method, that accepts a
        :class:`~llm_toolkit_schema.Event`.
    collect:
        Whether the new emitter keeps events in memory (default ``True``).

    Returns
    -------
    EventEmitter
        The emitter that is now installed globally.
    """
    global _emitter  # noqa: PLW0603

    replacement = EventEmitter(exporter=exporter, collect=collect)
    _emitter = replacement
    return replacement
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def emit(event: Any) -> None:  # noqa: ANN401
    """Forward *event* to the currently installed global emitter."""
    active = _emitter
    active.emit(event)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# ---------------------------------------------------------------------------
|
|
240
|
+
# Event factory helpers
|
|
241
|
+
# ---------------------------------------------------------------------------
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _make_event(
    event_type_value: str,
    payload: dict[str, Any],
    *,
    trace_id: str | None = None,
    span_id: str | None = None,
    org_id: str | None = None,
    session_id: str | None = None,
    tags: dict[str, str] | None = None,
) -> Any:
    """Assemble a :class:`~llm_toolkit_schema.Event`.

    ``event_type``, ``source`` (the module-level ``_SOURCE`` string) and
    ``payload`` are always passed to the constructor.  Each optional
    identifier is forwarded only when it is not ``None``, and *tags* is
    wrapped in a ``Tags`` instance only when it is a non-empty mapping.
    """
    event_factory = _event_cls()
    tags_factory = _tags_cls()

    event_kwargs: dict[str, Any] = {
        "event_type": event_type_value,
        "source": _SOURCE,
        "payload": payload,
    }

    # Forward only the identifiers the caller actually supplied.
    optional_ids = (
        ("trace_id", trace_id),
        ("span_id", span_id),
        ("org_id", org_id),
        ("session_id", session_id),
    )
    for field_name, field_value in optional_ids:
        if field_value is not None:
            event_kwargs[field_name] = field_value

    if tags:
        event_kwargs["tags"] = tags_factory(**tags)

    return event_factory(**event_kwargs)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ---------------------------------------------------------------------------
|
|
278
|
+
# llm.diff.* — Comparison lifecycle events
|
|
279
|
+
# ---------------------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def make_comparison_started_event(
    *,
    model_a: str,
    model_b: str,
    prompt: str,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``DIFF_COMPARISON_STARTED`` event for a new comparison.

    Parameters
    ----------
    model_a:
        Identifier of the first model (e.g. ``"gpt-4o"``).
    model_b:
        Identifier of the second model (e.g. ``"claude-3-5-sonnet"``).
    prompt:
        The prompt text used for the comparison.  Only its length is
        recorded in the payload (``prompt_length``), not the text itself.
    session_id:
        Optional session identifier for correlation.
    org_id:
        Optional organisation identifier.
    """
    event_types = _event_type()
    started_payload: dict[str, Any] = dict(
        model_a=model_a,
        model_b=model_b,
        prompt_length=len(prompt),
    )
    started_type = event_types.DIFF_COMPARISON_STARTED
    return _make_event(started_type, started_payload, session_id=session_id, org_id=org_id)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def make_comparison_completed_event(
    *,
    model_a: str,
    model_b: str,
    diff_type: str = "word-level",
    prompt_diff: str | None = None,
    completion_diff: str | None = None,
    similarity_score: float | None = None,
    base_event_id: str | None = None,
    model_a_text: str | None = None,
    model_b_text: str | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``DIFF_COMPARISON_COMPLETED`` event from a ``DiffComparisonPayload``.

    The payload's ``source_id`` is *base_event_id* when given, otherwise
    *model_a*; ``target_id`` is *model_b*.  ``diff_result`` carries a single
    ``unified_diff`` entry taken from *completion_diff* if that is non-empty,
    else from *prompt_diff* if that is non-empty, else it is ``None``.
    """
    event_types = _event_type()
    diff_namespace = _diff_ns()

    # The completion diff takes precedence over the prompt diff.
    chosen_diff = completion_diff or prompt_diff
    diff_result: dict[str, Any] | None = (
        {"unified_diff": chosen_diff} if chosen_diff else None
    )

    comparison = diff_namespace.DiffComparisonPayload(
        source_id=base_event_id or model_a,
        target_id=model_b,
        diff_type=diff_type,
        similarity_score=similarity_score,
        source_text=model_a_text,
        target_text=model_b_text,
        diff_result=diff_result,
    )

    return _make_event(
        event_types.DIFF_COMPARISON_COMPLETED,
        dataclasses.asdict(comparison),
        session_id=session_id,
        org_id=org_id,
    )
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def make_report_exported_event(
    *,
    output_path: str,
    format: str = "html",
    comparison_event_id: str = "",
    report_id: str | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``DIFF_REPORT_EXPORTED`` event from a ``DiffReportPayload``.

    A random identifier is generated for *report_id* and for
    *comparison_event_id* whenever the caller leaves them empty or ``None``.
    *output_path* is stored in the payload as ``export_path``.
    """
    event_types = _event_type()
    diff_namespace = _diff_ns()

    effective_report_id = report_id or _ulid_or_empty()
    effective_comparison_id = comparison_event_id or _ulid_or_empty()

    report = diff_namespace.DiffReportPayload(
        report_id=effective_report_id,
        comparison_event_id=effective_comparison_id,
        format=format,
        export_path=output_path,
    )
    report_payload = dataclasses.asdict(report)

    return _make_event(
        event_types.DIFF_REPORT_EXPORTED,
        report_payload,
        session_id=session_id,
        org_id=org_id,
    )
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# ---------------------------------------------------------------------------
|
|
391
|
+
# llm.trace.* — Model span events
|
|
392
|
+
# ---------------------------------------------------------------------------
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def make_trace_span_event(
    *,
    model: str,
    prompt_tokens: int,
    completion_tokens: int,
    total_tokens: int | None = None,
    latency_ms: float,
    finish_reason: str | None = None,
    stream: bool = False,
    provider: str | None = None,
    cost_usd: float | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``TRACE_SPAN_COMPLETED`` event from a ``SpanCompletedPayload``.

    Parameters
    ----------
    model:
        Model identifier string (e.g. ``"gpt-4o"``).
    prompt_tokens:
        Number of input tokens consumed.
    completion_tokens:
        Number of output tokens generated.
    total_tokens:
        Total token count; computed as prompt + completion when ``None``.
    latency_ms:
        Request latency in milliseconds, stored as the span's ``duration_ms``.
    finish_reason:
        Provider finish reason string (``"stop"``, ``"length"``, etc.).  The
        span status is ``"error"`` only when this equals ``"error"``.
    stream:
        Whether the response was streamed.
    provider:
        Provider name (``"openai"``, ``"anthropic"``, etc.).  Recorded as
        ``"unknown"`` in the model info when missing; ``provider``/``model``
        tags are attached only when it is given.
    cost_usd:
        Optional cost of the call, passed through to the payload.
    session_id:
        Optional session identifier for correlation.
    org_id:
        Optional organisation identifier.
    """
    event_types = _event_type()
    trace_namespace = _trace_ns()

    if total_tokens is None:
        token_total = prompt_tokens + completion_tokens
    else:
        token_total = total_tokens

    usage = trace_namespace.TokenUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=token_total,
    )
    model_details = trace_namespace.ModelInfo(
        name=model,
        provider=provider or "unknown",
        version=None,
    )

    if finish_reason != "error":
        span_status = "ok"
    else:
        span_status = "error"

    span = trace_namespace.SpanCompletedPayload(
        span_name="llm-diff-model-call",
        status=span_status,
        duration_ms=latency_ms,
        model=model_details,
        token_usage=usage,
        cost_usd=cost_usd,
    )

    # finish_reason and stream are added on top of the serialised payload.
    span_payload = dataclasses.asdict(span)
    span_payload.update(finish_reason=finish_reason, stream=stream)

    span_tags: dict[str, str] | None = (
        {"provider": provider, "model": model} if provider else None
    )

    return _make_event(
        event_types.TRACE_SPAN_COMPLETED,
        span_payload,
        session_id=session_id,
        org_id=org_id,
        tags=span_tags,
    )
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
# ---------------------------------------------------------------------------
|
|
470
|
+
# llm.cache.* — Cache hit/miss events
|
|
471
|
+
# ---------------------------------------------------------------------------
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def make_cache_event(
    *,
    hit: bool,
    cache_key: str | None = None,
    ttl_seconds: int | None = None,
    backend: str = "disk",
    latency_ms: float | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create a ``CACHE_HIT`` or ``CACHE_MISS`` event.

    Parameters
    ----------
    hit:
        Truthy → ``CacheHitPayload`` / ``CACHE_HIT``;
        falsy → ``CacheMissPayload`` / ``CACHE_MISS``.
    cache_key:
        Opaque cache key used for the lookup; recorded as ``"unknown"`` when
        empty or ``None``.
    ttl_seconds:
        Time-to-live of the cached entry, if known.  Only used for hits.
    backend:
        Cache backend name (default ``"disk"``), stored as ``cache_store``.
    latency_ms:
        Cache lookup latency in milliseconds; added to the payload only when
        not ``None``.
    session_id:
        Optional session identifier for correlation.
    org_id:
        Optional organisation identifier.
    """
    event_types = _event_type()
    cache_namespace = _cache_ns()

    if not hit:
        record = cache_namespace.CacheMissPayload(
            cache_key_hash=cache_key or "unknown",
            cache_store=backend,
        )
        chosen_type = event_types.CACHE_MISS
    else:
        record = cache_namespace.CacheHitPayload(
            cache_key_hash=cache_key or "unknown",
            cache_store=backend,
            ttl_seconds=ttl_seconds,
        )
        chosen_type = event_types.CACHE_HIT

    cache_payload = dataclasses.asdict(record)
    if latency_ms is not None:
        cache_payload["latency_ms"] = latency_ms

    return _make_event(chosen_type, cache_payload, session_id=session_id, org_id=org_id)
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
# ---------------------------------------------------------------------------
|
|
523
|
+
# llm.cost.* — Cost recorded events
|
|
524
|
+
# ---------------------------------------------------------------------------
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def make_cost_recorded_event(
    *,
    input_cost: float,
    output_cost: float,
    total_cost: float,
    currency: str = "USD",
    pricing_tier: str | None = None,
    model: str | None = None,
    provider: str | None = None,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
    total_tokens: int = 0,
    span_event_id: str | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``COST_RECORDED`` event from a ``CostRecordedPayload``.

    *total_cost* becomes the payload's ``cost_usd``; *input_cost* and
    *output_cost* are added as ``input_cost_usd`` / ``output_cost_usd``.
    Missing *model* / *provider* are recorded as ``"unknown"``, a missing
    *span_event_id* is replaced by a random identifier, and a zero
    *total_tokens* falls back to prompt + completion tokens.
    ``pricing_tier`` is included only when not ``None``.
    """
    event_types = _event_type()
    cost_namespace = _cost_ns()

    linked_span_id = span_event_id or _ulid_or_empty()
    token_total = total_tokens or (prompt_tokens + completion_tokens)

    cost_record = cost_namespace.CostRecordedPayload(
        span_event_id=linked_span_id,
        model_name=model or "unknown",
        provider=provider or "unknown",
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=token_total,
        cost_usd=total_cost,
        currency=currency,
    )

    # The per-direction cost breakdown is added on top of the serialised payload.
    cost_payload = dataclasses.asdict(cost_record)
    cost_payload.update(input_cost_usd=input_cost, output_cost_usd=output_cost)
    if pricing_tier is not None:
        cost_payload["pricing_tier"] = pricing_tier

    return _make_event(
        event_types.COST_RECORDED,
        cost_payload,
        session_id=session_id,
        org_id=org_id,
    )
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
# ---------------------------------------------------------------------------
|
|
567
|
+
# llm.eval.* — Judge / evaluation events
|
|
568
|
+
# ---------------------------------------------------------------------------
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def make_eval_scenario_event(
    *,
    evaluator: str,
    score: float | None = None,
    scale: str = "1-10",
    label: str | None = None,
    rationale: str | None = None,
    criteria: list[str] | None = None,
    status: str = "passed",
    duration_ms: float | None = None,
    baseline_score: float | None = None,
    session_id: str | None = None,
    org_id: str | None = None,
) -> Any:
    """Create the ``EVAL_SCENARIO_COMPLETED`` event from an ``EvalScenarioPayload``.

    The scenario name is ``llm-diff/<evaluator>`` with ``/<label>`` appended
    when a label is given, and each event gets a fresh random ``scenario_id``.
    When a score is present, ``metrics`` maps every entry of *criteria* to
    that score, or is ``{"score": score}`` if no criteria were supplied.
    ``scale`` is always added to the payload; ``rationale`` and ``label``
    only when non-empty.

    Parameters
    ----------
    status:
        Must be ``"passed"``, ``"failed"``, or ``"skipped"``.
    """
    event_types = _event_type()
    eval_namespace = _eval_ns()

    metric_map: dict[str, float] | None = None
    if score is not None:
        if criteria:
            # Every criterion receives the same overall score.
            metric_map = dict.fromkeys(criteria, score)
        else:
            metric_map = {"score": score}

    scenario_label = f"llm-diff/{evaluator}"
    if label:
        scenario_label = f"{scenario_label}/{label}"

    scenario = eval_namespace.EvalScenarioPayload(
        scenario_id=_ulid_or_empty(),
        scenario_name=scenario_label,
        status=status,
        score=score,
        metrics=metric_map,
        baseline_score=baseline_score,
        duration_ms=duration_ms,
    )

    eval_payload = dataclasses.asdict(scenario)
    eval_payload["scale"] = scale
    for extra_key, extra_value in (("rationale", rationale), ("label", label)):
        if extra_value:
            eval_payload[extra_key] = extra_value

    return _make_event(
        event_types.EVAL_SCENARIO_COMPLETED,
        eval_payload,
        session_id=session_id,
        org_id=org_id,
    )
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "llm-diff"
|
|
7
|
-
version = "1.2.
|
|
7
|
+
version = "1.2.2"
|
|
8
8
|
description = "A CLI tool for comparing LLM outputs — semantically, visually, and at scale"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -32,6 +32,7 @@ dependencies = [
|
|
|
32
32
|
"tomli>=2.0; python_version < '3.11'",
|
|
33
33
|
"jinja2>=3.1",
|
|
34
34
|
"pyyaml>=6.0",
|
|
35
|
+
"llm-toolkit-schema>=1.1.0",
|
|
35
36
|
]
|
|
36
37
|
|
|
37
38
|
[project.optional-dependencies]
|
|
@@ -53,10 +54,10 @@ dev = [
|
|
|
53
54
|
llm-diff = "llm_diff.cli:main"
|
|
54
55
|
|
|
55
56
|
[project.urls]
|
|
56
|
-
Homepage = "https://github.com/
|
|
57
|
-
Repository = "https://github.com/
|
|
58
|
-
"Bug Tracker" = "https://github.com/
|
|
59
|
-
Documentation = "https://github.com/
|
|
57
|
+
Homepage = "https://github.com/veerarag1973/llmdiff"
|
|
58
|
+
Repository = "https://github.com/veerarag1973/llmdiff"
|
|
59
|
+
"Bug Tracker" = "https://github.com/veerarag1973/llmdiff/issues"
|
|
60
|
+
Documentation = "https://github.com/veerarag1973/llmdiff/tree/main/docs"
|
|
60
61
|
|
|
61
62
|
[tool.hatch.build.targets.wheel]
|
|
62
63
|
packages = ["llm_diff"]
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
"""llm-diff — CLI tool for comparing LLM outputs."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
__version__ = "1.2.0"
|
|
6
|
-
|
|
7
|
-
from llm_diff.api import ComparisonReport, compare, compare_batch, compare_prompts
|
|
8
|
-
from llm_diff.diff import JsonStructDiffResult, json_struct_diff
|
|
9
|
-
from llm_diff.judge import JudgeResult
|
|
10
|
-
from llm_diff.multi import MultiModelReport, PairScore, run_multi_model
|
|
11
|
-
from llm_diff.pricing import CostEstimate
|
|
12
|
-
|
|
13
|
-
__all__ = [
|
|
14
|
-
"__version__",
|
|
15
|
-
"ComparisonReport",
|
|
16
|
-
"compare",
|
|
17
|
-
"compare_batch",
|
|
18
|
-
"compare_prompts",
|
|
19
|
-
"CostEstimate",
|
|
20
|
-
"JudgeResult",
|
|
21
|
-
"json_struct_diff",
|
|
22
|
-
"JsonStructDiffResult",
|
|
23
|
-
"MultiModelReport",
|
|
24
|
-
"PairScore",
|
|
25
|
-
"run_multi_model",
|
|
26
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|