llm-diff 1.2.2__tar.gz → 1.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_diff-1.2.2 → llm_diff-1.2.3}/PKG-INFO +11 -5
- {llm_diff-1.2.2 → llm_diff-1.2.3}/README.md +10 -4
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/__init__.py +3 -1
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/cli.py +38 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/schema_events.py +52 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/pyproject.toml +1 -1
- {llm_diff-1.2.2 → llm_diff-1.2.3}/.gitignore +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/LICENSE +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/api.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/batch.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/cache.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/config.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/diff.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/judge.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/metrics.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/multi.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/pricing.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/providers.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/renderer.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/report.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/semantic.py +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/templates/batch_report.html.j2 +0 -0
- {llm_diff-1.2.2 → llm_diff-1.2.3}/llm_diff/templates/report.html.j2 +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llm-diff
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.3
|
|
4
4
|
Summary: A CLI tool for comparing LLM outputs — semantically, visually, and at scale
|
|
5
5
|
Project-URL: Homepage, https://github.com/veerarag1973/llmdiff
|
|
6
6
|
Project-URL: Repository, https://github.com/veerarag1973/llmdiff
|
|
@@ -47,8 +47,8 @@ Description-Content-Type: text/markdown
|
|
|
47
47
|
|
|
48
48
|
**A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
|
|
49
49
|
|
|
50
|
-
[](https://pypi.org/project/llm-diff/1.2.3/)
|
|
51
|
+
[](https://pypi.org/project/llm-diff/)
|
|
52
52
|
[](https://pypi.org/project/llm-diff/)
|
|
53
53
|
[](https://pypi.org/project/llm-diff/)
|
|
54
54
|
[](LICENSE)
|
|
@@ -79,10 +79,16 @@ threshold — making it a first-class citizen in CI/CD pipelines.
|
|
|
79
79
|
Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
|
|
80
80
|
multi-model (3–4 model) comparison, and structured JSON diff.
|
|
81
81
|
|
|
82
|
+
Version 1.2.3 adds `EVAL_REGRESSION_FAILED` schema event emission — `--fail-under`
|
|
83
|
+
gate failures now emit a structured `llm.eval.regression.failed` event (via
|
|
84
|
+
`make_eval_regression_event()`) in addition to returning exit code 1,
|
|
85
|
+
providing a full audit trail for CI regression gates.
|
|
86
|
+
|
|
82
87
|
Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
|
|
83
88
|
as a built-in observability layer: every comparison, model call, cache lookup,
|
|
84
|
-
cost record,
|
|
85
|
-
collected in memory, exported to JSONL, or
|
|
89
|
+
cost record, judge evaluation, and `--fail-under` regression failure now emits a
|
|
90
|
+
validated schema event that can be collected in memory, exported to JSONL, or
|
|
91
|
+
forwarded to any custom backend.
|
|
86
92
|
|
|
87
93
|
## Documentation
|
|
88
94
|
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
**A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
|
|
4
4
|
|
|
5
|
-
[](https://pypi.org/project/llm-diff/1.2.3/)
|
|
6
|
+
[](https://pypi.org/project/llm-diff/)
|
|
7
7
|
[](https://pypi.org/project/llm-diff/)
|
|
8
8
|
[](https://pypi.org/project/llm-diff/)
|
|
9
9
|
[](LICENSE)
|
|
@@ -34,10 +34,16 @@ threshold — making it a first-class citizen in CI/CD pipelines.
|
|
|
34
34
|
Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
|
|
35
35
|
multi-model (3–4 model) comparison, and structured JSON diff.
|
|
36
36
|
|
|
37
|
+
Version 1.2.3 adds `EVAL_REGRESSION_FAILED` schema event emission — `--fail-under`
|
|
38
|
+
gate failures now emit a structured `llm.eval.regression.failed` event (via
|
|
39
|
+
`make_eval_regression_event()`) in addition to returning exit code 1,
|
|
40
|
+
providing a full audit trail for CI regression gates.
|
|
41
|
+
|
|
37
42
|
Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
|
|
38
43
|
as a built-in observability layer: every comparison, model call, cache lookup,
|
|
39
|
-
cost record,
|
|
40
|
-
collected in memory, exported to JSONL, or
|
|
44
|
+
cost record, judge evaluation, and `--fail-under` regression failure now emits a
|
|
45
|
+
validated schema event that can be collected in memory, exported to JSONL, or
|
|
46
|
+
forwarded to any custom backend.
|
|
41
47
|
|
|
42
48
|
## Documentation
|
|
43
49
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
__version__ = "1.2.2"
|
|
5
|
+
__version__ = "1.2.3"
|
|
6
6
|
|
|
7
7
|
from llm_diff.api import ComparisonReport, compare, compare_batch, compare_prompts
|
|
8
8
|
from llm_diff.diff import JsonStructDiffResult, json_struct_diff
|
|
@@ -18,6 +18,7 @@ from llm_diff.schema_events import (
|
|
|
18
18
|
make_comparison_completed_event,
|
|
19
19
|
make_comparison_started_event,
|
|
20
20
|
make_cost_recorded_event,
|
|
21
|
+
make_eval_regression_event,
|
|
21
22
|
make_eval_scenario_event,
|
|
22
23
|
make_report_exported_event,
|
|
23
24
|
make_trace_span_event,
|
|
@@ -45,6 +46,7 @@ __all__ = [
|
|
|
45
46
|
"make_comparison_completed_event",
|
|
46
47
|
"make_comparison_started_event",
|
|
47
48
|
"make_cost_recorded_event",
|
|
49
|
+
"make_eval_regression_event",
|
|
48
50
|
"make_eval_scenario_event",
|
|
49
51
|
"make_report_exported_event",
|
|
50
52
|
"make_trace_span_event",
|
|
@@ -600,6 +600,28 @@ async def _run_batch(
|
|
|
600
600
|
f"[bold red]--fail-under {fail_under:.2f}: "
|
|
601
601
|
f"{len(failing)}/{len(batch_results)} item(s) below threshold.[/bold red]"
|
|
602
602
|
)
|
|
603
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
604
|
+
emit as schema_emit,
|
|
605
|
+
make_eval_regression_event,
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
for _r in failing:
|
|
609
|
+
_score = (
|
|
610
|
+
_r.semantic_score
|
|
611
|
+
if _r.semantic_score is not None
|
|
612
|
+
else _r.diff_result.similarity
|
|
613
|
+
)
|
|
614
|
+
schema_emit(
|
|
615
|
+
make_eval_regression_event(
|
|
616
|
+
scenario_name="llm-diff/fail-under/batch",
|
|
617
|
+
current_score=_score,
|
|
618
|
+
baseline_score=float(fail_under),
|
|
619
|
+
threshold=float(fail_under),
|
|
620
|
+
metrics={"similarity": _r.diff_result.similarity}
|
|
621
|
+
if _r.semantic_score is not None
|
|
622
|
+
else None,
|
|
623
|
+
)
|
|
624
|
+
)
|
|
603
625
|
sys.exit(1)
|
|
604
626
|
|
|
605
627
|
if out:
|
|
@@ -787,6 +809,22 @@ async def _run_diff(
|
|
|
787
809
|
f"[bold red]--fail-under {fail_under:.2f}: "
|
|
788
810
|
f"score {primary:.4f} is below threshold.[/bold red]"
|
|
789
811
|
)
|
|
812
|
+
from llm_diff.schema_events import ( # noqa: PLC0415
|
|
813
|
+
emit as schema_emit,
|
|
814
|
+
make_eval_regression_event,
|
|
815
|
+
)
|
|
816
|
+
|
|
817
|
+
schema_emit(
|
|
818
|
+
make_eval_regression_event(
|
|
819
|
+
scenario_name="llm-diff/fail-under/single",
|
|
820
|
+
current_score=float(primary),
|
|
821
|
+
baseline_score=float(fail_under),
|
|
822
|
+
threshold=float(fail_under),
|
|
823
|
+
metrics={"similarity": float(diff_result.similarity)}
|
|
824
|
+
if semantic_score is not None
|
|
825
|
+
else None,
|
|
826
|
+
)
|
|
827
|
+
)
|
|
790
828
|
sys.exit(1)
|
|
791
829
|
|
|
792
830
|
# ── Save HTML report ─────────────────────────────────────────────────────
|
|
@@ -620,3 +620,55 @@ def make_eval_scenario_event(
|
|
|
620
620
|
return _make_event(
|
|
621
621
|
ET.EVAL_SCENARIO_COMPLETED, payload, session_id=session_id, org_id=org_id
|
|
622
622
|
)
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def make_eval_regression_event(
|
|
626
|
+
*,
|
|
627
|
+
scenario_name: str,
|
|
628
|
+
current_score: float,
|
|
629
|
+
baseline_score: float,
|
|
630
|
+
threshold: float,
|
|
631
|
+
metrics: dict[str, float] | None = None,
|
|
632
|
+
session_id: str | None = None,
|
|
633
|
+
org_id: str | None = None,
|
|
634
|
+
) -> Any:
|
|
635
|
+
"""Build a ``llm.eval.regression.failed`` event with EvalRegressionPayload.
|
|
636
|
+
|
|
637
|
+
Emitted when the ``--fail-under`` threshold is not met, indicating that
|
|
638
|
+
the primary similarity/semantic score has regressed below the minimum
|
|
639
|
+
acceptable level.
|
|
640
|
+
|
|
641
|
+
Parameters
|
|
642
|
+
----------
|
|
643
|
+
scenario_name:
|
|
644
|
+
Human-readable name for the scenario that triggered the regression,
|
|
645
|
+
e.g. ``"llm-diff/fail-under/batch"`` or ``"llm-diff/fail-under/single"``.
|
|
646
|
+
current_score:
|
|
647
|
+
The actual similarity or semantic score that was measured.
|
|
648
|
+
baseline_score:
|
|
649
|
+
The minimum acceptable score (i.e. the ``--fail-under`` value).
|
|
650
|
+
threshold:
|
|
651
|
+
The ``--fail-under`` threshold value (same as *baseline_score* here).
|
|
652
|
+
metrics:
|
|
653
|
+
Optional mapping of metric names to values for richer diagnostics.
|
|
654
|
+
session_id:
|
|
655
|
+
Optional session identifier for correlation.
|
|
656
|
+
org_id:
|
|
657
|
+
Optional organisation identifier.
|
|
658
|
+
"""
|
|
659
|
+
ET = _event_type()
|
|
660
|
+
ns = _eval_ns()
|
|
661
|
+
|
|
662
|
+
payload_obj = ns.EvalRegressionPayload(
|
|
663
|
+
scenario_id=_ulid_or_empty(),
|
|
664
|
+
scenario_name=scenario_name,
|
|
665
|
+
current_score=current_score,
|
|
666
|
+
baseline_score=baseline_score,
|
|
667
|
+
regression_delta=baseline_score - current_score,
|
|
668
|
+
threshold=threshold,
|
|
669
|
+
metrics=metrics,
|
|
670
|
+
)
|
|
671
|
+
payload = dataclasses.asdict(payload_obj)
|
|
672
|
+
return _make_event(
|
|
673
|
+
ET.EVAL_REGRESSION_FAILED, payload, session_id=session_id, org_id=org_id
|
|
674
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|