llm-diff 1.2.2__tar.gz → 1.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llm-diff
3
- Version: 1.2.2
3
+ Version: 1.2.3
4
4
  Summary: A CLI tool for comparing LLM outputs — semantically, visually, and at scale
5
5
  Project-URL: Homepage, https://github.com/veerarag1973/llmdiff
6
6
  Project-URL: Repository, https://github.com/veerarag1973/llmdiff
@@ -47,8 +47,8 @@ Description-Content-Type: text/markdown
47
47
 
48
48
  **A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
49
49
 
50
- [![PyPI](https://img.shields.io/badge/PyPI-1.2.2-blue?logo=pypi&logoColor=white)](https://pypi.org/project/llm-diff/1.2.2/)
51
- [![Tests](https://img.shields.io/badge/tests-715%20passed-brightgreen)](https://pypi.org/project/llm-diff/)
50
+ [![PyPI](https://img.shields.io/badge/PyPI-1.2.3-blue?logo=pypi&logoColor=white)](https://pypi.org/project/llm-diff/1.2.3/)
51
+ [![Tests](https://img.shields.io/badge/tests-722%20passed-brightgreen)](https://pypi.org/project/llm-diff/)
52
52
  [![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)](https://pypi.org/project/llm-diff/)
53
53
  [![Python](https://img.shields.io/pypi/pyversions/llm-diff)](https://pypi.org/project/llm-diff/)
54
54
  [![License](https://img.shields.io/pypi/l/llm-diff)](LICENSE)
@@ -79,10 +79,16 @@ threshold — making it a first-class citizen in CI/CD pipelines.
79
79
  Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
80
80
  multi-model (3–4 model) comparison, and structured JSON diff.
81
81
 
82
+ Version 1.2.3 adds `EVAL_REGRESSION_FAILED` schema event emission — `--fail-under`
83
+ gate failures now emit a structured `llm.eval.regression.failed` event (via
84
+ `make_eval_regression_event()`) in addition to returning exit code 1,
85
+ providing a full audit trail for CI regression gates.
86
+
82
87
  Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
83
88
  as a built-in observability layer: every comparison, model call, cache lookup,
84
- cost record, and judge evaluation now emits a validated schema event that can be
85
- collected in memory, exported to JSONL, or forwarded to any custom backend.
89
+ cost record, judge evaluation, and (as of 1.2.3) `--fail-under` regression failure
90
+ now emits a validated schema event that can be collected in memory, exported to
91
+ JSONL, or forwarded to any custom backend.
86
92
 
87
93
  ## Documentation
88
94
 
@@ -2,8 +2,8 @@
2
2
 
3
3
  **A CLI tool and Python library for comparing LLM outputs — semantically, visually, and at scale.**
4
4
 
5
- [![PyPI](https://img.shields.io/badge/PyPI-1.2.2-blue?logo=pypi&logoColor=white)](https://pypi.org/project/llm-diff/1.2.2/)
6
- [![Tests](https://img.shields.io/badge/tests-715%20passed-brightgreen)](https://pypi.org/project/llm-diff/)
5
+ [![PyPI](https://img.shields.io/badge/PyPI-1.2.3-blue?logo=pypi&logoColor=white)](https://pypi.org/project/llm-diff/1.2.3/)
6
+ [![Tests](https://img.shields.io/badge/tests-722%20passed-brightgreen)](https://pypi.org/project/llm-diff/)
7
7
  [![Coverage](https://img.shields.io/badge/coverage-100%25-brightgreen)](https://pypi.org/project/llm-diff/)
8
8
  [![Python](https://img.shields.io/pypi/pyversions/llm-diff)](https://pypi.org/project/llm-diff/)
9
9
  [![License](https://img.shields.io/pypi/l/llm-diff)](LICENSE)
@@ -34,10 +34,16 @@ threshold — making it a first-class citizen in CI/CD pipelines.
34
34
  Version 1.2 adds LLM-as-a-Judge scoring, per-call USD cost tracking,
35
35
  multi-model (3–4 model) comparison, and structured JSON diff.
36
36
 
37
+ Version 1.2.3 adds `EVAL_REGRESSION_FAILED` schema event emission — `--fail-under`
38
+ gate failures now emit a structured `llm.eval.regression.failed` event (via
39
+ `make_eval_regression_event()`) in addition to returning exit code 1,
40
+ providing a full audit trail for CI regression gates.
41
+
37
42
  Version 1.2.2 integrates [llm-toolkit-schema](https://pypi.org/project/llm-toolkit-schema/)
38
43
  as a built-in observability layer: every comparison, model call, cache lookup,
39
- cost record, and judge evaluation now emits a validated schema event that can be
40
- collected in memory, exported to JSONL, or forwarded to any custom backend.
44
+ cost record, judge evaluation, and (as of 1.2.3) `--fail-under` regression failure
45
+ now emits a validated schema event that can be collected in memory, exported to
46
+ JSONL, or forwarded to any custom backend.
41
47
 
42
48
  ## Documentation
43
49
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "1.2.2"
5
+ __version__ = "1.2.3"
6
6
 
7
7
  from llm_diff.api import ComparisonReport, compare, compare_batch, compare_prompts
8
8
  from llm_diff.diff import JsonStructDiffResult, json_struct_diff
@@ -18,6 +18,7 @@ from llm_diff.schema_events import (
18
18
  make_comparison_completed_event,
19
19
  make_comparison_started_event,
20
20
  make_cost_recorded_event,
21
+ make_eval_regression_event,
21
22
  make_eval_scenario_event,
22
23
  make_report_exported_event,
23
24
  make_trace_span_event,
@@ -45,6 +46,7 @@ __all__ = [
45
46
  "make_comparison_completed_event",
46
47
  "make_comparison_started_event",
47
48
  "make_cost_recorded_event",
49
+ "make_eval_regression_event",
48
50
  "make_eval_scenario_event",
49
51
  "make_report_exported_event",
50
52
  "make_trace_span_event",
@@ -600,6 +600,28 @@ async def _run_batch(
600
600
  f"[bold red]--fail-under {fail_under:.2f}: "
601
601
  f"{len(failing)}/{len(batch_results)} item(s) below threshold.[/bold red]"
602
602
  )
603
+ from llm_diff.schema_events import ( # noqa: PLC0415
604
+ emit as schema_emit,
605
+ make_eval_regression_event,
606
+ )
607
+
608
+ for _r in failing:
609
+ _score = (
610
+ _r.semantic_score
611
+ if _r.semantic_score is not None
612
+ else _r.diff_result.similarity
613
+ )
614
+ schema_emit(
615
+ make_eval_regression_event(
616
+ scenario_name="llm-diff/fail-under/batch",
617
+ current_score=_score,
618
+ baseline_score=float(fail_under),
619
+ threshold=float(fail_under),
620
+ metrics={"similarity": _r.diff_result.similarity}
621
+ if _r.semantic_score is not None
622
+ else None,
623
+ )
624
+ )
603
625
  sys.exit(1)
604
626
 
605
627
  if out:
@@ -787,6 +809,22 @@ async def _run_diff(
787
809
  f"[bold red]--fail-under {fail_under:.2f}: "
788
810
  f"score {primary:.4f} is below threshold.[/bold red]"
789
811
  )
812
+ from llm_diff.schema_events import ( # noqa: PLC0415
813
+ emit as schema_emit,
814
+ make_eval_regression_event,
815
+ )
816
+
817
+ schema_emit(
818
+ make_eval_regression_event(
819
+ scenario_name="llm-diff/fail-under/single",
820
+ current_score=float(primary),
821
+ baseline_score=float(fail_under),
822
+ threshold=float(fail_under),
823
+ metrics={"similarity": float(diff_result.similarity)}
824
+ if semantic_score is not None
825
+ else None,
826
+ )
827
+ )
790
828
  sys.exit(1)
791
829
 
792
830
  # ── Save HTML report ─────────────────────────────────────────────────────
@@ -620,3 +620,55 @@ def make_eval_scenario_event(
620
620
  return _make_event(
621
621
  ET.EVAL_SCENARIO_COMPLETED, payload, session_id=session_id, org_id=org_id
622
622
  )
623
+
624
+
625
+ def make_eval_regression_event(
626
+ *,
627
+ scenario_name: str,
628
+ current_score: float,
629
+ baseline_score: float,
630
+ threshold: float,
631
+ metrics: dict[str, float] | None = None,
632
+ session_id: str | None = None,
633
+ org_id: str | None = None,
634
+ ) -> Any:
635
+ """Build a ``llm.eval.regression.failed`` event with EvalRegressionPayload.
636
+
637
+ Emitted when the ``--fail-under`` threshold is not met, indicating that
638
+ the primary similarity/semantic score has regressed below the minimum
639
+ acceptable level.
640
+
641
+ Parameters
642
+ ----------
643
+ scenario_name:
644
+ Human-readable name for the scenario that triggered the regression,
645
+ e.g. ``"llm-diff/fail-under/batch"`` or ``"llm-diff/fail-under/single"``.
646
+ current_score:
647
+ The actual similarity or semantic score that was measured.
648
+ baseline_score:
649
+ The minimum acceptable score (i.e. the ``--fail-under`` value).
650
+ threshold:
651
+ The ``--fail-under`` threshold value (same as *baseline_score* here).
652
+ metrics:
653
+ Optional mapping of metric names to values for richer diagnostics.
654
+ session_id:
655
+ Optional session identifier for correlation.
656
+ org_id:
657
+ Optional organisation identifier.
658
+ """
659
+ ET = _event_type()
660
+ ns = _eval_ns()
661
+
662
+ payload_obj = ns.EvalRegressionPayload(
663
+ scenario_id=_ulid_or_empty(),
664
+ scenario_name=scenario_name,
665
+ current_score=current_score,
666
+ baseline_score=baseline_score,
667
+ regression_delta=baseline_score - current_score,
668
+ threshold=threshold,
669
+ metrics=metrics,
670
+ )
671
+ payload = dataclasses.asdict(payload_obj)
672
+ return _make_event(
673
+ ET.EVAL_REGRESSION_FAILED, payload, session_id=session_id, org_id=org_id
674
+ )
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "llm-diff"
7
- version = "1.2.2"
7
+ version = "1.2.3"
8
8
  description = "A CLI tool for comparing LLM outputs — semantically, visually, and at scale"
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes