gooddata-eval 1.68.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gooddata_eval/__init__.py +6 -0
- gooddata_eval/_version.py +7 -0
- gooddata_eval/cli/__init__.py +1 -0
- gooddata_eval/cli/main.py +382 -0
- gooddata_eval/core/__init__.py +1 -0
- gooddata_eval/core/chat/__init__.py +1 -0
- gooddata_eval/core/chat/sse_client.py +181 -0
- gooddata_eval/core/config.py +20 -0
- gooddata_eval/core/connection.py +33 -0
- gooddata_eval/core/dataset/__init__.py +1 -0
- gooddata_eval/core/dataset/langfuse_source.py +123 -0
- gooddata_eval/core/dataset/local.py +39 -0
- gooddata_eval/core/evaluators/__init__.py +67 -0
- gooddata_eval/core/evaluators/_deep_subset.py +35 -0
- gooddata_eval/core/evaluators/_llm_judge.py +66 -0
- gooddata_eval/core/evaluators/_text_utils.py +11 -0
- gooddata_eval/core/evaluators/alert_skill.py +128 -0
- gooddata_eval/core/evaluators/base.py +24 -0
- gooddata_eval/core/evaluators/general_question.py +34 -0
- gooddata_eval/core/evaluators/guardrail.py +52 -0
- gooddata_eval/core/evaluators/metric_skill.py +58 -0
- gooddata_eval/core/evaluators/search_tool.py +40 -0
- gooddata_eval/core/evaluators/summary.py +96 -0
- gooddata_eval/core/evaluators/visualization.py +156 -0
- gooddata_eval/core/langfuse/__init__.py +1 -0
- gooddata_eval/core/langfuse/sink.py +178 -0
- gooddata_eval/core/models.py +116 -0
- gooddata_eval/core/reporting/__init__.py +1 -0
- gooddata_eval/core/reporting/console.py +117 -0
- gooddata_eval/core/reporting/json_report.py +81 -0
- gooddata_eval/core/runner.py +214 -0
- gooddata_eval/core/scoring.py +155 -0
- gooddata_eval/core/summary/__init__.py +1 -0
- gooddata_eval/core/summary/http_client.py +54 -0
- gooddata_eval/core/workspace.py +262 -0
- gooddata_eval-1.68.0.dist-info/METADATA +275 -0
- gooddata_eval-1.68.0.dist-info/RECORD +40 -0
- gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
- gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
- gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Evaluator for search_tool: agent must call the catalog search with expected parameters."""
|
|
3
|
+
|
|
4
|
+
from gooddata_eval.core.evaluators.base import ItemEvaluation
|
|
5
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _args_match(actual_args: dict, expected_args: dict) -> bool:
|
|
9
|
+
if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
|
|
10
|
+
return False
|
|
11
|
+
if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
|
|
12
|
+
return False
|
|
13
|
+
if actual_args.get("limit") != expected_args.get("limit"):
|
|
14
|
+
return False
|
|
15
|
+
return actual_args.get("emit_widget") == expected_args.get("emit_widget")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SearchToolEvaluator:
    """Evaluator for the ``search_tool`` test kind.

    Reports two signals: whether the expected tool was called at all
    (``tool_selection``) and whether at least one of those calls carried the
    expected arguments (``tool_correctness``).
    """

    test_kind = "search_tool"

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Score one dataset item against the tool calls recorded in ``chat_result``.

        Args:
            item: Dataset item; ``expected_output["tool_call"]`` may carry
                ``function_name`` (defaults to ``"search_objects"``) and
                ``function_arguments``.
            chat_result: Agent response whose ``tool_call_events`` are inspected.

        Returns:
            An ItemEvaluation that passes when the expected function was called
            at least once; argument correctness only affects ``rank_key``.
        """
        # `or {}` instead of a .get() default: a default only applies when the key
        # is absent, so an explicit JSON null would otherwise reach `.get` as None.
        expected_call = (item.expected_output or {}).get("tool_call") or {}
        expected_fn = expected_call.get("function_name", "search_objects")
        expected_args = expected_call.get("function_arguments") or {}

        matching_events = [ev for ev in chat_result.tool_call_events if ev.function_name == expected_fn]
        tool_selection = len(matching_events) > 0
        tool_correctness = any(_args_match(ev.parsed_arguments(), expected_args) for ev in matching_events)

        # tool_selection is the hard gate; tool_correctness is scored but not blocking
        return ItemEvaluation(
            passed=tool_selection,
            rank_key=(int(tool_selection), int(tool_correctness)),
            detail={
                "tool_selection": tool_selection,
                "tool_correctness": tool_correctness,
                "expected_function": expected_fn,
                "calls_found": len(matching_events),
            },
        )
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.
|
|
3
|
+
|
|
4
|
+
Summaries are free text, so we do not match strings. Instead, `expected_output`
|
|
5
|
+
is a rubric of checkable criteria:
|
|
6
|
+
|
|
7
|
+
{
|
|
8
|
+
"must_include": ["...facts a good summary must contain..."],
|
|
9
|
+
"must_not_include": ["...things a good summary must avoid (hallucinations)..."],
|
|
10
|
+
"rubric": ["...soft quality dimensions..."]
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
Each criterion is scored independently by the judge (True/False), so the
|
|
14
|
+
runner's `quality_score` becomes the fraction of satisfied criteria. The item
|
|
15
|
+
*passes* only when every `must_include` is satisfied and no `must_not_include`
|
|
16
|
+
is violated; `rubric` items contribute to quality but do not gate pass/fail.
|
|
17
|
+
|
|
18
|
+
As a fallback, a non-dict (or criteria-less) `expected_output` is treated as a single gating
|
|
19
|
+
criterion (same behaviour as `general_question`).
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from gooddata_eval.core.evaluators._llm_judge import LLMJudge
|
|
25
|
+
from gooddata_eval.core.evaluators._text_utils import extract_text
|
|
26
|
+
from gooddata_eval.core.evaluators.base import ItemEvaluation
|
|
27
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
28
|
+
|
|
29
|
+
# Judge instructions for criteria the summary is expected to satisfy. Passed as
# `evaluation_steps` to the LLMJudge that scores `must_include` and `rubric` items.
_POSITIVE_STEPS = [
    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
]
|
|
35
|
+
|
|
36
|
+
# Judge instructions for `must_not_include` criteria.
#
# For must_not_include we ask the judge a plain presence question and invert the
# result in code. Scoring "does the summary AVOID X?" via a field labelled
# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
# desired and flips the verdict. Detecting presence (no negation, no
# contradictory label) is far more robust.
_VIOLATION_STEPS = [
    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly exhibits the described characteristic.",
    "Score 0 if it does not exhibit it.",
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DashboardSummaryEvaluator:
    """Rubric-based LLM-as-judge evaluator for the ``dashboard_summary`` test kind.

    ``must_include`` and ``must_not_include`` criteria gate pass/fail; ``rubric``
    criteria are judged too but only feed the quality fraction in ``rank_key``.
    """

    test_kind = "dashboard_summary"

    def __init__(self):
        # Two judges with different instructions: one checks that a criterion is
        # satisfied, the other detects presence of a forbidden characteristic.
        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)

    @staticmethod
    def _as_criteria(raw: Any) -> list[str]:
        """Normalise one criteria field of ``expected_output`` to a list of strings."""
        if raw is None:
            # Explicit JSON null: same as the key being absent.
            return []
        if isinstance(raw, str):
            # A bare string is a single criterion; iterating it would yield characters.
            return [raw]
        return [str(c) for c in raw]

    @staticmethod
    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
        """Split ``expected_output`` into (must_include, must_not_include, rubric).

        A dict contributes its three optional criteria fields. Anything else, or
        a dict in which all three fields are empty, is stringified and returned
        as a single ``must_include`` criterion.
        """
        if isinstance(expected_output, dict):
            as_criteria = DashboardSummaryEvaluator._as_criteria
            must_include = as_criteria(expected_output.get("must_include"))
            must_not_include = as_criteria(expected_output.get("must_not_include"))
            rubric = as_criteria(expected_output.get("rubric"))
            if must_include or must_not_include or rubric:
                return must_include, must_not_include, rubric
        # Fallback: treat the whole expected_output as a single gating criterion
        # (same pass/fail semantics as general_question).
        return [str(expected_output)], [], []

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Judge the generated summary against every criterion of ``item``.

        Returns:
            An ItemEvaluation whose ``passed`` requires every ``must_include`` to
            hold and no ``must_not_include`` to be exhibited. ``rank_key`` is
            ``(int(passed), quality)`` where quality is the fraction of True
            flags among all judged criteria. ``detail`` holds one flag and one
            reason per criterion plus the extracted ``actual_output``.
        """
        actual = extract_text(chat_result)
        must_include, must_not_include, rubric = self._criteria(item.expected_output)

        detail: dict[str, Any] = {"actual_output": actual}
        passed = True

        for i, criterion in enumerate(must_include):
            ok, reason = self._positive_judge.score(item.question, criterion, actual)
            # Coerce to a real bool: the quality fraction below only counts
            # bool-typed detail values, and `passed` should stay a bool.
            ok = bool(ok)
            detail[f"include_{i}"] = ok
            detail[f"include_{i}_reason"] = reason
            passed = passed and ok

        for i, criterion in enumerate(must_not_include):
            violated, reason = self._violation_judge.score(item.question, criterion, actual)
            ok = not violated  # True == characteristic absent == correctly avoided
            detail[f"exclude_{i}"] = ok
            detail[f"exclude_{i}_reason"] = reason
            passed = passed and ok

        for i, criterion in enumerate(rubric):
            ok, reason = self._positive_judge.score(item.question, criterion, actual)
            # Rubric items are recorded for quality only; they never change `passed`.
            detail[f"rubric_{i}"] = bool(ok)
            detail[f"rubric_{i}_reason"] = reason

        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0

        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Agentic visualization evaluator — ported from gdc-nas tavern-e2e app/vis_agentic.py."""
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from gooddata_eval.core.evaluators.base import ItemEvaluation
|
|
7
|
+
from gooddata_eval.core.models import ChatResult, CreatedVisualization, DatasetItem
|
|
8
|
+
from gooddata_eval.core.scoring import (
|
|
9
|
+
check_filters,
|
|
10
|
+
check_viz_type,
|
|
11
|
+
get_dimension_uri_set,
|
|
12
|
+
get_metric_uri_set,
|
|
13
|
+
validate_cross_references,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class EvaluationResult:
    """Outcome of comparing one expected visualization with the agent's output."""

    # Gate: whether any visualization was produced at all.
    visualization_created: bool
    # The five strict checks.
    cross_ref_valid: bool
    metrics_correct: bool
    dimensions_correct: bool
    filters_correct: bool
    viz_type_hard: bool
    # Per-category filter results; reported but not part of the strict verdict.
    filter_date_score: bool
    filter_ranking_score: bool
    filter_attribute_score: bool
    # Diagnostics carried along for reporting.
    cross_ref_errors: list[str]
    expected_metric_uris: set[str]
    actual_metric_uris: set[str]
    expected_dim_uris: set[str]
    actual_dim_uris: set[str]

    def _strict_flags(self) -> tuple:
        """Return the five strict check flags in their canonical order."""
        return (
            self.cross_ref_valid,
            self.metrics_correct,
            self.dimensions_correct,
            self.filters_correct,
            self.viz_type_hard,
        )

    @property
    def strict_pass(self) -> bool:
        """True only when a visualization exists and every strict check holds."""
        verdict = self.visualization_created
        for flag in self._strict_flags():
            verdict = verdict and flag
        return verdict

    @property
    def strict_checks_passed_count(self) -> int:
        """Number of strict checks that hold (``visualization_created`` is not counted)."""
        passed = 0
        for flag in self._strict_flags():
            passed += flag
        return passed
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _evaluate_visualization(expected: CreatedVisualization, actual: CreatedVisualization | None) -> EvaluationResult:
    """Compare one expected visualization with what the agent produced.

    When ``actual`` is None every check is reported as failed, the actual URI
    sets are empty, and the only cross-reference error is
    "No visualization was created".
    """
    wanted_metrics = get_metric_uri_set(expected)
    wanted_dims = get_dimension_uri_set(expected)

    if actual is None:
        all_failed = dict.fromkeys(
            (
                "visualization_created",
                "cross_ref_valid",
                "metrics_correct",
                "dimensions_correct",
                "filters_correct",
                "viz_type_hard",
                "filter_date_score",
                "filter_ranking_score",
                "filter_attribute_score",
            ),
            False,
        )
        return EvaluationResult(
            **all_failed,
            cross_ref_errors=["No visualization was created"],
            expected_metric_uris=wanted_metrics,
            actual_metric_uris=set(),
            expected_dim_uris=wanted_dims,
            actual_dim_uris=set(),
        )

    refs_ok, ref_errors = validate_cross_references(actual)
    got_metrics = get_metric_uri_set(actual)
    got_dims = get_dimension_uri_set(actual)
    filters = check_filters(expected, actual)

    # Metrics and dimensions must match the expectation exactly (set equality).
    metrics_ok = got_metrics == wanted_metrics
    dims_ok = got_dims == wanted_dims
    filters_ok = filters.all_ok
    type_ok = check_viz_type(expected, actual)

    return EvaluationResult(
        visualization_created=True,
        cross_ref_valid=refs_ok,
        metrics_correct=metrics_ok,
        dimensions_correct=dims_ok,
        filters_correct=filters_ok,
        viz_type_hard=type_ok,
        filter_date_score=filters.date_ok,
        filter_ranking_score=filters.ranking_ok,
        filter_attribute_score=filters.attribute_ok,
        cross_ref_errors=ref_errors,
        expected_metric_uris=wanted_metrics,
        actual_metric_uris=got_metrics,
        expected_dim_uris=wanted_dims,
        actual_dim_uris=got_dims,
    )
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _evaluate_against_candidates(
    expected_outputs: list[CreatedVisualization], actual: CreatedVisualization | None
) -> tuple[EvaluationResult, CreatedVisualization]:
    """Evaluate ``actual`` against every acceptable expectation and keep the best.

    Candidates are ranked by ``(strict_pass, strict_checks_passed_count)``; on a
    tie the earliest candidate in ``expected_outputs`` wins.
    """

    def _rank(scored: tuple[EvaluationResult, CreatedVisualization]):
        outcome = scored[0]
        return (outcome.strict_pass, outcome.strict_checks_passed_count)

    scored_candidates = []
    for candidate in expected_outputs:
        scored_candidates.append((_evaluate_visualization(candidate, actual), candidate))

    winner = max(scored_candidates, key=_rank)
    return winner[0], winner[1]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _parse_expected(expected_output: dict) -> list[CreatedVisualization]:
|
|
109
|
+
if not isinstance(expected_output, dict):
|
|
110
|
+
raise ValueError("'expected_output' must be a JSON object")
|
|
111
|
+
raw_viz = expected_output.get("visualization")
|
|
112
|
+
if raw_viz is None:
|
|
113
|
+
raise ValueError("'expected_output.visualization' is required")
|
|
114
|
+
if isinstance(raw_viz, list):
|
|
115
|
+
if not raw_viz:
|
|
116
|
+
raise ValueError("'expected_output.visualization' array must not be empty")
|
|
117
|
+
return [CreatedVisualization.model_validate(v) for v in raw_viz]
|
|
118
|
+
if isinstance(raw_viz, dict):
|
|
119
|
+
return [CreatedVisualization.model_validate(raw_viz)]
|
|
120
|
+
raise ValueError("'expected_output.visualization' must be a JSON object or non-empty array")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _extract_actual(chat_result: ChatResult) -> CreatedVisualization | None:
|
|
124
|
+
cv = chat_result.created_visualizations
|
|
125
|
+
if cv is None or not cv.objects:
|
|
126
|
+
return None
|
|
127
|
+
return cv.objects[0]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class VisualizationEvaluator:
    """Evaluator for the ``visualization`` test kind."""

    test_kind = "visualization"

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Score the first created visualization against the item's expectations.

        The expectation may list several acceptable visualizations; the
        best-scoring comparison is the one reported. Raises ValueError (from
        ``_parse_expected``) when ``item.expected_output`` is malformed.
        """
        candidates = _parse_expected(item.expected_output)
        produced = _extract_actual(chat_result)
        outcome, _matched = _evaluate_against_candidates(candidates, produced)

        verdict = outcome.strict_pass
        ranking = (outcome.strict_pass, outcome.strict_checks_passed_count)

        # Attributes copied as-is into the detail dict, in reporting order.
        passthrough = (
            "visualization_created",
            "cross_ref_valid",
            "cross_ref_errors",
            "metrics_correct",
            "dimensions_correct",
            "filters_correct",
            "filter_date_score",
            "filter_ranking_score",
            "filter_attribute_score",
            "viz_type_hard",
        )
        detail = {name: getattr(outcome, name) for name in passthrough}

        # URI sets are emitted as sorted lists.
        for name in ("expected_metric_uris", "actual_metric_uris", "expected_dim_uris", "actual_dim_uris"):
            detail[name] = sorted(getattr(outcome, name))

        return ItemEvaluation(passed=verdict, rank_key=ranking, detail=detail)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Langfuse scoring sink — posts evaluation results via the Langfuse REST API."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import base64
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import uuid
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
# Average latency (seconds) at or above which compute_scores() gives a speed of 0.
_MAX_LATENCY_S = 60.0
# Weights of the quality and speed components in compute_scores()'s value_score.
# NOTE(review): the two weights sum to 0.8, so value_score tops out at 0.8 for a
# perfect, instantaneous answer — confirm whether that ceiling is intended.
_QUALITY_WEIGHT = 0.6
_SPEED_WEIGHT = 0.2
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from gooddata_eval.core.runner import ItemReport
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def compute_scores(
    pass_at_k: bool,
    avg_latency_s: float,
    best_detail: dict[str, Any],
) -> dict[str, float]:
    """Derive the Langfuse score values for one evaluated item.

    Args:
        pass_at_k: Whether the item passed.
        avg_latency_s: Average latency in seconds.
        best_detail: Evaluator detail dict; only its bool-valued entries are used.

    Returns:
        A dict with keys ``pass_at_k`` (1 or 0), ``quality_score``,
        ``value_score`` and ``latency_s``. ``quality_score`` is the fraction of
        bool-valued entries in ``best_detail`` that are True; when there are no
        bool entries (text evaluators) it is 1.0 if ``pass_at_k`` else 0.0.
    """
    flags = [v for v in best_detail.values() if isinstance(v, bool)]
    if flags:
        quality = sum(flags) / len(flags)
    elif pass_at_k:
        quality = 1.0
    else:
        quality = 0.0

    # Speed falls linearly with latency and is floored at 0.
    headroom = 1.0 - avg_latency_s / _MAX_LATENCY_S
    speed = headroom if headroom > 0.0 else 0.0

    value = _QUALITY_WEIGHT * quality + _SPEED_WEIGHT * speed

    scores = {"pass_at_k": 1 if pass_at_k else 0}
    scores["quality_score"] = round(quality, 4)
    scores["value_score"] = round(value, 4)
    scores["latency_s"] = round(avg_latency_s, 3)
    return scores
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class LangfuseSink:
    """Best-effort reporter that pushes evaluated items to Langfuse over its REST API."""

    def __init__(self, dataset_name: str, run_name: str, model_id: str = "", provider_type: str = ""):
        """Store run metadata and build HTTP Basic credentials from the environment.

        Reads LANGFUSE_HOST (default ``https://cloud.langfuse.com``, trailing
        slashes stripped), LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY.

        Raises:
            RuntimeError: if either key is unset or empty.
        """
        self._dataset_name = dataset_name
        self._run_name = run_name
        self._model_id = model_id
        self._provider_type = provider_type

        env = os.environ
        public_key = env.get("LANGFUSE_PUBLIC_KEY", "")
        secret_key = env.get("LANGFUSE_SECRET_KEY", "")
        if not (public_key and secret_key):
            raise RuntimeError(
                "Langfuse credentials not set. Export LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY to use --langfuse."
            )

        token = base64.b64encode(f"{public_key}:{secret_key}".encode()).decode()
        self._host = env.get("LANGFUSE_HOST", "https://cloud.langfuse.com").rstrip("/")
        self._auth_header = f"Basic {token}"

    def _open_client(self):
        """Create a short-lived authenticated HTTP client for one request."""
        return httpx.Client(
            base_url=self._host,
            headers={"Authorization": self._auth_header},
            timeout=10,
        )

    def _run_description(self) -> str:
        """Return ``provider/model`` when both are set, else the model id (possibly empty)."""
        if self._provider_type and self._model_id:
            return f"{self._provider_type}/{self._model_id}"
        return self._model_id or ""

    def log_item(self, report: ItemReport, *, dataset_item_id: str) -> None:
        """Send trace + scores, then link the trace to the dataset run, for one item.

        HTTP and response-parsing errors are reported on stderr and swallowed so
        that a Langfuse failure never aborts the eval run. The two requests are
        guarded independently: a failed ingestion does not skip the run link.
        """
        trace_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc).isoformat()
        scores = compute_scores(
            pass_at_k=report.pass_at_k,
            avg_latency_s=report.avg_latency_s,
            best_detail=report.best_detail,
        )

        # Each ingestion event needs its own top-level id (dedup) and timestamp,
        # separate from the id/timestamp inside the trace/score body.
        def _wrap(event_type: str, body: dict[str, Any]) -> dict[str, Any]:
            return {"id": str(uuid.uuid4()), "timestamp": now, "type": event_type, "body": body}

        trace_body: dict[str, Any] = {
            "id": trace_id,
            "timestamp": now,
            "name": f"gd-eval: {report.question[:80]}",
            # The model goes on the first-class "version" field so Langfuse
            # dashboards can filter / break down by it; trace metadata is not
            # available as a breakdown dimension.
            "version": self._model_id or None,
            "input": {"question": report.question},
            "output": report.best_detail,
            "metadata": {
                "dataset_name": report.dataset_name,
                "test_kind": report.test_kind,
                "item_id": report.id,
                "model": self._model_id,
                "provider_type": self._provider_type,
            },
            "tags": [t for t in [report.test_kind, self._provider_type] if t],
        }

        batch: list[dict[str, Any]] = [_wrap("trace-create", trace_body)]
        score_types = (
            ("pass_at_k", "BOOLEAN"),
            ("quality_score", "NUMERIC"),
            ("value_score", "NUMERIC"),
            ("latency_s", "NUMERIC"),
        )
        batch.extend(
            _wrap(
                "score-create",
                {
                    "id": str(uuid.uuid4()),
                    "traceId": trace_id,
                    "name": score_name,
                    "value": scores[score_name],
                    "dataType": data_type,
                },
            )
            for score_name, data_type in score_types
        )

        try:
            with self._open_client() as client:
                resp = client.post("/api/public/ingestion", json={"batch": batch})
                resp.raise_for_status()
                # The ingestion endpoint can answer with a success status even
                # when individual events fail; per-event errors are in the body.
                for err in resp.json().get("errors") or []:
                    print(
                        f"warning: Langfuse event failed for item '{report.id}': "
                        f"type={err.get('error')} status={err.get('status')} id={err.get('id')}",
                        file=sys.stderr,
                    )
        except Exception as exc:
            print(f"warning: Langfuse ingestion failed for item '{report.id}': {exc}", file=sys.stderr)

        # Link the trace to the dataset run via the dedicated endpoint (simpler than
        # ingestion: no datasetId/runId needed; the run is created by name if absent).
        try:
            with self._open_client() as client:
                link_payload = {
                    "runName": self._run_name,
                    "runDescription": self._run_description(),
                    "metadata": {
                        "model": self._model_id,
                        "provider_type": self._provider_type,
                    },
                    "datasetItemId": dataset_item_id,
                    "traceId": trace_id,
                }
                link_resp = client.post("/api/public/dataset-run-items", json=link_payload)
                link_resp.raise_for_status()
        except Exception as exc:
            print(f"warning: Langfuse dataset-run-item failed for item '{report.id}': {exc}", file=sys.stderr)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Pydantic models for the eval dataset envelope and the agent's AAC output.
|
|
3
|
+
|
|
4
|
+
Ported from gdc-nas tavern-e2e app/llm_as_judge/schemas/chat.py.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AacQueryField(BaseModel):
    # Structured form of one entry in AacQuery.fields.
    # extra="allow": keys beyond the ones declared below are accepted and kept.
    model_config = ConfigDict(extra="allow")

    # Required. Presumably the identifier of the object this field is based on — TODO confirm.
    using: str
    title: str | None = None
    aggregation: str | None = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AacBucketRef(BaseModel):
    # Structured form of one entry in a CreatedVisualization bucket list
    # (metrics, view_by, segment_by, rows, columns).
    # extra="allow": keys beyond `field` are accepted and kept.
    model_config = ConfigDict(extra="allow")

    # Presumably a key of AacQuery.fields — verify against the scoring helpers.
    field: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AacQuery(BaseModel):
    # Query section of a visualization (the type of CreatedVisualization.query).
    # Each value is either a structured AacQueryField or a plain string.
    fields: dict[str, AacQueryField | str]
    # Filter definitions keyed by name; an empty dict when omitted.
    filter_by: dict[str, dict] = Field(default_factory=dict)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class CreatedVisualization(BaseModel):
    """Visualization in the AAC format (agent output and dataset expected output)."""

    # extra="ignore": keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    id: str
    title: str | None = None
    type: str
    query: AacQuery
    # Bucket lists. Each entry is either a structured AacBucketRef or a plain
    # string; every list defaults to empty when omitted.
    metrics: list[AacBucketRef | str] = Field(default_factory=list)
    view_by: list[AacBucketRef | str] = Field(default_factory=list)
    segment_by: list[AacBucketRef | str] = Field(default_factory=list)
    rows: list[AacBucketRef | str] = Field(default_factory=list)
    columns: list[AacBucketRef | str] = Field(default_factory=list)
    # Free-form configuration; its schema is not constrained here.
    config: dict | None = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class CreatedVisualizations(BaseModel):
    # Container for the visualizations in a chat response
    # (the type of ChatResult.created_visualizations).
    # extra="ignore": keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # Created visualizations; empty when omitted.
    objects: list[CreatedVisualization] = Field(default_factory=list)
    reasoning: str = ""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ToolCallEvent(BaseModel):
    # One tool invocation recorded in a chat response (see ChatResult.tool_call_events).
    # populate_by_name=True: fields can be set by their snake_case names as well
    # as by the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    function_name: str = Field(alias="functionName")
    # Raw argument payload as a string; decode it with parsed_arguments().
    function_arguments: str = Field(alias="functionArguments")
    # Raw result payload as a string, if any; decode it with parsed_result().
    result: str | None = None

    def parsed_arguments(self) -> dict[str, Any]:
        """Return the JSON-decoded arguments, or ``{}`` when they are unusable.

        ``{}`` is returned for an empty string, for invalid JSON, and for valid
        JSON that is not an object, so callers can always use ``.get()``.
        """
        if not self.function_arguments:
            return {}
        try:
            decoded = json.loads(self.function_arguments)
        except json.JSONDecodeError:
            return {}
        # Valid JSON need not be an object ("[]", "3", "null"); honour the
        # declared dict return type instead of handing back a list or scalar.
        return decoded if isinstance(decoded, dict) else {}

    def parsed_result(self) -> dict[str, Any] | None:
        """Return the JSON-decoded result, or None when it is absent or invalid JSON.

        NOTE(review): valid JSON that is not an object is returned as decoded,
        despite the annotation. Left unchanged because callers outside this view
        may rely on it — confirm before narrowing.
        """
        if not self.result:
            return None
        try:
            return json.loads(self.result)
        except json.JSONDecodeError:
            return None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ChatResult(BaseModel):
    """Subset of the agent chat response needed for Phase 1 evaluation."""

    # extra="ignore": response keys not declared below are dropped.
    # populate_by_name=True: fields can be set by snake_case name or camelCase alias.
    model_config = ConfigDict(extra="ignore", populate_by_name=True)

    text_response: str | None = Field(default=None, alias="textResponse")
    created_visualizations: CreatedVisualizations | None = Field(default=None, alias="createdVisualizations")
    # Tool calls recorded in the response; empty when omitted.
    tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class SummaryInput(BaseModel):
    """Structured input for the `dashboard_summary` test kind.

    Maps onto the dedicated summary endpoint's request body
    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
    dataset; the SummaryClient maps it to the endpoint's camelCase fields.
    """

    # extra="ignore": keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    # The only required field; all others are optional and default to None.
    dashboard_id: str
    visualizations: list[str] | None = None
    filter_context: list[dict] | None = None
    tab_id: str | None = None
    format_hint: str | None = None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class DatasetItem(BaseModel):
    """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""

    # extra="ignore": keys not declared below are dropped during validation.
    model_config = ConfigDict(extra="ignore")

    id: str
    dataset_name: str
    # Compared against each evaluator's `test_kind` attribute, presumably to pick
    # the evaluator — the dispatch code is not visible here.
    test_kind: str
    question: str
    # Deliberately untyped: its shape depends on `test_kind`.
    expected_output: Any
    # Only used by the `dashboard_summary` test kind; ignored by all others.
    summary_input: SummaryInput | None = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|