gooddata-eval 1.68.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gooddata_eval/__init__.py +6 -0
- gooddata_eval/_version.py +7 -0
- gooddata_eval/cli/__init__.py +1 -0
- gooddata_eval/cli/main.py +382 -0
- gooddata_eval/core/__init__.py +1 -0
- gooddata_eval/core/chat/__init__.py +1 -0
- gooddata_eval/core/chat/sse_client.py +181 -0
- gooddata_eval/core/config.py +20 -0
- gooddata_eval/core/connection.py +33 -0
- gooddata_eval/core/dataset/__init__.py +1 -0
- gooddata_eval/core/dataset/langfuse_source.py +123 -0
- gooddata_eval/core/dataset/local.py +39 -0
- gooddata_eval/core/evaluators/__init__.py +67 -0
- gooddata_eval/core/evaluators/_deep_subset.py +35 -0
- gooddata_eval/core/evaluators/_llm_judge.py +66 -0
- gooddata_eval/core/evaluators/_text_utils.py +11 -0
- gooddata_eval/core/evaluators/alert_skill.py +128 -0
- gooddata_eval/core/evaluators/base.py +24 -0
- gooddata_eval/core/evaluators/general_question.py +34 -0
- gooddata_eval/core/evaluators/guardrail.py +52 -0
- gooddata_eval/core/evaluators/metric_skill.py +58 -0
- gooddata_eval/core/evaluators/search_tool.py +40 -0
- gooddata_eval/core/evaluators/summary.py +96 -0
- gooddata_eval/core/evaluators/visualization.py +156 -0
- gooddata_eval/core/langfuse/__init__.py +1 -0
- gooddata_eval/core/langfuse/sink.py +178 -0
- gooddata_eval/core/models.py +116 -0
- gooddata_eval/core/reporting/__init__.py +1 -0
- gooddata_eval/core/reporting/console.py +117 -0
- gooddata_eval/core/reporting/json_report.py +81 -0
- gooddata_eval/core/runner.py +214 -0
- gooddata_eval/core/scoring.py +155 -0
- gooddata_eval/core/summary/__init__.py +1 -0
- gooddata_eval/core/summary/http_client.py +54 -0
- gooddata_eval/core/workspace.py +262 -0
- gooddata_eval-1.68.0.dist-info/METADATA +275 -0
- gooddata_eval-1.68.0.dist-info/RECORD +40 -0
- gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
- gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
- gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Render a human-readable console report using rich."""
|
|
3
|
+
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
from rich.table import Table
|
|
6
|
+
|
|
7
|
+
from gooddata_eval.core.runner import EvalReport
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def render_console(report: EvalReport, *, console: Console | None = None) -> str:
    """Render a per-item result table followed by a one-line summary.

    Args:
        report: The evaluation report to render.
        console: Console to print to. When omitted, a recording console of
            width 120 is created so the rendered text can be returned.

    Returns:
        The rendered text when the console records its output, otherwise "".
    """
    # Local import: item ids and exception messages are free-form text. Left
    # unescaped, rich would parse bracketed fragments (for example
    # "[type=missing, ...]" in a validation error) as console markup and drop
    # them from the output, or raise on an unmatched closing tag.
    from rich.markup import escape

    out = console or Console(record=True, width=120)

    table = Table(title=f"Evaluation — model={report.model} workspace={report.workspace_id}")
    for heading in ("Item", "Kind", "Result", "Runs", "Latency", "Avg/run", "Quality", "Notes"):
        table.add_column(heading)

    for item in report.items:
        if item.skipped:
            result, notes = "SKIPPED", f"test_kind '{item.test_kind}' not supported in this phase"
        elif item.error:
            result, notes = "ERROR", item.error
        elif item.pass_at_k:
            result, notes = "PASS", ""
        else:
            # Evaluator-agnostic: list whichever checks came back exactly False;
            # fall back to a generic message when no such check exists.
            failing = [k for k, v in item.best_detail.items() if v is False]
            notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
            result = "FAIL"
        # Items that never ran have no meaningful timing.
        latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
        avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"
        quality = "-" if item.skipped else f"{item.quality_score:.0%}"
        table.add_row(
            escape(item.id),
            escape(item.test_kind),
            result,
            str(item.runs),
            latency,
            avg,
            quality,
            escape(notes),
        )

    out.print(table)
    _wall = report.wall_clock_s
    _agent = report.latency_s
    # Wall-clock and summed agent time diverge when items ran concurrently;
    # show both only when they differ by more than a second.
    if _wall > 0 and abs(_wall - _agent) > 1:
        timing = f"{_wall:.2f}s wall-clock, {_agent:.2f}s agent time (avg {report.avg_latency_s:.2f}s/run)"
    else:
        timing = f"{_agent:.2f}s (avg {report.avg_latency_s:.2f}s/run)"
    out.print(
        f"\nSummary: {report.passed}/{report.total} passed "
        f"({report.skipped} skipped, {report.errored} errored) "
        f"avg quality {report.avg_quality_score:.0%} in {timing}"
    )
    # Only a recording console can export what it printed.
    return out.export_text() if out.record else ""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def render_comparison(reports: list[EvalReport], *, console: Console | None = None) -> str:
    """Render a side-by-side comparison for multiple model runs.

    The winner is selected by (pass_rate, avg_quality_score, -avg_latency_s):
    higher pass rate first, then quality, then lower latency as the final
    tiebreaker. The runner-up is chosen with the same ordering, and the deltas
    against it are printed with an explicit sign because the winner can trail
    the runner-up on an individual figure (for example fewer absolute passes
    at a higher pass rate, or lower quality at a higher pass rate).

    Args:
        reports: One report per model run.
        console: Console to print to. When omitted, a recording console of
            width 120 is created so the rendered text can be returned.

    Returns:
        The rendered text when the console records its output; an empty string
        when fewer than two reports are provided or the console does not record.
    """
    if len(reports) < 2:
        return ""

    def _label(r) -> str:
        # "provider/model" when a provider name is set, else the model, else "?".
        return f"{r.provider_name}/{r.model}" if r.provider_name else r.model or "?"

    def _rank(r) -> tuple:
        # Negated latency so that max() prefers the faster run on ties.
        return (r.passed / r.total if r.total else 0, r.avg_quality_score, -r.avg_latency_s)

    out = console or Console(record=True, width=120)

    table = Table(title="Model Comparison")
    for heading in ("Model", "Passed", "Quality", "Avg/run", "Total time"):
        table.add_column(heading)

    for r in reports:
        # The displayed percentage is relative to the items that were not skipped.
        evaluated = r.total - r.skipped
        pass_pct = f"{r.passed / evaluated:.0%}" if evaluated else "—"
        table.add_row(
            _label(r),
            f"{r.passed}/{r.total} ({pass_pct})",
            f"{r.avg_quality_score:.0%}",
            f"{r.avg_latency_s:.2f}s",
            f"{r.latency_s:.0f}s",
        )

    out.print(table)

    # Reports without any items cannot win.
    evaluated_reports = [r for r in reports if r.total > 0]
    if evaluated_reports:
        winner = max(evaluated_reports, key=_rank)
        runner_up = max((r for r in evaluated_reports if r is not winner), key=_rank, default=None)
        delta = ""
        if runner_up is not None:
            delta_items = winner.passed - runner_up.passed
            delta_quality = winner.avg_quality_score - runner_up.avg_quality_score
            # "+" in the format spec emits the correct sign instead of "+-1".
            delta = f" ({delta_items:+d} item(s) passed, {delta_quality:+.0%} quality)"
        out.print(f"\n[bold]Winner: {_label(winner)}[/bold]{delta}")

    return out.export_text() if out.record else ""
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Build and write machine-readable reports (single-model or multi-model)."""
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import orjson
|
|
7
|
+
|
|
8
|
+
from gooddata_eval.core.runner import EvalReport
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _build_run_dict(report: EvalReport) -> dict:
|
|
12
|
+
return {
|
|
13
|
+
"model": report.model,
|
|
14
|
+
"workspace_id": report.workspace_id,
|
|
15
|
+
"summary": {
|
|
16
|
+
"total": report.total,
|
|
17
|
+
"passed": report.passed,
|
|
18
|
+
"failed": report.total - report.passed - report.skipped,
|
|
19
|
+
"skipped": report.skipped,
|
|
20
|
+
"errored": report.errored,
|
|
21
|
+
"latency_s": round(report.latency_s, 3),
|
|
22
|
+
"avg_latency_s": round(report.avg_latency_s, 3),
|
|
23
|
+
"wall_clock_s": round(report.wall_clock_s, 3),
|
|
24
|
+
},
|
|
25
|
+
"items": {
|
|
26
|
+
item.id: {
|
|
27
|
+
"dataset_name": item.dataset_name,
|
|
28
|
+
"test_kind": item.test_kind,
|
|
29
|
+
"question": item.question,
|
|
30
|
+
"pass_at_k": item.pass_at_k,
|
|
31
|
+
"skipped": item.skipped,
|
|
32
|
+
"error": item.error,
|
|
33
|
+
"runs": item.runs,
|
|
34
|
+
"latency_s": round(item.latency_s, 3),
|
|
35
|
+
"avg_latency_s": round(item.avg_latency_s, 3),
|
|
36
|
+
"detail": item.best_detail,
|
|
37
|
+
}
|
|
38
|
+
for item in report.items
|
|
39
|
+
},
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _build_comparison_entry(report: EvalReport) -> dict:
|
|
44
|
+
total = report.total
|
|
45
|
+
passed = report.passed
|
|
46
|
+
return {
|
|
47
|
+
"provider_name": report.provider_name,
|
|
48
|
+
"passed": passed,
|
|
49
|
+
"total": total,
|
|
50
|
+
"pass_rate": round(passed / total, 4) if total else 0.0,
|
|
51
|
+
"avg_quality_score": round(report.avg_quality_score, 4),
|
|
52
|
+
"avg_latency_s": round(report.avg_latency_s, 3),
|
|
53
|
+
"total_latency_s": round(report.latency_s, 3),
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _run_key(report: EvalReport) -> str:
|
|
58
|
+
"""Collision-free key matching the console comparison table label."""
|
|
59
|
+
return f"{report.provider_name}/{report.model}" if report.provider_name else report.model or "?"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def build_multi_model_report(reports: list[EvalReport]) -> dict:
    """Assemble the nested JSON report; the same shape serves single-model runs.

    The result has three sections: "models" (run keys in input order), "runs"
    (full per-run payload by key) and "comparison" (condensed figures by key).
    """
    keys = [_run_key(r) for r in reports]
    # Generator expressions keep one pass over `reports` per section.
    runs = dict(zip(keys, (_build_run_dict(r) for r in reports)))
    comparison = dict(zip(keys, (_build_comparison_entry(r) for r in reports)))
    return {"models": keys, "runs": runs, "comparison": comparison}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def write_multi_model_report(reports: list[EvalReport], path: Path) -> None:
    """Write the nested multi-model report to ``path`` as 2-space indented JSON."""
    payload = build_multi_model_report(reports)
    encoded = orjson.dumps(payload, option=orjson.OPT_INDENT_2)
    Path(path).write_bytes(encoded)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Backward-compatible aliases so existing callers keep working.
|
|
76
|
+
def build_json_report(report: EvalReport) -> dict:
    """Return the flat single-run dict for ``report`` (delegates to ``_build_run_dict``)."""
    return _build_run_dict(report)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def write_json_report(report: EvalReport, path: Path) -> None:
    """Write a single report to ``path``.

    Delegates to ``write_multi_model_report`` with a one-element list, so the
    file uses the nested multi-model layout rather than the flat dict that
    ``build_json_report`` returns.
    """
    write_multi_model_report([report], path)
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Dataset run orchestration: per item, K single-turn runs, route by test_kind, aggregate pass@K."""
|
|
3
|
+
|
|
4
|
+
import time
|
|
5
|
+
import traceback
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from functools import partial
|
|
9
|
+
from typing import Callable, Protocol
|
|
10
|
+
|
|
11
|
+
from gooddata_eval.core.evaluators import get_evaluator, supported_test_kinds
|
|
12
|
+
from gooddata_eval.core.evaluators.base import ItemEvaluation
|
|
13
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ChatBackend(Protocol):
    """Structural interface for a backend that answers one dataset item."""

    # Receives the whole item so backends can use per-item context beyond the
    # question text (e.g. dashboard_summary needs item.summary_input).
    def ask(self, item: DatasetItem) -> ChatResult: ...
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class ItemReport:
|
|
24
|
+
id: str
|
|
25
|
+
dataset_name: str
|
|
26
|
+
test_kind: str
|
|
27
|
+
question: str
|
|
28
|
+
pass_at_k: bool = False
|
|
29
|
+
skipped: bool = False
|
|
30
|
+
error: str | None = None
|
|
31
|
+
runs: int = 0
|
|
32
|
+
latency_s: float = 0.0 # total wall-clock across this item's runs
|
|
33
|
+
best_detail: dict = field(default_factory=dict)
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def avg_latency_s(self) -> float:
|
|
37
|
+
return self.latency_s / self.runs if self.runs else 0.0
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def quality_score(self) -> float:
|
|
41
|
+
"""Fraction of bool-valued strict checks in best_detail that are True.
|
|
42
|
+
|
|
43
|
+
Falls back to 1.0 if pass_at_k else 0.0 when no bool checks exist
|
|
44
|
+
(e.g. text evaluators where best_detail has no bool flags).
|
|
45
|
+
"""
|
|
46
|
+
checks = {k: v for k, v in self.best_detail.items() if isinstance(v, bool)}
|
|
47
|
+
if checks:
|
|
48
|
+
return sum(1 for v in checks.values() if v) / len(checks)
|
|
49
|
+
return 1.0 if self.pass_at_k else 0.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class EvalReport:
    """Result of one evaluation run: run metadata plus the per-item reports."""

    model: str | None
    provider_name: str = ""
    provider_type: str = ""
    workspace_id: str = ""
    items: list[ItemReport] = field(default_factory=list)
    wall_clock_s: float = 0.0  # real elapsed time; not the same as latency_s when items run concurrently

    @property
    def total(self) -> int:
        """Number of items in the report."""
        return len(self.items)

    @property
    def passed(self) -> int:
        """Number of items flagged pass@K."""
        return len([entry for entry in self.items if entry.pass_at_k])

    @property
    def skipped(self) -> int:
        """Number of skipped items."""
        return len([entry for entry in self.items if entry.skipped])

    @property
    def errored(self) -> int:
        """Number of items that recorded an error."""
        return len([entry for entry in self.items if entry.error is not None])

    @property
    def latency_s(self) -> float:
        """Latency summed over all items."""
        elapsed = 0
        for entry in self.items:
            elapsed += entry.latency_s
        return elapsed

    @property
    def total_runs(self) -> int:
        """Run count summed over all items."""
        count = 0
        for entry in self.items:
            count += entry.runs
        return count

    @property
    def avg_latency_s(self) -> float:
        """Mean latency per run over the whole report, or 0.0 without runs."""
        run_count = self.total_runs
        if not run_count:
            return 0.0
        return self.latency_s / run_count

    @property
    def avg_quality_score(self) -> float:
        """Mean quality_score over items that were neither skipped nor errored."""
        scores = [
            entry.quality_score
            for entry in self.items
            if not (entry.skipped or entry.error is not None)
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Per-run progress callback: (run_index, runs_total, passed, latency_s) -> None.
# This is the shape _run_one_item invokes after every individual run.
RunCallback = Callable[[int, int, bool, float], None]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _run_one_item(
    item: DatasetItem, backend: ChatBackend, runs: int, on_run_done: RunCallback | None = None
) -> ItemReport:
    """Evaluate one item ``runs`` times and aggregate the outcome.

    Items whose ``test_kind`` has no evaluator are returned flagged as skipped
    without contacting the backend. Otherwise every run asks the backend,
    evaluates the answer and keeps the evaluation with the highest
    ``rank_key``; the item passes if any run passed.

    Args:
        item: The dataset item to evaluate.
        backend: Backend that produces the answer for the item.
        runs: Number of runs to attempt.
        on_run_done: Optional progress callback invoked after each run with
            (run_index, runs, passed, latency). It is best-effort: an
            exception it raises is printed and otherwise ignored.

    Returns:
        The item report. If the backend or the evaluator raises, ``error`` is
        set, the remaining runs are abandoned, and the detail of the best run
        completed so far (if any) is kept.
    """
    report = ItemReport(id=item.id, dataset_name=item.dataset_name, test_kind=item.test_kind, question=item.question)
    if item.test_kind not in supported_test_kinds():
        report.skipped = True
        return report

    evaluator = get_evaluator(item.test_kind)
    best: ItemEvaluation | None = None
    try:
        for run_index in range(1, runs + 1):
            # The measured span covers both the backend call and the evaluation.
            t0 = time.perf_counter()
            chat_result = backend.ask(item)
            evaluation = evaluator.evaluate(item, chat_result)
            latency = time.perf_counter() - t0
            report.runs += 1
            report.latency_s += latency
            if best is None or evaluation.rank_key > best.rank_key:
                best = evaluation
            if evaluation.passed:
                report.pass_at_k = True
            if on_run_done is not None:
                # A broken progress callback must not be reported as an agent
                # failure nor cut the remaining runs short.
                try:
                    on_run_done(run_index, runs, evaluation.passed, latency)
                except Exception:
                    traceback.print_exc()
    except Exception as e:  # agent/network/parse failure for this item
        report.error = f"{type(e).__name__}: {e}"

    # Same on success and on failure: expose the best completed run, if any.
    if best is not None:
        report.best_detail = best.detail
    return report
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _forward_run_event(
    user_cb: "Callable[[int, int, int, int, bool, float], None]",
    item_index: int,
    total: int,
    run_index: int,
    runs_total: int,
    passed: bool,
    latency: float,
) -> None:
    """Forward a per-run event to ``user_cb`` together with the item position.

    The leading (user_cb, item_index, total) parameters are meant to be
    pre-bound (run_items does so with functools.partial), leaving a callable
    that takes the per-run arguments (run_index, runs_total, passed, latency).
    """
    user_cb(item_index, total, run_index, runs_total, passed, latency)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def run_items(
    items: list[DatasetItem],
    backend: ChatBackend,
    *,
    runs: int = 2,
    model: str | None = None,
    provider_name: str = "",
    provider_type: str = "",
    workspace_id: str = "",
    on_item_start: Callable[[int, int, DatasetItem], None] | None = None,
    on_run_done: Callable[[int, int, int, int, bool, float], None] | None = None,
    on_item_done: Callable[[int, int, ItemReport], None] | None = None,
    on_langfuse_item_done: Callable[[int, int, ItemReport], None] | None = None,
    concurrency: int = 1,
) -> EvalReport:
    """Run every item K times, routing by test_kind, and aggregate pass@K.

    Optional callbacks stream progress without coupling core to any I/O library
    (index/run_index are 1-based):
      - on_item_start(index, total, item) before an item's runs begin
      - on_run_done(index, total, run_index, runs, passed, latency) after each individual run
      - on_item_done(index, total, report) after an item is fully evaluated
      - on_langfuse_item_done(index, total, report) after non-skipped, non-errored items only

    The item-level callbacks are best-effort and isolated from each other: an
    exception raised by one is printed and does not prevent the others from
    being invoked.

    concurrency > 1 dispatches items to a ThreadPoolExecutor so multiple
    questions are sent to the agent simultaneously. Each item still runs
    --runs times sequentially (pass@K). Results are collected in input order.

    Returns:
        The report for the whole run, with ``wall_clock_s`` set to the elapsed
        time of the dispatch loop.
    """
    concurrency = max(1, concurrency)
    report = EvalReport(
        model=model, provider_name=provider_name, provider_type=provider_type, workspace_id=workspace_id
    )
    total = len(items)

    def _notify(callback, *args) -> None:
        # Invoke an optional callback; failures are printed, never propagated.
        if callback is None:
            return
        try:
            callback(*args)
        except Exception:  # non-fatal — a callback must not abort the run
            traceback.print_exc()

    def _process_item(index: int, item: DatasetItem) -> ItemReport:
        _notify(on_item_start, index, total, item)
        run_cb = partial(_forward_run_event, on_run_done, index, total) if on_run_done is not None else None
        item_report = _run_one_item(item, backend, runs, on_run_done=run_cb)
        _notify(on_item_done, index, total, item_report)
        # Notified separately so a failing on_item_done cannot suppress it.
        if not item_report.skipped and item_report.error is None:
            _notify(on_langfuse_item_done, index, total, item_report)
        return item_report

    _t0 = time.perf_counter()
    if concurrency <= 1:
        for index, item in enumerate(items, start=1):
            report.items.append(_process_item(index, item))
    else:
        # Dispatch concurrently; collect in original order.
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            futures = {pool.submit(_process_item, index, item): index for index, item in enumerate(items, start=1)}
            results: dict[int, ItemReport] = {}
            for future in as_completed(futures):
                idx = futures[future]
                results[idx] = future.result()
        for index in range(1, total + 1):
            report.items.append(results[index])
    report.wall_clock_s = time.perf_counter() - _t0
    return report
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Visualization scoring — ported from gdc-nas tavern-e2e app/vis_assertions/metrics.py."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from gooddata_eval.core.models import AacBucketRef, AacQueryField, CreatedVisualization
|
|
8
|
+
|
|
9
|
+
# Maps dataset chart-type names (and agent enum values) to a canonical token.
# Names missing from this table are handled by _normalize_viz_type, which
# strips "_chart" and upper-cases the remainder.
_AAC_TYPE_MAP = {
    "line_chart": "LINE",
    "bar_chart": "BAR",
    "column_chart": "COLUMN",
    "pie_chart": "PIE",
    "table": "TABLE",
    "headline": "HEADLINE",
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class FilterScores:
    """Per-category verdicts from comparing expected and actual filters."""

    date_ok: bool
    ranking_ok: bool
    attribute_ok: bool

    @property
    def all_ok(self) -> bool:
        """True only when every filter category matched."""
        return all((self.date_ok, self.ranking_ok, self.attribute_ok))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _resolve_alias_to_uri(alias: str, fields: dict[str, AacQueryField | str]) -> str:
|
|
32
|
+
"""Resolve a field alias to its `using` URI; return the alias unchanged if absent."""
|
|
33
|
+
field = fields.get(alias)
|
|
34
|
+
if field is None:
|
|
35
|
+
return alias
|
|
36
|
+
if isinstance(field, AacQueryField):
|
|
37
|
+
return field.using
|
|
38
|
+
return field
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _resolve_bucket_to_uri_set(bucket: list[AacBucketRef | str], fields: dict[str, AacQueryField | str]) -> set[str]:
|
|
42
|
+
uris: set[str] = set()
|
|
43
|
+
for ref in bucket:
|
|
44
|
+
alias = ref.field if isinstance(ref, AacBucketRef) else ref
|
|
45
|
+
uris.add(_resolve_alias_to_uri(alias, fields))
|
|
46
|
+
return uris
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_metric_uri_set(viz: CreatedVisualization) -> set[str]:
    """Return the distinct URIs referenced by the visualization's metrics bucket."""
    return _resolve_bucket_to_uri_set(viz.metrics, viz.query.fields)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_dimension_uri_set(viz: CreatedVisualization) -> set[str]:
    """Return the distinct URIs referenced by any dimension bucket.

    The view_by, segment_by, rows and columns buckets are concatenated and
    resolved together, so the result does not record which bucket a URI
    came from.
    """
    all_dim_buckets = viz.view_by + viz.segment_by + viz.rows + viz.columns
    return _resolve_bucket_to_uri_set(all_dim_buckets, viz.query.fields)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def uri_to_display_name(uri: str) -> str:
    """Turn a URI into a readable name.

    Everything up to and including the first "/" is dropped, "." becomes
    " - " and "_" becomes a space: 'metric/net_sales' -> 'net sales',
    'label/date.month' -> 'date - month'. A URI without "/" is used whole.
    """
    _prefix, slash, remainder = uri.partition("/")
    name = remainder if slash else uri
    # The two substitutions never produce each other's input, so their order is free.
    return name.replace("_", " ").replace(".", " - ")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_cross_references(viz: CreatedVisualization) -> tuple[bool, list[str]]:
    """Check that ranking filters point at the right kinds of field.

    For every ranking filter, `using` must resolve to a metric/ or fact/ URI
    and, when present, `attribute` must resolve to a label/ or attribute/ URI.
    Filters of any other type are ignored.

    Returns:
        (ok, errors): ``ok`` is True when no problem was found; ``errors``
        holds one message per problem.
    """
    problems: list[str] = []
    aliases = viz.query.fields
    for name, spec in viz.query.filter_by.items():
        if spec.get("type") == "ranking_filter":
            metric_alias = spec.get("using", "")
            metric_uri = _resolve_alias_to_uri(metric_alias, aliases)
            if not metric_uri.startswith(("metric/", "fact/")):
                problems.append(
                    f"ranking filter '{name}': using='{metric_alias}' "
                    f"resolves to '{metric_uri}' — expected a metric/ or fact/ URI"
                )
            if "attribute" not in spec:
                continue
            attr_alias = spec["attribute"]
            attr_uri = _resolve_alias_to_uri(attr_alias, aliases)
            if not attr_uri.startswith(("label/", "attribute/")):
                problems.append(
                    f"ranking filter '{name}': attribute='{attr_alias}' "
                    f"resolves to '{attr_uri}' — expected a label/ or attribute/ URI"
                )
    return not problems, problems
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _normalize_date_filter(filter_dict: dict, _fields: dict) -> dict:
|
|
90
|
+
return {
|
|
91
|
+
"type": "date_filter",
|
|
92
|
+
"dataset_uri": filter_dict.get("using", ""),
|
|
93
|
+
"from": filter_dict.get("from"),
|
|
94
|
+
"to": filter_dict.get("to"),
|
|
95
|
+
"granularity": filter_dict.get("granularity"),
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _normalize_ranking_filter(filter_dict: dict, fields: dict[str, AacQueryField | str]) -> dict:
    """Reduce a ranking filter to the keys that are compared.

    `using` and `attribute` are resolved through ``fields`` into "metric_uri"
    and "dim_uri" (an absent key is resolved as the empty alias). `top` and
    `bottom` are carried over only when present in the filter.
    """
    normalized: dict = {
        "type": "ranking_filter",
        "metric_uri": _resolve_alias_to_uri(filter_dict.get("using", ""), fields),
        "dim_uri": _resolve_alias_to_uri(filter_dict.get("attribute", ""), fields),
    }
    for bound in ("top", "bottom"):
        if bound in filter_dict:
            normalized[bound] = filter_dict[bound]
    return normalized
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _normalize_attribute_filter(filter_dict: dict, _fields: dict) -> dict:
|
|
113
|
+
raw_state = filter_dict.get("state") or {}
|
|
114
|
+
state = {k: v for k, v in raw_state.items() if v}
|
|
115
|
+
return {
|
|
116
|
+
"type": "attribute_filter",
|
|
117
|
+
"field_uri": filter_dict.get("using", ""),
|
|
118
|
+
"state": state,
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _split_and_normalize_filters(viz: CreatedVisualization) -> tuple[set[str], set[str], set[str]]:
    """Bucket a visualization's filters by type as canonical JSON strings.

    Each date, ranking or attribute filter is normalized and serialized with
    sorted keys so that equal filters yield equal strings. Filters of any
    other type are ignored.

    Returns:
        (date filters, ranking filters, attribute filters).
    """
    date_set: set[str] = set()
    ranking_set: set[str] = set()
    attr_set: set[str] = set()
    # (filter type, normalizer, destination set)
    routes = (
        ("date_filter", _normalize_date_filter, date_set),
        ("ranking_filter", _normalize_ranking_filter, ranking_set),
        ("attribute_filter", _normalize_attribute_filter, attr_set),
    )
    fields = viz.query.fields
    for spec in viz.query.filter_by.values():
        kind = spec.get("type")
        for known_kind, normalize, bucket in routes:
            if kind == known_kind:
                bucket.add(json.dumps(normalize(spec, fields), sort_keys=True))
                break
    return date_set, ranking_set, attr_set
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def check_filters(expected: CreatedVisualization, actual: CreatedVisualization) -> FilterScores:
    """Compare the normalized filters of two visualizations, category by category.

    A category is OK only when the actual and expected sets of normalized
    filters are exactly equal.
    """
    expected_sets = _split_and_normalize_filters(expected)
    actual_sets = _split_and_normalize_filters(actual)
    # Both tuples are ordered (date, ranking, attribute).
    date_ok, ranking_ok, attribute_ok = (
        found == wanted for found, wanted in zip(actual_sets, expected_sets)
    )
    return FilterScores(date_ok=date_ok, ranking_ok=ranking_ok, attribute_ok=attribute_ok)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _normalize_viz_type(raw_type: str) -> str:
    """Map a chart-type name to its canonical token.

    Names listed in ``_AAC_TYPE_MAP`` use the mapped token; any other name has
    "_chart" removed and is upper-cased.
    """
    fallback = raw_type.replace("_chart", "").upper()
    return _AAC_TYPE_MAP.get(raw_type, fallback)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def check_viz_type(expected: CreatedVisualization, actual: CreatedVisualization) -> bool:
    """Report whether the actual visualization has the expected chart type.

    Returns:
        True when no type is expected, or when both types normalize to the
        same canonical token; False when a type is expected but the actual
        visualization has none, or when the tokens differ.
    """
    if not expected.type:
        # Nothing to enforce.
        return True
    if not actual.type:
        # A missing type is a mismatch; normalizing None would raise instead.
        return False
    return _normalize_viz_type(expected.type) == _normalize_viz_type(actual.type)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""HTTP client for the dedicated dashboard-summary endpoint.
|
|
3
|
+
|
|
4
|
+
Unlike the conversational chat skill, this endpoint executes the AFM for each
|
|
5
|
+
visualization server-side and returns a plain synchronous JSON summary — no SSE
|
|
6
|
+
stream and no client-side ``result_id`` wrangling. The response is adapted into
|
|
7
|
+
a ``ChatResult`` (summary text -> ``text_response``) so the existing
|
|
8
|
+
LLM-as-judge evaluators can score it unchanged.
|
|
9
|
+
|
|
10
|
+
Endpoint (gen-ai service):
|
|
11
|
+
POST /api/v1/ai/workspaces/{workspace_id}/summary
|
|
12
|
+
|
|
13
|
+
If the route is ever renamed (e.g. to ``/summarize``), change ``_PATH`` only.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
19
|
+
|
|
20
|
+
_PATH = "/api/v1/ai/workspaces/{workspace_id}/summary"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SummaryClient:
    """Single-shot client for the dashboard-summary endpoint.

    The instance owns an ``httpx.Client``. Release it with :meth:`close`, or
    use the instance as a context manager so it is closed even when an error
    occurs::

        with SummaryClient(host, token, workspace_id) as client:
            result = client.ask(item)
    """

    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
        """Prepare the endpoint URL, the bearer-token header and the HTTP client.

        Args:
            host: Base URL of the service; trailing slashes are ignored.
            token: Token sent as ``Authorization: Bearer <token>``.
            workspace_id: Workspace id substituted into the endpoint path.
            timeout: Timeout passed to ``httpx.Client``.
        """
        self._url = f"{host.rstrip('/')}{_PATH.format(workspace_id=workspace_id)}"
        self._auth = {"Authorization": f"Bearer {token}"}
        self._client = httpx.Client(timeout=timeout)

    def __enter__(self) -> "SummaryClient":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def ask(self, item: DatasetItem) -> ChatResult:
        """Request a summary for one dataset item and adapt it to a ChatResult.

        Raises:
            ValueError: If the item has no ``summary_input``.

        A non-success HTTP status raises via ``raise_for_status()``.
        """
        si = item.summary_input
        if si is None:
            raise ValueError(f"dashboard_summary item '{item.id}' is missing required 'summary_input'.")

        body: dict = {"dashboardId": si.dashboard_id}
        # Optional request fields are sent only when the item provides them.
        optional_fields = (
            ("visualizations", si.visualizations),
            ("filterContext", si.filter_context),
            ("tabId", si.tab_id),
            ("formatHint", si.format_hint),
        )
        for key, value in optional_fields:
            if value is not None:
                body[key] = value

        resp = self._client.post(self._url, json=body, headers={**self._auth, "Content-Type": "application/json"})
        resp.raise_for_status()
        data = resp.json()
        # A missing or null summary is treated as an empty text response.
        summary = data.get("summary") or ""
        return ChatResult.model_validate({"textResponse": summary})

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()
|