gooddata-eval 1.68.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gooddata_eval/__init__.py +6 -0
- gooddata_eval/_version.py +7 -0
- gooddata_eval/cli/__init__.py +1 -0
- gooddata_eval/cli/main.py +382 -0
- gooddata_eval/core/__init__.py +1 -0
- gooddata_eval/core/chat/__init__.py +1 -0
- gooddata_eval/core/chat/sse_client.py +181 -0
- gooddata_eval/core/config.py +20 -0
- gooddata_eval/core/connection.py +33 -0
- gooddata_eval/core/dataset/__init__.py +1 -0
- gooddata_eval/core/dataset/langfuse_source.py +123 -0
- gooddata_eval/core/dataset/local.py +39 -0
- gooddata_eval/core/evaluators/__init__.py +67 -0
- gooddata_eval/core/evaluators/_deep_subset.py +35 -0
- gooddata_eval/core/evaluators/_llm_judge.py +66 -0
- gooddata_eval/core/evaluators/_text_utils.py +11 -0
- gooddata_eval/core/evaluators/alert_skill.py +128 -0
- gooddata_eval/core/evaluators/base.py +24 -0
- gooddata_eval/core/evaluators/general_question.py +34 -0
- gooddata_eval/core/evaluators/guardrail.py +52 -0
- gooddata_eval/core/evaluators/metric_skill.py +58 -0
- gooddata_eval/core/evaluators/search_tool.py +40 -0
- gooddata_eval/core/evaluators/summary.py +96 -0
- gooddata_eval/core/evaluators/visualization.py +156 -0
- gooddata_eval/core/langfuse/__init__.py +1 -0
- gooddata_eval/core/langfuse/sink.py +178 -0
- gooddata_eval/core/models.py +116 -0
- gooddata_eval/core/reporting/__init__.py +1 -0
- gooddata_eval/core/reporting/console.py +117 -0
- gooddata_eval/core/reporting/json_report.py +81 -0
- gooddata_eval/core/runner.py +214 -0
- gooddata_eval/core/scoring.py +155 -0
- gooddata_eval/core/summary/__init__.py +1 -0
- gooddata_eval/core/summary/http_client.py +54 -0
- gooddata_eval/core/workspace.py +262 -0
- gooddata_eval-1.68.0.dist-info/METADATA +275 -0
- gooddata_eval-1.68.0.dist-info/RECORD +40 -0
- gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
- gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
- gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""`gd-eval` command-line entry point."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
from gooddata_api_client.exceptions import ApiException
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
|
|
14
|
+
from gooddata_eval.core.chat.sse_client import ChatClient
|
|
15
|
+
from gooddata_eval.core.config import RunConfig
|
|
16
|
+
from gooddata_eval.core.connection import ConnectionError_, resolve_connection
|
|
17
|
+
from gooddata_eval.core.dataset.local import load_local_dataset
|
|
18
|
+
from gooddata_eval.core.langfuse.sink import LangfuseSink
|
|
19
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
20
|
+
from gooddata_eval.core.reporting.console import render_comparison, render_console
|
|
21
|
+
from gooddata_eval.core.reporting.json_report import write_multi_model_report
|
|
22
|
+
from gooddata_eval.core.runner import ItemReport, run_items
|
|
23
|
+
from gooddata_eval.core.summary.http_client import SummaryClient
|
|
24
|
+
from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
|
|
25
|
+
|
|
26
|
+
# Exit code returned by the CLI when a command completes successfully.
_EXIT_OK = 0
# Exit code returned for operational failures: invalid option values, errors
# caught in main(), or a run in which no model was evaluated.
_EXIT_OPERATIONAL_ERROR = 2
# test_kind value whose items _RoutingBackend sends to the summary backend
# instead of the chat backend.
_SUMMARY_TEST_KIND = "dashboard_summary"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class _RoutingBackend:
|
|
32
|
+
"""Dispatch each item to the right backend by test_kind.
|
|
33
|
+
|
|
34
|
+
`dashboard_summary` items go to the dedicated summary endpoint; everything
|
|
35
|
+
else uses the conversational chat endpoint.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(self, chat: ChatClient, summary: SummaryClient):
|
|
39
|
+
self._chat = chat
|
|
40
|
+
self._summary = summary
|
|
41
|
+
|
|
42
|
+
def ask(self, item: DatasetItem) -> ChatResult:
|
|
43
|
+
if item.test_kind == _SUMMARY_TEST_KIND:
|
|
44
|
+
return self._summary.ask(item)
|
|
45
|
+
return self._chat.ask(item)
|
|
46
|
+
|
|
47
|
+
def close(self) -> None:
|
|
48
|
+
for backend in (self._chat, self._summary):
|
|
49
|
+
if hasattr(backend, "close"):
|
|
50
|
+
backend.close()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
54
|
+
parser = argparse.ArgumentParser(prog="gd-eval", description="Evaluate the GoodData AI agent.")
|
|
55
|
+
sub = parser.add_subparsers(dest="command", required=True)
|
|
56
|
+
|
|
57
|
+
run = sub.add_parser("run", help="Run an evaluation dataset.")
|
|
58
|
+
run.add_argument("--host", help="GoodData host URL.")
|
|
59
|
+
run.add_argument("--token", help="API token (or set GOODDATA_TOKEN).")
|
|
60
|
+
run.add_argument("--profile", help="Profile name in ~/.gooddata/profiles.yaml.")
|
|
61
|
+
run.add_argument("--workspace", required=True, help="Workspace id.")
|
|
62
|
+
source = run.add_mutually_exclusive_group(required=True)
|
|
63
|
+
source.add_argument("--dataset", help="Path to a folder of dataset JSON files.")
|
|
64
|
+
source.add_argument("--langfuse-dataset", dest="langfuse_dataset", help="Langfuse dataset name.")
|
|
65
|
+
run.add_argument(
|
|
66
|
+
"--model",
|
|
67
|
+
action="append",
|
|
68
|
+
dest="models",
|
|
69
|
+
metavar="MODEL",
|
|
70
|
+
help=(
|
|
71
|
+
"Model id to evaluate (e.g. --model gpt-5.2). "
|
|
72
|
+
"Prefix with provider name or id to disambiguate: "
|
|
73
|
+
"--model ProviderName/gpt-5.2 or --model provider_id/gpt-5.2. "
|
|
74
|
+
"Repeat to compare multiple models. "
|
|
75
|
+
"Default: workspace's current active model."
|
|
76
|
+
),
|
|
77
|
+
)
|
|
78
|
+
run.add_argument("--runs", type=int, default=2, help="Independent runs per item (pass@K). Default 2.")
|
|
79
|
+
run.add_argument(
|
|
80
|
+
"--concurrency",
|
|
81
|
+
type=int,
|
|
82
|
+
default=1,
|
|
83
|
+
help="Number of items evaluated concurrently (default 1 = sequential). "
|
|
84
|
+
"Increase to load-test the agent under simultaneous requests.",
|
|
85
|
+
)
|
|
86
|
+
run.add_argument("--json", dest="json_path", help="Write a JSON report to this path.")
|
|
87
|
+
run.add_argument("--quiet", action="store_true", help="Suppress per-item progress output.")
|
|
88
|
+
run.add_argument(
|
|
89
|
+
"--langfuse",
|
|
90
|
+
action="store_true",
|
|
91
|
+
help="Log scores and traces to Langfuse (requires --langfuse-dataset and LANGFUSE_* env vars).",
|
|
92
|
+
)
|
|
93
|
+
models_cmd = sub.add_parser("models", help="List LLM providers and models configured in the org.")
|
|
94
|
+
models_cmd.add_argument("--host", help="GoodData host URL.")
|
|
95
|
+
models_cmd.add_argument("--token", help="API token (or set GOODDATA_TOKEN).")
|
|
96
|
+
models_cmd.add_argument("--profile", help="Profile name in ~/.gooddata/profiles.yaml.")
|
|
97
|
+
models_cmd.add_argument(
|
|
98
|
+
"--workspace",
|
|
99
|
+
help="Workspace id. When provided, marks the currently active model.",
|
|
100
|
+
)
|
|
101
|
+
return parser
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse *argv* (the arguments after the program name) with the `gd-eval` parser."""
    cli_parser = _build_parser()
    return cli_parser.parse_args(argv)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _truncate(text: str, limit: int = 80) -> str:
|
|
109
|
+
return text if len(text) <= limit else text[: limit - 1] + "…"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _parse_model_arg(val: str) -> tuple[str | None, str]:
|
|
113
|
+
"""Parse a model argument that may include a provider prefix.
|
|
114
|
+
|
|
115
|
+
Accepts two forms:
|
|
116
|
+
"gpt-5.2" → (None, "gpt-5.2")
|
|
117
|
+
"ProviderName/gpt-5.2" → ("ProviderName", "gpt-5.2")
|
|
118
|
+
"provider_id.../model_id" → ("provider_id...", "model_id")
|
|
119
|
+
|
|
120
|
+
The provider part (if present) is passed to resolve_and_activate and
|
|
121
|
+
accepted as either a provider name or provider id.
|
|
122
|
+
"""
|
|
123
|
+
if "/" in val:
|
|
124
|
+
provider_ref, _, model_id = val.partition("/")
|
|
125
|
+
return provider_ref.strip() or None, model_id.strip()
|
|
126
|
+
return None, val
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _make_progress_callbacks(console: Console):
    """Build (on_item_start, on_run_done, on_item_done) callbacks that stream progress.

    Every callback prints one line to *console*, prefixed with the item's
    "[index/total]" position.
    """

    def on_item_start(index: int, total: int, item: DatasetItem) -> None:
        # Announce the item with its id and a shortened question.
        console.print(f"[dim]\\[{index}/{total}][/dim] [cyan]{item.id}[/cyan] {_truncate(item.question)}")

    def on_run_done(index: int, total: int, run_index: int, runs: int, passed: bool, latency: float) -> None:
        # One line per individual run: pass/fail plus its latency.
        if passed:
            tag = "[green]pass[/green]"
        else:
            tag = "[red]fail[/red]"
        console.print(f"[dim]\\[{index}/{total}][/dim] run {run_index}/{runs} {tag} [dim]{latency:.2f}s[/dim]")

    def on_item_done(index: int, total: int, report: ItemReport) -> None:
        # Skipped items get a bare SKIP tag; all others also get a timing/quality suffix.
        if report.skipped:
            tag, suffix = "[yellow]SKIP[/yellow]", ""
        else:
            if report.error:
                tag = "[red]ERR [/red]"
            elif report.pass_at_k:
                tag = "[green]PASS[/green]"
            else:
                tag = "[red]FAIL[/red]"
            quality_str = f"{report.quality_score:.0%}"
            suffix = (
                f" [dim]({report.latency_s:.2f}s total, {report.avg_latency_s:.2f}s avg, "
                f"quality={quality_str}, {report.runs} run(s))[/dim]"
            )
        console.print(f"[dim]\\[{index}/{total}][/dim] -> {tag} [cyan]{report.id}[/cyan]{suffix}")

    return on_item_start, on_run_done, on_item_done
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _load_dataset(config: RunConfig):
    """Load the evaluation items named by *config*.

    A local dataset folder takes priority; otherwise the Langfuse dataset is
    loaded.

    Raises:
        ValueError: neither a dataset folder nor a Langfuse dataset is set.
    """
    folder = config.dataset_folder
    if folder is not None:
        return load_local_dataset(folder)

    # Imported only on the Langfuse path, after the local-folder case has returned.
    from gooddata_eval.core.dataset.langfuse_source import load_langfuse_dataset  # noqa: PLC0415

    dataset_name = config.langfuse_dataset
    if dataset_name is None:  # pragma: no cover - argparse mutually-exclusive group guarantees one is set
        raise ValueError("Either --dataset or --langfuse-dataset is required.")
    return load_langfuse_dataset(dataset_name)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _list_models(host: str, token: str, workspace_id: str | None) -> int:
    """Print a table of all LLM providers and their models.

    When *workspace_id* is given, the workspace's active provider/model is
    flagged in the "Active" column.

    Returns:
        `_EXIT_OK`, including when no provider is configured.
    """
    from gooddata_eval.core.workspace import WorkspaceModelController  # noqa: PLC0415

    controller = WorkspaceModelController(host, token, workspace_id or "")
    # {provider_id: {name, models: [{id, family}]}}
    providers = controller._provider_info()

    active_provider_id: str | None = None
    active_model_id: str | None = None
    # The active model is only looked up when a workspace was named.
    current = controller.get_active() if workspace_id else None
    if current:
        active_provider_id = current.provider_id
        active_model_id = current.default_model_id

    console = Console()
    if not providers:
        console.print("[yellow]No LLM providers configured in this organisation.[/yellow]")
        return _EXIT_OK

    title_suffix = f" (workspace: {workspace_id})" if workspace_id else ""
    table = Table(title=f"LLM Providers and Models{title_suffix}")
    for heading in ("Provider", "Provider ID", "Model ID", "Family", "Active"):
        table.add_column(heading)

    def display_order(entry):
        # Sort by display name, falling back to the provider id when no name is set.
        provider_key, details = entry
        return details.get("name") or provider_key

    for provider_id, details in sorted(providers.items(), key=display_order):
        provider_name = details.get("name") or provider_id
        model_entries = details.get("models") or []

        if not model_entries:
            # Providers without models still get one row so they remain visible.
            marker = "◀" if provider_id == active_provider_id else ""
            table.add_row(provider_name, provider_id, "[dim](none listed)[/dim]", "", marker)
            continue

        for position, entry in enumerate(model_entries):
            if isinstance(entry, dict):
                model_id = entry.get("id", "?")
                family = entry.get("family", "")
            else:
                model_id, family = str(entry), ""
            is_active = provider_id == active_provider_id and model_id == active_model_id
            # Provider name/id are shown on the provider's first row only.
            first_row = position == 0
            table.add_row(
                provider_name if first_row else "",
                provider_id if first_row else "",
                model_id,
                family,
                "[green]◀ active[/green]" if is_active else "",
            )

    console.print(table)
    return _EXIT_OK
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _run(config: RunConfig) -> int:
    """Evaluate the dataset against each requested model and report the results.

    For every entry in ``config.models`` (or once with no model when the list
    is empty) the model is resolved and activated through the workspace
    controller, all items are run, and a console report is rendered. Models
    that fail to resolve are skipped with a warning. The model that was
    active before the run is restored at the end, whether or not an
    exception occurred.

    Returns:
        `_EXIT_OPERATIONAL_ERROR` when --langfuse is used without a Langfuse
        dataset or when no model produced a report; `_EXIT_OK` otherwise.
    """
    if config.log_to_langfuse and config.langfuse_dataset is None:
        print(
            "error: --langfuse requires --langfuse-dataset (local datasets have no Langfuse item ids to link to).",
            file=sys.stderr,
        )
        return _EXIT_OPERATIONAL_ERROR

    items = _load_dataset(config)
    models = config.models or []
    # One UTC timestamp (minute resolution) shared by the run names of all models.
    run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M")
    # With no explicit model the loop below still runs once.
    n_models = len(models) if models else 1

    controller = WorkspaceModelController(config.host, config.token, config.workspace_id)
    # Remembered so the finally block can put the workspace back as it was.
    original_active = controller.get_active()

    # Progress goes to stderr; None disables all progress output (--quiet).
    progress_console = Console(stderr=True) if not config.quiet else None
    if progress_console:
        multi_suffix = f" — {n_models} model(s)" if n_models > 1 else ""
        progress_console.print(f"Evaluating {len(items)} item(s) on workspace '{config.workspace_id}'{multi_suffix}")

    reports: list = []
    try:
        # `[None]` stands for "no model requested"; it becomes (None, "") below.
        for k, model_id in enumerate(models or [None], start=1):
            provider_ref, bare_model_id = _parse_model_arg(model_id or "")
            effective_provider = provider_ref
            try:
                # An empty model id is passed as None.
                resolved = controller.resolve_and_activate(bare_model_id or None, effective_provider)
            except (ModelResolutionError, httpx.HTTPError, ApiException, RuntimeError) as exc:
                # A model that cannot be activated is skipped; remaining models still run.
                print(f"warning: skipping model '{model_id}': {exc}", file=sys.stderr)
                continue

            if progress_console:
                if n_models > 1:
                    progress_console.print(f"\n── Model {k}/{n_models}: {resolved.model_id} " + "─" * 40)
                else:
                    switched = " [switched active provider]" if resolved.switched else ""
                    provider_display = resolved.provider_name or resolved.provider_id
                    progress_console.print(f"Provider={provider_display}, model={resolved.model_id}{switched}")

            run_name = f"gd-eval-{run_ts}-{resolved.model_id}"
            if progress_console and config.log_to_langfuse:
                progress_console.print(f"Logging to Langfuse run '{run_name}'...")

            # Progress callbacks stay None in quiet mode.
            on_item_start, on_run_done, on_item_done = (None, None, None)
            if progress_console:
                on_item_start, on_run_done, on_item_done = _make_progress_callbacks(progress_console)

            on_langfuse_item_done = None
            if config.log_to_langfuse:
                # Guaranteed by the check at the top of this function.
                assert config.langfuse_dataset is not None
                sink = LangfuseSink(
                    dataset_name=config.langfuse_dataset,
                    run_name=run_name,
                    model_id=resolved.model_id,
                    provider_type=resolved.provider_type,
                )

                # Values are bound as parameter defaults so the callback keeps this
                # iteration's sink. `_model_id` and `_provider_type` are bound the
                # same way but are not used in the body.
                def on_langfuse_item_done(
                    index: int,
                    total: int,
                    report: ItemReport,
                    _sink: LangfuseSink = sink,
                    _model_id: str = resolved.model_id,
                    _provider_type: str = resolved.provider_type,
                ) -> None:
                    _sink.log_item(report, dataset_item_id=report.id)

            # A fresh pair of HTTP backends is created for every model.
            backend = _RoutingBackend(
                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
                SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
            )
            try:
                report = run_items(
                    items,
                    backend,
                    runs=config.runs,
                    model=resolved.model_id,
                    provider_name=resolved.provider_name or resolved.provider_id,
                    provider_type=resolved.provider_type,
                    workspace_id=config.workspace_id,
                    on_item_start=on_item_start,
                    on_run_done=on_run_done,
                    on_item_done=on_item_done,
                    on_langfuse_item_done=on_langfuse_item_done,
                    concurrency=config.concurrency,
                )
            finally:
                # Close the backends even when run_items raises.
                if hasattr(backend, "close"):
                    backend.close()

            # Surface skipped items once per model, listing the distinct test kinds.
            skipped_kinds = sorted({i.test_kind for i in report.items if i.skipped})
            if skipped_kinds:
                print(
                    f"warning: skipped {sum(i.skipped for i in report.items)} item(s) with "
                    f"unsupported test_kind(s): {', '.join(skipped_kinds)}",
                    file=sys.stderr,
                )

            render_console(report)
            reports.append(report)
    finally:
        # Best-effort restore: a failure here is reported but does not mask the run's outcome.
        try:
            controller.restore(original_active)
        except Exception as _restore_exc:
            print(
                f"warning: failed to restore workspace active model: {_restore_exc}",
                file=sys.stderr,
            )

    if not reports:
        print("error: no models evaluated successfully.", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR

    # The side-by-side comparison only makes sense with at least two reports.
    if len(reports) > 1:
        render_comparison(reports)

    if config.json_path is not None:
        write_multi_model_report(reports, config.json_path)

    return _EXIT_OK
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the `gd-eval` command line and return its process exit code.

    Args:
        argv: Arguments after the program name; defaults to ``sys.argv[1:]``.

    Returns:
        `_EXIT_OK` on success. `_EXIT_OPERATIONAL_ERROR` when --concurrency or
        --runs is below 1, or when one of the exception types listed below is
        raised while resolving the connection or executing the command; the
        message is printed to stderr.
    """
    args = parse_args(argv if argv is not None else sys.argv[1:])

    # Only the `run` sub-command defines these options, hence the hasattr guards.
    if hasattr(args, "concurrency") and args.concurrency < 1:
        print("error: --concurrency must be >= 1.", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR
    # --runs is the K of pass@K; reject values that would request no run at all.
    if hasattr(args, "runs") and args.runs < 1:
        print("error: --runs must be >= 1.", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR

    try:
        host, token = resolve_connection(host=args.host, token=args.token, profile=args.profile)
        if args.command == "models":
            return _list_models(host, token, getattr(args, "workspace", None))
        config = RunConfig(
            host=host,
            token=token,
            workspace_id=args.workspace,
            dataset_folder=Path(args.dataset) if args.dataset else None,
            langfuse_dataset=args.langfuse_dataset,
            models=args.models or [],
            runs=args.runs,
            concurrency=args.concurrency,
            json_path=Path(args.json_path) if args.json_path else None,
            log_to_langfuse=args.langfuse,
            quiet=args.quiet,
        )
        return _run(config)
    except (
        ConnectionError_,
        ModelResolutionError,
        FileNotFoundError,
        ValueError,
        httpx.HTTPError,
        ApiException,
        RuntimeError,
    ) as e:
        # Known operational failures become a one-line error instead of a traceback.
        print(f"error: {e}", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# When executed as a script, exit the process with the code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""SSE chat client for the agentic AI conversations API.
|
|
3
|
+
|
|
4
|
+
Ported from gdc-nas tavern-e2e app/sse_client.py (httpx instead of requests).
|
|
5
|
+
|
|
6
|
+
Why not gooddata_sdk.compute.ai_chat / ai_chat_stream? Those target the legacy
|
|
7
|
+
``/api/v1/actions/workspaces/{ws}/ai/chat[Stream]`` endpoint and expose a different
|
|
8
|
+
visualization shape (``metrics``/``dimensionality``). This evaluator scores the
|
|
9
|
+
*agentic* visualization (AAC ``query.fields`` shape) returned by the newer
|
|
10
|
+
``/api/v1/ai/workspaces/{ws}/chat/conversations`` endpoint, which is not yet
|
|
11
|
+
present in the generated api-client. When that endpoint lands in the SDK, this
|
|
12
|
+
module is the single place to swap — the runner only depends on the ChatBackend
|
|
13
|
+
protocol, not on this class.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any, Iterable
|
|
19
|
+
|
|
20
|
+
import httpx
|
|
21
|
+
|
|
22
|
+
from gooddata_eval.core.models import ChatResult, DatasetItem
|
|
23
|
+
|
|
24
|
+
# Prefix of the stream lines that carry an event payload; parse_sse_lines
# skips every line that does not start with it.
SSE_DATA_PREFIX = "data: "
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class _SseAccumulator:
    """Mutable state collected while one SSE chat stream is parsed."""

    # Text from assistant "text" events and from "text" parts of multipart
    # events, in arrival order.
    text_parts: list[str] = field(default_factory=list)
    # Text from multipart "text" parts only; joined to form the
    # "reasoning" of createdVisualizations.
    viz_reasoning_parts: list[str] = field(default_factory=list)
    # Visualization objects taken from multipart "visualization" parts.
    visualizations: list[dict[str, Any]] = field(default_factory=list)
    # One entry per assistant toolCall event, with the keys functionName,
    # functionArguments (a JSON string) and result.
    tool_call_events: list[dict[str, Any]] = field(default_factory=list)
    # callId -> index into tool_call_events, used to attach a later toolResult.
    call_id_to_event_index: dict[str, int] = field(default_factory=dict)
    # {"summary": ...} entries from assistant "reasoning" events.
    reasoning_steps: list[dict[str, Any]] = field(default_factory=list)
    # Visualization definitions passed as arguments to
    # create_adhoc_visualization tool calls.
    adhoc_viz_args: list[dict[str, Any]] = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _handle_text(content: dict[str, Any], acc: _SseAccumulator) -> None:
|
|
39
|
+
text = content.get("text", "")
|
|
40
|
+
if text:
|
|
41
|
+
acc.text_parts.append(text)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _handle_multipart(content: dict[str, Any], acc: _SseAccumulator) -> None:
|
|
45
|
+
for part in content.get("parts", []):
|
|
46
|
+
ptype = part.get("type")
|
|
47
|
+
if ptype == "text":
|
|
48
|
+
t = part.get("text", "")
|
|
49
|
+
if t:
|
|
50
|
+
acc.text_parts.append(t)
|
|
51
|
+
acc.viz_reasoning_parts.append(t)
|
|
52
|
+
elif ptype == "visualization" and part.get("visualization"):
|
|
53
|
+
acc.visualizations.append(part["visualization"])
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _handle_reasoning(content: dict[str, Any], acc: _SseAccumulator) -> None:
|
|
57
|
+
summary = content.get("summary", "")
|
|
58
|
+
if summary:
|
|
59
|
+
acc.reasoning_steps.append({"summary": summary})
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _handle_tool_call(content: dict[str, Any], acc: _SseAccumulator) -> None:
|
|
63
|
+
call_id = content.get("callId", "")
|
|
64
|
+
acc.call_id_to_event_index[call_id] = len(acc.tool_call_events)
|
|
65
|
+
acc.tool_call_events.append(
|
|
66
|
+
{
|
|
67
|
+
"functionName": content.get("name", ""),
|
|
68
|
+
"functionArguments": json.dumps(content.get("arguments", {})),
|
|
69
|
+
"result": None,
|
|
70
|
+
}
|
|
71
|
+
)
|
|
72
|
+
# Stash visualization definition from create_adhoc_visualization so we can
|
|
73
|
+
# evaluate the agent's intended answer even when the data source call fails.
|
|
74
|
+
if content.get("name") == "create_adhoc_visualization":
|
|
75
|
+
viz = (content.get("arguments") or {}).get("visualization")
|
|
76
|
+
if viz and isinstance(viz, dict):
|
|
77
|
+
acc.adhoc_viz_args.append(viz)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _handle_tool_result(content: dict[str, Any], acc: _SseAccumulator) -> None:
|
|
81
|
+
call_id = content.get("callId", "")
|
|
82
|
+
idx = acc.call_id_to_event_index.get(call_id)
|
|
83
|
+
if idx is not None:
|
|
84
|
+
acc.tool_call_events[idx]["result"] = content.get("result", "")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _build_chat_result(acc: _SseAccumulator) -> ChatResult:
    """Turn the accumulated stream state into a validated ChatResult.

    The text parts are joined with newlines (None when there is no text) and
    the tool call events are passed through. "createdVisualizations" is only
    included when a visualization is available.
    """
    text_response = "\n".join(acc.text_parts) or None
    payload: dict[str, Any] = {
        "textResponse": text_response,
        "toolCallEvents": acc.tool_call_events,
    }

    # Prefer visualizations delivered in the stream.
    # Fallback: the agent produced a correct visualization definition via
    # create_adhoc_visualization but the call failed (e.g. data source not
    # accessible). The last attempt is the agent's best answer.
    viz_objects = acc.visualizations or acc.adhoc_viz_args[-1:]
    if viz_objects:
        payload["createdVisualizations"] = {
            "objects": viz_objects,
            "reasoning": "\n".join(acc.viz_reasoning_parts),
        }
    return ChatResult.model_validate(payload)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def parse_sse_lines(lines: Iterable[str]) -> ChatResult:
    """Parse an SSE stream (iterable of decoded lines) into a ChatResult.

    Only lines starting with `SSE_DATA_PREFIX` are considered; bytes lines are
    decoded as UTF-8 first. Payloads that are not valid JSON, that are not
    JSON objects, or that carry no usable "item" object are skipped.

    Raises:
        RuntimeError: an event object contains a "statusCode" key.
    """
    acc = _SseAccumulator()
    for raw_line in lines:
        line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
        # Blank lines, "event: " lines and anything else without the data prefix
        # carry no payload.
        if not line or not line.startswith(SSE_DATA_PREFIX):
            continue
        try:
            event_data = json.loads(line[len(SSE_DATA_PREFIX) :])
        except json.JSONDecodeError:
            continue
        # json.loads may return None, a number, a string or a list; none of
        # those support the key lookups below.
        if not isinstance(event_data, dict):
            continue
        if "statusCode" in event_data:
            raise RuntimeError(f"SSE error {event_data.get('statusCode')}: {event_data.get('detail')}")
        item = event_data.get("item")
        if not item or not isinstance(item, dict):
            continue
        role = item.get("role")
        content: dict[str, Any] = item.get("content") or {}
        if not isinstance(content, dict):
            continue
        ctype = content.get("type")
        if role == "assistant":
            if ctype == "text":
                _handle_text(content, acc)
            elif ctype == "multipart":
                _handle_multipart(content, acc)
            elif ctype == "reasoning":
                _handle_reasoning(content, acc)
            elif ctype == "toolCall":
                _handle_tool_call(content, acc)
        elif role == "tool" and ctype == "toolResult":
            _handle_tool_result(content, acc)
    return _build_chat_result(acc)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class ChatClient:
    """Single-turn AI chat client over the GoodData AI conversation endpoints."""

    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
        """Prepare the conversations URL, the bearer-token header and the HTTP client."""
        root = host.rstrip("/")
        self._base = f"{root}/api/v1/ai/workspaces/{workspace_id}/chat/conversations"
        self._auth = {"Authorization": f"Bearer {token}"}
        self._client = httpx.Client(timeout=timeout)

    def _create_conversation(self) -> str:
        """Create a conversation and return its id.

        Raises:
            ValueError: the response body has no 'conversationId'.
        """
        request_headers = dict(self._auth)
        request_headers["Content-Type"] = "application/json"
        response = self._client.post(self._base, headers=request_headers)
        response.raise_for_status()
        payload = response.json()
        if "conversationId" in payload:
            return payload["conversationId"]
        raise ValueError(f"GoodData /chat/conversations response missing 'conversationId': {payload}")

    def _delete_conversation(self, conversation_id: str) -> None:
        """Delete the conversation, ignoring HTTP errors."""
        target = f"{self._base}/{conversation_id}"
        try:
            self._client.delete(target, headers=self._auth)
        except httpx.HTTPError:
            pass  # best-effort cleanup

    def _send_message(self, conversation_id: str, question: str) -> ChatResult:
        """Post *question* as a user message and parse the streamed SSE reply."""
        request_headers = dict(self._auth)
        request_headers["Accept"] = "text/event-stream"
        request_headers["Content-Type"] = "application/json"
        message = {"item": {"role": "user", "content": {"type": "text", "text": question}}}
        with self._client.stream(
            "POST",
            f"{self._base}/{conversation_id}/messages",
            json=message,
            headers=request_headers,
        ) as response:
            response.raise_for_status()
            return parse_sse_lines(response.iter_lines())

    def ask(self, item: DatasetItem) -> ChatResult:
        """Run one single-turn conversation: create, send, parse, clean up."""
        conversation_id = self._create_conversation()
        try:
            result = self._send_message(conversation_id, item.question)
        finally:
            # The conversation is removed whether or not sending succeeded.
            self._delete_conversation(conversation_id)
        return result

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Validated run configuration produced by the CLI and consumed by the runner."""
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class RunConfig:
    """Settings for one evaluation run, built by the CLI from its arguments."""

    # GoodData host URL and API token used by the HTTP clients.
    host: str
    token: str
    # Id of the workspace the evaluation runs against.
    workspace_id: str
    # Dataset source: a folder of dataset JSON files or a Langfuse dataset name.
    # The CLI sets exactly one of the two.
    dataset_folder: Path | None = None
    langfuse_dataset: str | None = None
    # Model arguments to evaluate, optionally "provider/model"; empty means the
    # workspace's current active model.
    models: list[str] = field(default_factory=list)
    # Independent runs per item (pass@K).
    runs: int = 2
    # Number of items evaluated concurrently; 1 = sequential.
    concurrency: int = 1
    # Where to write the JSON report; None writes no report.
    json_path: Path | None = None
    # Log scores and traces to Langfuse (requires langfuse_dataset).
    log_to_langfuse: bool = False
    # Suppress per-item progress output.
    quiet: bool = False
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|
|
2
|
+
"""Resolve (host, token) from explicit flags, environment, or a gooddata.yaml profile."""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from gooddata_sdk.utils import profile_content
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ConnectionError_(Exception):
    """Raised when host/token cannot be resolved.

    The trailing underscore keeps the name distinct from the built-in
    ``ConnectionError``.
    """
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def resolve_connection(host: str | None, token: str | None, profile: str | None) -> tuple[str, str]:
|
|
14
|
+
"""Resolve connection parameters.
|
|
15
|
+
|
|
16
|
+
Precedence: explicit flags > GOODDATA_TOKEN env (token only) > profile file.
|
|
17
|
+
|
|
18
|
+
Raises:
|
|
19
|
+
ConnectionError_: host or token could not be determined.
|
|
20
|
+
"""
|
|
21
|
+
resolved_host = host
|
|
22
|
+
resolved_token = token or os.environ.get("GOODDATA_TOKEN")
|
|
23
|
+
|
|
24
|
+
if profile is not None and (resolved_host is None or resolved_token is None):
|
|
25
|
+
content = profile_content(profile)
|
|
26
|
+
resolved_host = resolved_host or content.get("host")
|
|
27
|
+
resolved_token = resolved_token or content.get("token")
|
|
28
|
+
|
|
29
|
+
if not resolved_host:
|
|
30
|
+
raise ConnectionError_("Missing host. Pass --host or use a --profile that defines it.")
|
|
31
|
+
if not resolved_token:
|
|
32
|
+
raise ConnectionError_("Missing token. Pass --token, set GOODDATA_TOKEN, or use a --profile that defines it.")
|
|
33
|
+
return resolved_host, resolved_token
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2026 GoodData Corporation
|