gooddata-eval 1.68.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gooddata_eval/__init__.py +6 -0
  2. gooddata_eval/_version.py +7 -0
  3. gooddata_eval/cli/__init__.py +1 -0
  4. gooddata_eval/cli/main.py +382 -0
  5. gooddata_eval/core/__init__.py +1 -0
  6. gooddata_eval/core/chat/__init__.py +1 -0
  7. gooddata_eval/core/chat/sse_client.py +181 -0
  8. gooddata_eval/core/config.py +20 -0
  9. gooddata_eval/core/connection.py +33 -0
  10. gooddata_eval/core/dataset/__init__.py +1 -0
  11. gooddata_eval/core/dataset/langfuse_source.py +123 -0
  12. gooddata_eval/core/dataset/local.py +39 -0
  13. gooddata_eval/core/evaluators/__init__.py +67 -0
  14. gooddata_eval/core/evaluators/_deep_subset.py +35 -0
  15. gooddata_eval/core/evaluators/_llm_judge.py +66 -0
  16. gooddata_eval/core/evaluators/_text_utils.py +11 -0
  17. gooddata_eval/core/evaluators/alert_skill.py +128 -0
  18. gooddata_eval/core/evaluators/base.py +24 -0
  19. gooddata_eval/core/evaluators/general_question.py +34 -0
  20. gooddata_eval/core/evaluators/guardrail.py +52 -0
  21. gooddata_eval/core/evaluators/metric_skill.py +58 -0
  22. gooddata_eval/core/evaluators/search_tool.py +40 -0
  23. gooddata_eval/core/evaluators/summary.py +96 -0
  24. gooddata_eval/core/evaluators/visualization.py +156 -0
  25. gooddata_eval/core/langfuse/__init__.py +1 -0
  26. gooddata_eval/core/langfuse/sink.py +178 -0
  27. gooddata_eval/core/models.py +116 -0
  28. gooddata_eval/core/reporting/__init__.py +1 -0
  29. gooddata_eval/core/reporting/console.py +117 -0
  30. gooddata_eval/core/reporting/json_report.py +81 -0
  31. gooddata_eval/core/runner.py +214 -0
  32. gooddata_eval/core/scoring.py +155 -0
  33. gooddata_eval/core/summary/__init__.py +1 -0
  34. gooddata_eval/core/summary/http_client.py +54 -0
  35. gooddata_eval/core/workspace.py +262 -0
  36. gooddata_eval-1.68.0.dist-info/METADATA +275 -0
  37. gooddata_eval-1.68.0.dist-info/RECORD +40 -0
  38. gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
  39. gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
  40. gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
@@ -0,0 +1,6 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """gooddata-eval: evaluate the GoodData AI agent against your own datasets."""
3
+
4
+ from gooddata_eval._version import __version__
5
+
6
+ __all__ = ["__version__"]
@@ -0,0 +1,7 @@
1
+ # (C) 2026 GoodData Corporation
2
+ from importlib import metadata
3
+
4
def _detect_version() -> str:
    """Return the installed ``gooddata-eval`` version, or a placeholder if it is not installed."""
    try:
        return metadata.version("gooddata-eval")
    except metadata.PackageNotFoundError:
        # Running from a source checkout without installed distribution metadata.
        return "unknown-version"


__version__ = _detect_version()
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation
@@ -0,0 +1,382 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """`gd-eval` command-line entry point."""
3
+
4
+ import argparse
5
+ import sys
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+
9
+ import httpx
10
+ from gooddata_api_client.exceptions import ApiException
11
+ from rich.console import Console
12
+ from rich.table import Table
13
+
14
+ from gooddata_eval.core.chat.sse_client import ChatClient
15
+ from gooddata_eval.core.config import RunConfig
16
+ from gooddata_eval.core.connection import ConnectionError_, resolve_connection
17
+ from gooddata_eval.core.dataset.local import load_local_dataset
18
+ from gooddata_eval.core.langfuse.sink import LangfuseSink
19
+ from gooddata_eval.core.models import ChatResult, DatasetItem
20
+ from gooddata_eval.core.reporting.console import render_comparison, render_console
21
+ from gooddata_eval.core.reporting.json_report import write_multi_model_report
22
+ from gooddata_eval.core.runner import ItemReport, run_items
23
+ from gooddata_eval.core.summary.http_client import SummaryClient
24
+ from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
25
+
26
+ _EXIT_OK = 0
27
+ _EXIT_OPERATIONAL_ERROR = 2
28
+ _SUMMARY_TEST_KIND = "dashboard_summary"
29
+
30
+
31
class _RoutingBackend:
    """Backend facade that picks the chat or the summary client per dataset item.

    Items whose ``test_kind`` equals ``_SUMMARY_TEST_KIND`` are answered by the
    summary client; every other item is answered by the chat client.
    """

    def __init__(self, chat: ChatClient, summary: SummaryClient):
        self._chat = chat
        self._summary = summary

    def ask(self, item: DatasetItem) -> ChatResult:
        """Forward ``item`` to the backend that matches its ``test_kind``."""
        target = self._summary if item.test_kind == _SUMMARY_TEST_KIND else self._chat
        return target.ask(item)

    def close(self) -> None:
        """Close each wrapped backend that exposes ``close`` (chat first, then summary)."""
        for wrapped in (self._chat, self._summary):
            if not hasattr(wrapped, "close"):
                continue
            wrapped.close()
51
+
52
+
53
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``gd-eval`` argument parser with its ``run`` and ``models`` subcommands."""

    def _add_connection_flags(command: argparse.ArgumentParser) -> None:
        # Both subcommands accept the same host/token/profile trio, in this order.
        command.add_argument("--host", help="GoodData host URL.")
        command.add_argument("--token", help="API token (or set GOODDATA_TOKEN).")
        command.add_argument("--profile", help="Profile name in ~/.gooddata/profiles.yaml.")

    root = argparse.ArgumentParser(prog="gd-eval", description="Evaluate the GoodData AI agent.")
    commands = root.add_subparsers(dest="command", required=True)

    # --- gd-eval run -------------------------------------------------------
    run_cmd = commands.add_parser("run", help="Run an evaluation dataset.")
    _add_connection_flags(run_cmd)
    run_cmd.add_argument("--workspace", required=True, help="Workspace id.")

    # Exactly one dataset source must be given.
    dataset_source = run_cmd.add_mutually_exclusive_group(required=True)
    dataset_source.add_argument("--dataset", help="Path to a folder of dataset JSON files.")
    dataset_source.add_argument("--langfuse-dataset", dest="langfuse_dataset", help="Langfuse dataset name.")

    model_help = (
        "Model id to evaluate (e.g. --model gpt-5.2). "
        "Prefix with provider name or id to disambiguate: "
        "--model ProviderName/gpt-5.2 or --model provider_id/gpt-5.2. "
        "Repeat to compare multiple models. "
        "Default: workspace's current active model."
    )
    run_cmd.add_argument("--model", action="append", dest="models", metavar="MODEL", help=model_help)
    run_cmd.add_argument("--runs", type=int, default=2, help="Independent runs per item (pass@K). Default 2.")

    concurrency_help = (
        "Number of items evaluated concurrently (default 1 = sequential). "
        "Increase to load-test the agent under simultaneous requests."
    )
    run_cmd.add_argument("--concurrency", type=int, default=1, help=concurrency_help)
    run_cmd.add_argument("--json", dest="json_path", help="Write a JSON report to this path.")
    run_cmd.add_argument("--quiet", action="store_true", help="Suppress per-item progress output.")
    run_cmd.add_argument(
        "--langfuse",
        action="store_true",
        help="Log scores and traces to Langfuse (requires --langfuse-dataset and LANGFUSE_* env vars).",
    )

    # --- gd-eval models ----------------------------------------------------
    list_cmd = commands.add_parser("models", help="List LLM providers and models configured in the org.")
    _add_connection_flags(list_cmd)
    list_cmd.add_argument(
        "--workspace",
        help="Workspace id. When provided, marks the currently active model.",
    )
    return root
102
+
103
+
104
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse ``argv`` (without the program name) using the ``gd-eval`` parser."""
    cli_parser = _build_parser()
    return cli_parser.parse_args(argv)
106
+
107
+
108
def _truncate(text: str, limit: int = 80) -> str:
    """Shorten ``text`` to at most ``limit`` characters.

    Text that already fits is returned unchanged. Longer text is cut to
    ``limit - 1`` characters and an ellipsis is appended. A non-positive
    ``limit`` yields an empty string; previously it produced a result longer
    than the limit, because the slice ``text[:-1]`` kept almost everything.
    """
    if limit <= 0:
        return ""
    if len(text) <= limit:
        return text
    return text[: limit - 1] + "…"
110
+
111
+
112
+ def _parse_model_arg(val: str) -> tuple[str | None, str]:
113
+ """Parse a model argument that may include a provider prefix.
114
+
115
+ Accepts two forms:
116
+ "gpt-5.2" → (None, "gpt-5.2")
117
+ "ProviderName/gpt-5.2" → ("ProviderName", "gpt-5.2")
118
+ "provider_id.../model_id" → ("provider_id...", "model_id")
119
+
120
+ The provider part (if present) is passed to resolve_and_activate and
121
+ accepted as either a provider name or provider id.
122
+ """
123
+ if "/" in val:
124
+ provider_ref, _, model_id = val.partition("/")
125
+ return provider_ref.strip() or None, model_id.strip()
126
+ return None, val
127
+
128
+
129
def _make_progress_callbacks(console: Console):
    """Build ``(on_item_start, on_run_done, on_item_done)`` callbacks that stream progress.

    Every callback prints one Rich-markup line to ``console``. Dataset-provided
    text (item ids and questions) is escaped before being interpolated, so
    square brackets in a question are printed literally instead of being parsed
    as Rich style tags.
    """
    from rich.markup import escape  # noqa: PLC0415

    def on_item_start(index: int, total: int, item: DatasetItem) -> None:
        # Truncate first so the escape sequences themselves are never cut in half.
        question = escape(_truncate(item.question))
        console.print(f"[dim]\\[{index}/{total}][/dim] [cyan]{escape(str(item.id))}[/cyan] {question}")

    def on_run_done(index: int, total: int, run_index: int, runs: int, passed: bool, latency: float) -> None:
        tag = "[green]pass[/green]" if passed else "[red]fail[/red]"
        console.print(f"[dim]\\[{index}/{total}][/dim] run {run_index}/{runs} {tag} [dim]{latency:.2f}s[/dim]")

    def on_item_done(index: int, total: int, report: ItemReport) -> None:
        # Precedence: skipped > error > pass@K > fail.
        if report.skipped:
            tag = "[yellow]SKIP[/yellow]"
        elif report.error:
            tag = "[red]ERR [/red]"
        elif report.pass_at_k:
            tag = "[green]PASS[/green]"
        else:
            tag = "[red]FAIL[/red]"
        if report.skipped:
            # Skipped items never ran, so there are no timings or quality to show.
            suffix = ""
        else:
            quality_str = f"{report.quality_score:.0%}"
            suffix = (
                f" [dim]({report.latency_s:.2f}s total, {report.avg_latency_s:.2f}s avg, "
                f"quality={quality_str}, {report.runs} run(s))[/dim]"
            )
        console.print(f"[dim]\\[{index}/{total}][/dim] -> {tag} [cyan]{escape(str(report.id))}[/cyan]{suffix}")

    return on_item_start, on_run_done, on_item_done
159
+
160
+
161
def _load_dataset(config: RunConfig):
    """Load dataset items from the local folder if one is configured, else from Langfuse.

    Raises:
        ValueError: neither a dataset folder nor a Langfuse dataset name is set.
    """
    folder = config.dataset_folder
    if folder is not None:
        return load_local_dataset(folder)

    # Imported lazily, only on the Langfuse path.
    from gooddata_eval.core.dataset.langfuse_source import load_langfuse_dataset  # noqa: PLC0415

    dataset_name = config.langfuse_dataset
    if dataset_name is None:  # pragma: no cover - argparse mutually-exclusive group guarantees one is set
        raise ValueError("Either --dataset or --langfuse-dataset is required.")
    return load_langfuse_dataset(dataset_name)
169
+
170
+
171
def _list_models(host: str, token: str, workspace_id: str | None) -> int:
    """Print a table of all LLM providers and models; flag the active one when a workspace is given.

    Returns:
        ``_EXIT_OK`` in every non-raising case, including when no providers exist.
    """
    from gooddata_eval.core.workspace import WorkspaceModelController  # noqa: PLC0415

    controller = WorkspaceModelController(host, token, workspace_id or "")
    providers = controller._provider_info()  # {provider_id: {name, models: [{id, family}]}}

    # The active provider/model is only looked up when a workspace id was supplied.
    current_provider: str | None = None
    current_model: str | None = None
    if workspace_id:
        active = controller.get_active()
        if active:
            current_provider, current_model = active.provider_id, active.default_model_id

    console = Console()
    if not providers:
        console.print("[yellow]No LLM providers configured in this organisation.[/yellow]")
        return _EXIT_OK

    scope = f" (workspace: {workspace_id})" if workspace_id else ""
    table = Table(title=f"LLM Providers and Models{scope}")
    for heading in ("Provider", "Provider ID", "Model ID", "Family", "Active"):
        table.add_column(heading)

    def _display_order(entry):
        # Sort by display name, falling back to the provider id when unnamed.
        pid, details = entry
        return details.get("name") or pid

    for provider_id, details in sorted(providers.items(), key=_display_order):
        display_name = details.get("name") or provider_id
        listed = details.get("models") or []
        if not listed:
            # Providers without models still get one placeholder row.
            marker = "◀" if provider_id == current_provider else ""
            table.add_row(display_name, provider_id, "[dim](none listed)[/dim]", "", marker)
            continue
        for position, entry in enumerate(listed):
            if isinstance(entry, dict):
                model_id = entry.get("id", "?")
                family = entry.get("family", "")
            else:
                model_id = str(entry)
                family = ""
            is_current = provider_id == current_provider and model_id == current_model
            first_row = position == 0
            # Provider name and id are shown only on the provider's first row.
            table.add_row(
                display_name if first_row else "",
                provider_id if first_row else "",
                model_id,
                family,
                "[green]◀ active[/green]" if is_current else "",
            )

    console.print(table)
    return _EXIT_OK
220
+
221
+
222
def _run(config: RunConfig) -> int:
    """Evaluate the dataset against each requested model and report the results.

    Flow:
      1. Reject ``--langfuse`` without ``--langfuse-dataset``.
      2. Load the dataset and remember the workspace's currently active model.
      3. For each requested model (or once, with the workspace default, when none
         was requested): activate it, run all items, render the console report.
         A model that cannot be resolved or activated is skipped with a warning.
      4. Always try to restore the originally active model afterwards.
      5. Render a comparison when more than one model produced a report, and
         write the JSON report when a path was given.

    Returns:
        ``_EXIT_OK`` when at least one model was evaluated, otherwise
        ``_EXIT_OPERATIONAL_ERROR``.
    """
    if config.log_to_langfuse and config.langfuse_dataset is None:
        print(
            "error: --langfuse requires --langfuse-dataset (local datasets have no Langfuse item ids to link to).",
            file=sys.stderr,
        )
        return _EXIT_OPERATIONAL_ERROR

    items = _load_dataset(config)
    models = config.models or []
    run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M")
    n_models = len(models) if models else 1

    controller = WorkspaceModelController(config.host, config.token, config.workspace_id)
    original_active = controller.get_active()

    # Progress goes to stderr so stdout stays reserved for the rendered report.
    progress_console = Console(stderr=True) if not config.quiet else None
    # The callbacks depend only on the console, so build them once rather than per model.
    on_item_start, on_run_done, on_item_done = (None, None, None)
    if progress_console:
        multi_suffix = f" — {n_models} model(s)" if n_models > 1 else ""
        progress_console.print(f"Evaluating {len(items)} item(s) on workspace '{config.workspace_id}'{multi_suffix}")
        on_item_start, on_run_done, on_item_done = _make_progress_callbacks(progress_console)

    reports: list = []
    try:
        # `[None]` means "evaluate once with whatever model the workspace has active".
        for k, model_id in enumerate(models or [None], start=1):
            provider_ref, bare_model_id = _parse_model_arg(model_id or "")
            try:
                resolved = controller.resolve_and_activate(bare_model_id or None, provider_ref)
            except (ModelResolutionError, httpx.HTTPError, ApiException, RuntimeError) as exc:
                print(f"warning: skipping model '{model_id}': {exc}", file=sys.stderr)
                continue

            if progress_console:
                if n_models > 1:
                    progress_console.print(f"\n── Model {k}/{n_models}: {resolved.model_id} " + "─" * 40)
                else:
                    switched = " [switched active provider]" if resolved.switched else ""
                    provider_display = resolved.provider_name or resolved.provider_id
                    progress_console.print(f"Provider={provider_display}, model={resolved.model_id}{switched}")

            run_name = f"gd-eval-{run_ts}-{resolved.model_id}"
            if progress_console and config.log_to_langfuse:
                progress_console.print(f"Logging to Langfuse run '{run_name}'...")

            on_langfuse_item_done = None
            if config.log_to_langfuse:
                # Guaranteed by the guard at the top of this function; kept for type narrowing.
                assert config.langfuse_dataset is not None
                sink = LangfuseSink(
                    dataset_name=config.langfuse_dataset,
                    run_name=run_name,
                    model_id=resolved.model_id,
                    provider_type=resolved.provider_type,
                )

                def on_langfuse_item_done(
                    index: int,
                    total: int,
                    report: ItemReport,
                    _sink: LangfuseSink = sink,  # bound as a default so each model keeps its own sink
                ) -> None:
                    _sink.log_item(report, dataset_item_id=report.id)

            backend = _RoutingBackend(
                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
                SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
            )
            try:
                report = run_items(
                    items,
                    backend,
                    runs=config.runs,
                    model=resolved.model_id,
                    provider_name=resolved.provider_name or resolved.provider_id,
                    provider_type=resolved.provider_type,
                    workspace_id=config.workspace_id,
                    on_item_start=on_item_start,
                    on_run_done=on_run_done,
                    on_item_done=on_item_done,
                    on_langfuse_item_done=on_langfuse_item_done,
                    concurrency=config.concurrency,
                )
            finally:
                if hasattr(backend, "close"):
                    backend.close()

            skipped_kinds = sorted({i.test_kind for i in report.items if i.skipped})
            if skipped_kinds:
                print(
                    f"warning: skipped {sum(i.skipped for i in report.items)} item(s) with "
                    f"unsupported test_kind(s): {', '.join(skipped_kinds)}",
                    file=sys.stderr,
                )

            render_console(report)
            reports.append(report)
    finally:
        # Best-effort: a failed restore must not mask the evaluation outcome.
        try:
            controller.restore(original_active)
        except Exception as _restore_exc:
            print(
                f"warning: failed to restore workspace active model: {_restore_exc}",
                file=sys.stderr,
            )

    if not reports:
        print("error: no models evaluated successfully.", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR

    if len(reports) > 1:
        render_comparison(reports)

    if config.json_path is not None:
        write_multi_model_report(reports, config.json_path)

    return _EXIT_OK
343
+
344
+
345
def main(argv: list[str] | None = None) -> int:
    """Entry point for ``gd-eval``: parse arguments, dispatch the subcommand, map errors to exit codes.

    Args:
        argv: Arguments without the program name; defaults to ``sys.argv[1:]``.

    Returns:
        The subcommand's exit code, or ``_EXIT_OPERATIONAL_ERROR`` when argument
        validation fails or one of the expected operational exceptions is raised
        (the message is printed to stderr).
    """
    args = parse_args(argv if argv is not None else sys.argv[1:])

    # Both counters exist only on the `run` subcommand and must be positive:
    # zero or negative runs / concurrency cannot produce a meaningful evaluation.
    for flag in ("concurrency", "runs"):
        value = getattr(args, flag, None)
        if value is not None and value < 1:
            print(f"error: --{flag} must be >= 1.", file=sys.stderr)
            return _EXIT_OPERATIONAL_ERROR

    try:
        host, token = resolve_connection(host=args.host, token=args.token, profile=args.profile)
        if args.command == "models":
            return _list_models(host, token, getattr(args, "workspace", None))
        config = RunConfig(
            host=host,
            token=token,
            workspace_id=args.workspace,
            dataset_folder=Path(args.dataset) if args.dataset else None,
            langfuse_dataset=args.langfuse_dataset,
            models=args.models or [],
            runs=args.runs,
            concurrency=args.concurrency,
            json_path=Path(args.json_path) if args.json_path else None,
            log_to_langfuse=args.langfuse,
            quiet=args.quiet,
        )
        return _run(config)
    except (
        ConnectionError_,
        ModelResolutionError,
        FileNotFoundError,
        ValueError,
        httpx.HTTPError,
        ApiException,
        RuntimeError,
    ) as e:
        print(f"error: {e}", file=sys.stderr)
        return _EXIT_OPERATIONAL_ERROR
379
+
380
+
381
+ if __name__ == "__main__":
382
+ raise SystemExit(main())
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation
@@ -0,0 +1,181 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """SSE chat client for the agentic AI conversations API.
3
+
4
+ Ported from gdc-nas tavern-e2e app/sse_client.py (httpx instead of requests).
5
+
6
+ Why not gooddata_sdk.compute.ai_chat / ai_chat_stream? Those target the legacy
7
+ ``/api/v1/actions/workspaces/{ws}/ai/chat[Stream]`` endpoint and expose a different
8
+ visualization shape (``metrics``/``dimensionality``). This evaluator scores the
9
+ *agentic* visualization (AAC ``query.fields`` shape) returned by the newer
10
+ ``/api/v1/ai/workspaces/{ws}/chat/conversations`` endpoint, which is not yet
11
+ present in the generated api-client. When that endpoint lands in the SDK, this
12
+ module is the single place to swap — the runner only depends on the ChatBackend
13
+ protocol, not on this class.
14
+ """
15
+
16
+ import json
17
+ from dataclasses import dataclass, field
18
+ from typing import Any, Iterable
19
+
20
+ import httpx
21
+
22
+ from gooddata_eval.core.models import ChatResult, DatasetItem
23
+
24
+ SSE_DATA_PREFIX = "data: "
25
+
26
+
27
+ @dataclass
28
+ class _SseAccumulator:
29
+ text_parts: list[str] = field(default_factory=list)
30
+ viz_reasoning_parts: list[str] = field(default_factory=list)
31
+ visualizations: list[dict[str, Any]] = field(default_factory=list)
32
+ tool_call_events: list[dict[str, Any]] = field(default_factory=list)
33
+ call_id_to_event_index: dict[str, int] = field(default_factory=dict)
34
+ reasoning_steps: list[dict[str, Any]] = field(default_factory=list)
35
+ adhoc_viz_args: list[dict[str, Any]] = field(default_factory=list)
36
+
37
+
38
def _handle_text(content: dict[str, Any], acc: _SseAccumulator) -> None:
    """Record the ``text`` of an assistant text item, ignoring empty or missing text."""
    if fragment := content.get("text", ""):
        acc.text_parts.append(fragment)
42
+
43
+
44
def _handle_multipart(content: dict[str, Any], acc: _SseAccumulator) -> None:
    """Collect the text and visualization parts of an assistant multipart item.

    Non-empty text parts are recorded both as response text and as
    visualization reasoning; visualization parts with a truthy payload are
    appended to ``acc.visualizations``. A missing or ``null`` ``parts`` value
    and parts that are not JSON objects are ignored instead of raising.
    """
    # `or []` also covers an explicit `"parts": null`, which `.get("parts", [])` would not.
    for part in content.get("parts") or []:
        if not isinstance(part, dict):
            continue
        kind = part.get("type")
        if kind == "text":
            text = part.get("text", "")
            if text:
                acc.text_parts.append(text)
                acc.viz_reasoning_parts.append(text)
        elif kind == "visualization" and part.get("visualization"):
            acc.visualizations.append(part["visualization"])
54
+
55
+
56
def _handle_reasoning(content: dict[str, Any], acc: _SseAccumulator) -> None:
    """Store a non-empty reasoning ``summary`` as a ``{"summary": ...}`` step."""
    if step_summary := content.get("summary", ""):
        acc.reasoning_steps.append({"summary": step_summary})
60
+
61
+
62
def _handle_tool_call(content: dict[str, Any], acc: _SseAccumulator) -> None:
    """Register a tool call event and remember its position by ``callId``.

    The event's ``result`` starts as ``None`` and is filled in when the matching
    tool result arrives.
    """
    function_name = content.get("name", "")
    event = {
        "functionName": function_name,
        "functionArguments": json.dumps(content.get("arguments", {})),
        "result": None,
    }
    # Index first: the new event lands at the current end of the list.
    acc.call_id_to_event_index[content.get("callId", "")] = len(acc.tool_call_events)
    acc.tool_call_events.append(event)

    # Stash visualization definition from create_adhoc_visualization so we can
    # evaluate the agent's intended answer even when the data source call fails.
    if function_name != "create_adhoc_visualization":
        return
    definition = (content.get("arguments") or {}).get("visualization")
    if isinstance(definition, dict) and definition:
        acc.adhoc_viz_args.append(definition)
78
+
79
+
80
def _handle_tool_result(content: dict[str, Any], acc: _SseAccumulator) -> None:
    """Attach a tool result to the earlier tool call with the same ``callId``, if any."""
    position = acc.call_id_to_event_index.get(content.get("callId", ""))
    if position is None:
        # No tool call was registered under this callId; nothing to attach to.
        return
    acc.tool_call_events[position]["result"] = content.get("result", "")
85
+
86
+
87
def _build_chat_result(acc: _SseAccumulator) -> ChatResult:
    """Assemble the accumulated stream state into a validated ``ChatResult``."""
    if acc.visualizations:
        viz_objects = acc.visualizations
    elif acc.adhoc_viz_args:
        # Fallback: the agent produced a correct visualization definition via
        # create_adhoc_visualization but the call failed (e.g. data source not
        # accessible). The last attempt is the agent's best answer.
        viz_objects = [acc.adhoc_viz_args[-1]]
    else:
        viz_objects = None

    payload: dict[str, Any] = {
        # An empty transcript is reported as None rather than "".
        "textResponse": "\n".join(acc.text_parts) or None,
        "toolCallEvents": acc.tool_call_events,
    }
    if viz_objects is not None:
        payload["createdVisualizations"] = {
            "objects": viz_objects,
            "reasoning": "\n".join(acc.viz_reasoning_parts),
        }
    return ChatResult.model_validate(payload)
106
+
107
+
108
def parse_sse_lines(lines: Iterable[str | bytes]) -> ChatResult:
    """Parse an SSE stream (iterable of lines, decoded or UTF-8 bytes) into a ChatResult.

    Only ``data`` lines are interpreted; every other line (blank lines,
    ``event:`` lines, comments) is skipped. A ``data`` payload is ignored when
    it is not valid JSON, is not a JSON object, or carries no usable ``item``,
    so malformed or keep-alive events cannot crash the parser.

    Raises:
        RuntimeError: the stream delivered an error object containing ``statusCode``.
    """
    # Handlers for assistant items, keyed by content type.
    assistant_handlers = {
        "text": _handle_text,
        "multipart": _handle_multipart,
        "reasoning": _handle_reasoning,
        "toolCall": _handle_tool_call,
    }
    # The space after the colon is optional in SSE, so match on the bare "data:"
    # field name; json.loads tolerates the leading space when it is present.
    data_field = SSE_DATA_PREFIX.rstrip(" ")

    acc = _SseAccumulator()
    for raw_line in lines:
        line = raw_line.decode("utf-8") if isinstance(raw_line, bytes) else raw_line
        if not line.startswith(data_field):
            continue
        try:
            event_data = json.loads(line[len(data_field) :])
        except json.JSONDecodeError:
            continue
        if not isinstance(event_data, dict):
            # Scalars and arrays are not chat events; `in` / `.get` below need a dict.
            continue
        if "statusCode" in event_data:
            raise RuntimeError(f"SSE error {event_data.get('statusCode')}: {event_data.get('detail')}")
        item = event_data.get("item")
        if not item or not isinstance(item, dict):
            continue
        content = item.get("content") or {}
        if not isinstance(content, dict):
            continue
        role = item.get("role")
        ctype = content.get("type")
        if role == "assistant":
            handler = assistant_handlers.get(ctype)
            if handler is not None:
                handler(content, acc)
        elif role == "tool" and ctype == "toolResult":
            _handle_tool_result(content, acc)
    return _build_chat_result(acc)
140
+
141
+
142
class ChatClient:
    """Single-turn AI chat client over the GoodData AI conversation endpoints."""

    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
        root = host.rstrip("/")
        self._base = f"{root}/api/v1/ai/workspaces/{workspace_id}/chat/conversations"
        self._auth = {"Authorization": f"Bearer {token}"}
        self._client = httpx.Client(timeout=timeout)

    def _create_conversation(self) -> str:
        """Create a conversation and return its id.

        Raises:
            ValueError: the response body has no ``conversationId``.
        """
        request_headers = {**self._auth, "Content-Type": "application/json"}
        response = self._client.post(self._base, headers=request_headers)
        response.raise_for_status()
        payload = response.json()
        if "conversationId" not in payload:
            raise ValueError(f"GoodData /chat/conversations response missing 'conversationId': {payload}")
        return payload["conversationId"]

    def _delete_conversation(self, conversation_id: str) -> None:
        """Delete the conversation; transport and HTTP errors are ignored on purpose."""
        target = f"{self._base}/{conversation_id}"
        try:
            self._client.delete(target, headers=self._auth)
        except httpx.HTTPError:
            pass  # best-effort cleanup

    def _send_message(self, conversation_id: str, question: str) -> ChatResult:
        """Post ``question`` as a user message and parse the streamed SSE reply."""
        request_headers = {**self._auth, "Accept": "text/event-stream", "Content-Type": "application/json"}
        message = {"item": {"role": "user", "content": {"type": "text", "text": question}}}
        endpoint = f"{self._base}/{conversation_id}/messages"
        with self._client.stream("POST", endpoint, json=message, headers=request_headers) as response:
            response.raise_for_status()
            return parse_sse_lines(response.iter_lines())

    def ask(self, item: DatasetItem) -> ChatResult:
        """Run one single-turn conversation: create, send, parse, clean up."""
        conversation_id = self._create_conversation()
        try:
            answer = self._send_message(conversation_id, item.question)
        finally:
            # Runs on failure as well, so conversations do not pile up on the server.
            self._delete_conversation(conversation_id)
        return answer

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()
@@ -0,0 +1,20 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Validated run configuration produced by the CLI and consumed by the runner."""
3
+
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+
7
+
8
+ @dataclass
9
+ class RunConfig:
10
+ host: str
11
+ token: str
12
+ workspace_id: str
13
+ dataset_folder: Path | None = None
14
+ langfuse_dataset: str | None = None
15
+ models: list[str] = field(default_factory=list)
16
+ runs: int = 2
17
+ concurrency: int = 1
18
+ json_path: Path | None = None
19
+ log_to_langfuse: bool = False
20
+ quiet: bool = False
@@ -0,0 +1,33 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Resolve (host, token) from explicit flags, environment, or a gooddata.yaml profile."""
3
+
4
+ import os
5
+
6
+ from gooddata_sdk.utils import profile_content
7
+
8
+
9
class ConnectionError_(Exception):
    """Signals that no usable host or token could be determined."""
11
+
12
+
13
+ def resolve_connection(host: str | None, token: str | None, profile: str | None) -> tuple[str, str]:
14
+ """Resolve connection parameters.
15
+
16
+ Precedence: explicit flags > GOODDATA_TOKEN env (token only) > profile file.
17
+
18
+ Raises:
19
+ ConnectionError_: host or token could not be determined.
20
+ """
21
+ resolved_host = host
22
+ resolved_token = token or os.environ.get("GOODDATA_TOKEN")
23
+
24
+ if profile is not None and (resolved_host is None or resolved_token is None):
25
+ content = profile_content(profile)
26
+ resolved_host = resolved_host or content.get("host")
27
+ resolved_token = resolved_token or content.get("token")
28
+
29
+ if not resolved_host:
30
+ raise ConnectionError_("Missing host. Pass --host or use a --profile that defines it.")
31
+ if not resolved_token:
32
+ raise ConnectionError_("Missing token. Pass --token, set GOODDATA_TOKEN, or use a --profile that defines it.")
33
+ return resolved_host, resolved_token
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation