gooddata-eval 1.68.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gooddata_eval/__init__.py +6 -0
  2. gooddata_eval/_version.py +7 -0
  3. gooddata_eval/cli/__init__.py +1 -0
  4. gooddata_eval/cli/main.py +382 -0
  5. gooddata_eval/core/__init__.py +1 -0
  6. gooddata_eval/core/chat/__init__.py +1 -0
  7. gooddata_eval/core/chat/sse_client.py +181 -0
  8. gooddata_eval/core/config.py +20 -0
  9. gooddata_eval/core/connection.py +33 -0
  10. gooddata_eval/core/dataset/__init__.py +1 -0
  11. gooddata_eval/core/dataset/langfuse_source.py +123 -0
  12. gooddata_eval/core/dataset/local.py +39 -0
  13. gooddata_eval/core/evaluators/__init__.py +67 -0
  14. gooddata_eval/core/evaluators/_deep_subset.py +35 -0
  15. gooddata_eval/core/evaluators/_llm_judge.py +66 -0
  16. gooddata_eval/core/evaluators/_text_utils.py +11 -0
  17. gooddata_eval/core/evaluators/alert_skill.py +128 -0
  18. gooddata_eval/core/evaluators/base.py +24 -0
  19. gooddata_eval/core/evaluators/general_question.py +34 -0
  20. gooddata_eval/core/evaluators/guardrail.py +52 -0
  21. gooddata_eval/core/evaluators/metric_skill.py +58 -0
  22. gooddata_eval/core/evaluators/search_tool.py +40 -0
  23. gooddata_eval/core/evaluators/summary.py +96 -0
  24. gooddata_eval/core/evaluators/visualization.py +156 -0
  25. gooddata_eval/core/langfuse/__init__.py +1 -0
  26. gooddata_eval/core/langfuse/sink.py +178 -0
  27. gooddata_eval/core/models.py +116 -0
  28. gooddata_eval/core/reporting/__init__.py +1 -0
  29. gooddata_eval/core/reporting/console.py +117 -0
  30. gooddata_eval/core/reporting/json_report.py +81 -0
  31. gooddata_eval/core/runner.py +214 -0
  32. gooddata_eval/core/scoring.py +155 -0
  33. gooddata_eval/core/summary/__init__.py +1 -0
  34. gooddata_eval/core/summary/http_client.py +54 -0
  35. gooddata_eval/core/workspace.py +262 -0
  36. gooddata_eval-1.68.0.dist-info/METADATA +275 -0
  37. gooddata_eval-1.68.0.dist-info/RECORD +40 -0
  38. gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
  39. gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
  40. gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
@@ -0,0 +1,117 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Render a human-readable console report using rich."""
3
+
4
+ from rich.console import Console
5
+ from rich.table import Table
6
+
7
+ from gooddata_eval.core.runner import EvalReport
8
+
9
+
10
def render_console(report: EvalReport, *, console: Console | None = None) -> str:
    """Render the per-item results table plus a one-line summary.

    Args:
        report: The evaluation report to render.
        console: Optional Console to print to. When omitted, a recording
            console (width 120) is created so the output can be returned.

    Returns:
        The rendered text when the console records its output, otherwise "".
    """
    # Function-scope import keeps the fix self-contained; rich is already a
    # dependency of this module.
    from rich.markup import escape

    out = console or Console(record=True, width=120)

    table = Table(title=f"Evaluation — model={report.model} workspace={report.workspace_id}")
    for heading in ("Item", "Kind", "Result", "Runs", "Latency", "Avg/run", "Quality", "Notes"):
        table.add_column(heading)

    for item in report.items:
        if item.skipped:
            result, notes = "SKIPPED", f"test_kind '{item.test_kind}' not supported in this phase"
        elif item.error:
            result, notes = "ERROR", item.error
        elif item.pass_at_k:
            result, notes = "PASS", ""
        else:
            # Evaluator-agnostic: report whichever boolean checks came back False
            # (visualization uses metrics_correct/…; dashboard_summary uses
            # include_*/exclude_*/rubric_*). Falls back to a generic message.
            failing = [k for k, v in item.best_detail.items() if v is False]
            notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
            result = "FAIL"
        latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
        avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"
        quality = "-" if item.skipped else f"{item.quality_score:.0%}"
        # Table cells are parsed as Rich markup. Free-form text such as an
        # exception message may contain "[...]" sequences, which would be
        # swallowed as style tags or raise MarkupError, so escape it.
        table.add_row(
            escape(item.id),
            escape(item.test_kind),
            result,
            str(item.runs),
            latency,
            avg,
            quality,
            escape(notes),
        )

    out.print(table)
    _wall = report.wall_clock_s
    _agent = report.latency_s
    if _wall > 0 and abs(_wall - _agent) > 1:  # concurrency > 1: show both
        timing = f"{_wall:.2f}s wall-clock, {_agent:.2f}s agent time (avg {report.avg_latency_s:.2f}s/run)"
    else:
        timing = f"{_agent:.2f}s (avg {report.avg_latency_s:.2f}s/run)"
    out.print(
        f"\nSummary: {report.passed}/{report.total} passed "
        f"({report.skipped} skipped, {report.errored} errored) "
        f"avg quality {report.avg_quality_score:.0%} in {timing}"
    )
    return out.export_text() if out.record else ""
59
+
60
+
61
def render_comparison(reports: list[EvalReport], *, console: Console | None = None) -> str:
    """Render a side-by-side comparison for multiple model runs.

    Winner is selected by (pass_rate, avg_quality_score, -avg_latency_s) —
    higher pass rate first, then quality, then lower latency as final tiebreaker.
    The runner-up is the next report under the same ordering, and the deltas
    against it are printed with an explicit sign.

    Args:
        reports: One report per model run.
        console: Optional Console to print to; a recording console is created
            when omitted.

    Returns:
        The rendered text when the console records its output, otherwise "".
        Always "" when fewer than two reports are provided.
    """
    if len(reports) < 2:
        return ""

    # Function-scope import; rich is already a dependency of this module.
    from rich.markup import escape

    def _label(r: EvalReport) -> str:
        # Escaped because the label is embedded in Rich markup below.
        return escape(f"{r.provider_name}/{r.model}" if r.provider_name else r.model or "?")

    def _rank(r: EvalReport) -> tuple:
        # Lower latency wins when pass rate and quality tie, hence the negation.
        return (r.passed / r.total if r.total else 0, r.avg_quality_score, -r.avg_latency_s)

    out = console or Console(record=True, width=120)

    table = Table(title="Model Comparison")
    for heading in ("Model", "Passed", "Quality", "Avg/run", "Total time"):
        table.add_column(heading)

    for r in reports:
        evaluated = r.total - r.skipped
        pass_pct = f"{r.passed / evaluated:.0%}" if evaluated else "—"
        table.add_row(
            _label(r),
            f"{r.passed}/{r.total} ({pass_pct})",
            f"{r.avg_quality_score:.0%}",
            f"{r.avg_latency_s:.2f}s",
            f"{r.latency_s:.0f}s",
        )

    out.print(table)

    evaluated_reports = [r for r in reports if r.total > 0]
    if evaluated_reports:
        # Stable descending sort: among ties the earliest report stays first,
        # matching what max() would have picked.
        ranked = sorted(evaluated_reports, key=_rank, reverse=True)
        winner = ranked[0]
        delta = ""
        if len(ranked) > 1:
            runner_up = ranked[1]
            delta_items = winner.passed - runner_up.passed
            delta_quality = winner.avg_quality_score - runner_up.avg_quality_score
            # Explicit sign: the winner can trail on one axis (e.g. better pass
            # rate but lower quality), which a hard-coded "+" rendered as "+-5%".
            delta = f" ({delta_items:+d} item(s) passed, {delta_quality:+.0%} quality)"
        out.print(f"\n[bold]Winner: {_label(winner)}[/bold]{delta}")

    return out.export_text() if out.record else ""
@@ -0,0 +1,81 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Build and write machine-readable reports (single-model or multi-model)."""
3
+
4
+ from pathlib import Path
5
+
6
+ import orjson
7
+
8
+ from gooddata_eval.core.runner import EvalReport
9
+
10
+
11
def _build_run_dict(report: EvalReport) -> dict:
    """Convert one EvalReport into its JSON-serializable dictionary form.

    The result has the run's model and workspace id, an aggregate "summary"
    section, and an "items" mapping keyed by item id.
    """
    summary = {
        "total": report.total,
        "passed": report.passed,
        # Everything that neither passed nor was skipped counts as failed.
        "failed": report.total - report.passed - report.skipped,
        "skipped": report.skipped,
        "errored": report.errored,
        "latency_s": round(report.latency_s, 3),
        "avg_latency_s": round(report.avg_latency_s, 3),
        "wall_clock_s": round(report.wall_clock_s, 3),
    }

    per_item: dict = {}
    for entry in report.items:
        per_item[entry.id] = {
            "dataset_name": entry.dataset_name,
            "test_kind": entry.test_kind,
            "question": entry.question,
            "pass_at_k": entry.pass_at_k,
            "skipped": entry.skipped,
            "error": entry.error,
            "runs": entry.runs,
            "latency_s": round(entry.latency_s, 3),
            "avg_latency_s": round(entry.avg_latency_s, 3),
            "detail": entry.best_detail,
        }

    return {
        "model": report.model,
        "workspace_id": report.workspace_id,
        "summary": summary,
        "items": per_item,
    }
41
+
42
+
43
def _build_comparison_entry(report: EvalReport) -> dict:
    """Build the per-model row of the "comparison" section.

    pass_rate is passed / total (skipped items included in the denominator)
    and is 0.0 for an empty report.
    """
    passed, total = report.passed, report.total
    if total:
        pass_rate = round(passed / total, 4)
    else:
        pass_rate = 0.0
    return {
        "provider_name": report.provider_name,
        "passed": passed,
        "total": total,
        "pass_rate": pass_rate,
        "avg_quality_score": round(report.avg_quality_score, 4),
        "avg_latency_s": round(report.avg_latency_s, 3),
        "total_latency_s": round(report.latency_s, 3),
    }
55
+
56
+
57
def _run_key(report: EvalReport) -> str:
    """Return the key identifying a run, matching the console comparison label.

    "<provider>/<model>" when a provider name is set; otherwise the model
    name, or "?" when the model is unset.
    """
    if report.provider_name:
        return f"{report.provider_name}/{report.model}"
    return report.model or "?"
60
+
61
+
62
def build_multi_model_report(reports: list[EvalReport]) -> dict:
    """Build the nested multi-model JSON report (used for single-model runs too).

    The result lists the run keys under "models" and maps each key to its full
    run dictionary ("runs") and to its comparison row ("comparison").
    """
    keyed = [(_run_key(rep), rep) for rep in reports]
    return {
        "models": [key for key, _ in keyed],
        "runs": {key: _build_run_dict(rep) for key, rep in keyed},
        "comparison": {key: _build_comparison_entry(rep) for key, rep in keyed},
    }
69
+
70
+
71
def write_multi_model_report(reports: list[EvalReport], path: Path) -> None:
    """Serialize the multi-model report as 2-space-indented JSON and write it to *path*."""
    payload = build_multi_model_report(reports)
    encoded = orjson.dumps(payload, option=orjson.OPT_INDENT_2)
    Path(path).write_bytes(encoded)
73
+
74
+
75
+ # Backward-compatible aliases so existing callers keep working.
76
def build_json_report(report: EvalReport) -> dict:
    """Return the single-run dictionary for *report* (alias kept for existing callers)."""
    run_dict = _build_run_dict(report)
    return run_dict
78
+
79
+
80
def write_json_report(report: EvalReport, path: Path) -> None:
    """Write *report* to *path* by delegating to the multi-model writer with a one-element list."""
    single_run = [report]
    write_multi_model_report(single_run, path)
@@ -0,0 +1,214 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Dataset run orchestration: per item, K single-turn runs, route by test_kind, aggregate pass@K."""
3
+
4
+ import time
5
+ import traceback
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from dataclasses import dataclass, field
8
+ from functools import partial
9
+ from typing import Callable, Protocol
10
+
11
+ from gooddata_eval.core.evaluators import get_evaluator, supported_test_kinds
12
+ from gooddata_eval.core.evaluators.base import ItemEvaluation
13
+ from gooddata_eval.core.models import ChatResult, DatasetItem
14
+
15
+
16
class ChatBackend(Protocol):
    """Structural interface for anything that can answer a dataset item."""

    def ask(self, item: DatasetItem) -> ChatResult:
        """Answer one dataset item.

        The whole item is passed (not just the question text) so backends can
        use per-item context, e.g. dashboard_summary needs item.summary_input.
        """
        ...
20
+
21
+
22
+ @dataclass
23
+ class ItemReport:
24
+ id: str
25
+ dataset_name: str
26
+ test_kind: str
27
+ question: str
28
+ pass_at_k: bool = False
29
+ skipped: bool = False
30
+ error: str | None = None
31
+ runs: int = 0
32
+ latency_s: float = 0.0 # total wall-clock across this item's runs
33
+ best_detail: dict = field(default_factory=dict)
34
+
35
+ @property
36
+ def avg_latency_s(self) -> float:
37
+ return self.latency_s / self.runs if self.runs else 0.0
38
+
39
+ @property
40
+ def quality_score(self) -> float:
41
+ """Fraction of bool-valued strict checks in best_detail that are True.
42
+
43
+ Falls back to 1.0 if pass_at_k else 0.0 when no bool checks exist
44
+ (e.g. text evaluators where best_detail has no bool flags).
45
+ """
46
+ checks = {k: v for k, v in self.best_detail.items() if isinstance(v, bool)}
47
+ if checks:
48
+ return sum(1 for v in checks.values() if v) / len(checks)
49
+ return 1.0 if self.pass_at_k else 0.0
50
+
51
+
52
@dataclass
class EvalReport:
    """Results of one evaluation run, with aggregates derived from its items."""

    model: str | None
    provider_name: str = ""
    provider_type: str = ""
    workspace_id: str = ""
    items: list[ItemReport] = field(default_factory=list)
    wall_clock_s: float = 0.0  # actual elapsed time; differs from latency_s under concurrency

    @property
    def total(self) -> int:
        """Number of items in the report."""
        return len(self.items)

    @property
    def passed(self) -> int:
        """Number of items with pass_at_k set."""
        return len([entry for entry in self.items if entry.pass_at_k])

    @property
    def skipped(self) -> int:
        """Number of items marked skipped."""
        return len([entry for entry in self.items if entry.skipped])

    @property
    def errored(self) -> int:
        """Number of items that recorded an error."""
        return len([entry for entry in self.items if entry.error is not None])

    @property
    def latency_s(self) -> float:
        """Sum of the per-item latencies."""
        accumulated = 0
        for entry in self.items:
            accumulated = accumulated + entry.latency_s
        return accumulated

    @property
    def total_runs(self) -> int:
        """Sum of the per-item run counts."""
        accumulated = 0
        for entry in self.items:
            accumulated = accumulated + entry.runs
        return accumulated

    @property
    def avg_latency_s(self) -> float:
        """Mean latency per run across all items; 0.0 when nothing ran."""
        run_count = self.total_runs
        if not run_count:
            return 0.0
        return self.latency_s / run_count

    @property
    def avg_quality_score(self) -> float:
        """Mean quality_score across evaluated (non-skipped, non-errored) items."""
        scores = [entry.quality_score for entry in self.items if not entry.skipped and entry.error is None]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)
95
+
96
+
97
# Signature of the per-run progress callback handed to _run_one_item:
#   (run_index, runs_total, passed, latency_s) -> None
RunCallback = Callable[[int, int, bool, float], None]
99
+
100
+
101
def _run_one_item(
    item: DatasetItem, backend: ChatBackend, runs: int, on_run_done: RunCallback | None = None
) -> ItemReport:
    """Run one dataset item up to *runs* times and aggregate the outcome.

    Args:
        item: The dataset item to evaluate.
        backend: Backend asked once per run.
        runs: Number of sequential runs (K in pass@K).
        on_run_done: Optional progress callback invoked after each completed
            run with (run_index, runs, passed, latency). A failing callback is
            logged and ignored; it never affects the item's result.

    Returns:
        The item's report. Items whose test_kind is unsupported are returned
        with ``skipped=True`` and no runs. If the backend or evaluator raises,
        the remaining runs are abandoned, ``error`` is set, and the results of
        the runs that did complete are kept.
    """
    report = ItemReport(id=item.id, dataset_name=item.dataset_name, test_kind=item.test_kind, question=item.question)
    if item.test_kind not in supported_test_kinds():
        report.skipped = True
        return report

    evaluator = get_evaluator(item.test_kind)
    best: ItemEvaluation | None = None
    try:
        for run_index in range(1, runs + 1):
            t0 = time.perf_counter()
            chat_result = backend.ask(item)
            evaluation = evaluator.evaluate(item, chat_result)
            latency = time.perf_counter() - t0
            report.runs += 1
            report.latency_s += latency
            if best is None or evaluation.rank_key > best.rank_key:
                best = evaluation
            if evaluation.passed:
                report.pass_at_k = True
            if on_run_done is not None:
                # A broken progress callback is not an agent failure: log it and
                # keep going, as run_items does for its other callbacks.
                try:
                    on_run_done(run_index, runs, evaluation.passed, latency)
                except Exception:
                    traceback.print_exc()
    except Exception as e:  # agent/network/parse failure for this item
        report.error = f"{type(e).__name__}: {e}"

    # Keep the best completed run's detail on both the success and error paths.
    if best is not None:
        report.best_detail = best.detail
    return report
134
+
135
+
136
+ def _forward_run_event(
137
+ user_cb: "Callable[[int, int, int, int, bool, float], None]",
138
+ item_index: int,
139
+ total: int,
140
+ run_index: int,
141
+ runs_total: int,
142
+ passed: bool,
143
+ latency: float,
144
+ ) -> None:
145
+ user_cb(item_index, total, run_index, runs_total, passed, latency)
146
+
147
+
148
def run_items(
    items: list[DatasetItem],
    backend: ChatBackend,
    *,
    runs: int = 2,
    model: str | None = None,
    provider_name: str = "",
    provider_type: str = "",
    workspace_id: str = "",
    on_item_start: Callable[[int, int, DatasetItem], None] | None = None,
    on_run_done: Callable[[int, int, int, int, bool, float], None] | None = None,
    on_item_done: Callable[[int, int, ItemReport], None] | None = None,
    on_langfuse_item_done: Callable[[int, int, ItemReport], None] | None = None,
    concurrency: int = 1,
) -> EvalReport:
    """Run every item K times, routing by test_kind, and aggregate pass@K.

    Optional callbacks stream progress without coupling core to any I/O library
    (index/run_index are 1-based):
      - on_item_start(index, total, item) before an item's runs begin
      - on_run_done(index, total, run_index, runs, passed, latency) after each individual run
      - on_item_done(index, total, report) after an item is fully evaluated
      - on_langfuse_item_done(index, total, report) after non-skipped, non-errored items only

    The item-level callbacks are guarded independently: an exception raised by
    one is printed and ignored, and does not prevent the others from running.

    concurrency > 1 dispatches items to a ThreadPoolExecutor so multiple
    questions are sent to the agent simultaneously. Each item still runs
    --runs times sequentially (pass@K). Results are collected in input order.

    Returns:
        An EvalReport holding one ItemReport per input item, in input order,
        with wall_clock_s set to the elapsed time of the whole run.
    """
    concurrency = max(1, concurrency)
    report = EvalReport(
        model=model, provider_name=provider_name, provider_type=provider_type, workspace_id=workspace_id
    )
    total = len(items)

    def _notify(callback: Callable[..., None] | None, *args: object) -> None:
        # Non-fatal: a failing callback is logged and must not abort the run
        # or suppress any other callback.
        if callback is None:
            return
        try:
            callback(*args)
        except Exception:
            traceback.print_exc()

    def _process_item(index: int, item: DatasetItem) -> ItemReport:
        _notify(on_item_start, index, total, item)
        run_cb = partial(_forward_run_event, on_run_done, index, total) if on_run_done is not None else None
        item_report = _run_one_item(item, backend, runs, on_run_done=run_cb)
        _notify(on_item_done, index, total, item_report)
        # Guarded separately so an on_item_done failure cannot skip this one.
        if not item_report.skipped and item_report.error is None:
            _notify(on_langfuse_item_done, index, total, item_report)
        return item_report

    _t0 = time.perf_counter()
    if concurrency <= 1:
        for index, item in enumerate(items, start=1):
            report.items.append(_process_item(index, item))
    else:
        # Dispatch concurrently; collect in original order.
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            futures = {pool.submit(_process_item, index, item): index for index, item in enumerate(items, start=1)}
            results: dict[int, ItemReport] = {}
            for future in as_completed(futures):
                idx = futures[future]
                results[idx] = future.result()
        for index in range(1, total + 1):
            report.items.append(results[index])
    report.wall_clock_s = time.perf_counter() - _t0
    return report
@@ -0,0 +1,155 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Visualization scoring — ported from gdc-nas tavern-e2e app/vis_assertions/metrics.py."""
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+
7
+ from gooddata_eval.core.models import AacBucketRef, AacQueryField, CreatedVisualization
8
+
9
+ # Maps dataset chart-type names (and agent enum values) to a canonical token.
10
+ _AAC_TYPE_MAP = {
11
+ "line_chart": "LINE",
12
+ "bar_chart": "BAR",
13
+ "column_chart": "COLUMN",
14
+ "pie_chart": "PIE",
15
+ "table": "TABLE",
16
+ "headline": "HEADLINE",
17
+ }
18
+
19
+
20
@dataclass
class FilterScores:
    """Per-category outcome of comparing expected and actual filters."""

    date_ok: bool
    ranking_ok: bool
    attribute_ok: bool

    @property
    def all_ok(self) -> bool:
        """True only when the date, ranking and attribute checks all passed."""
        return all((self.date_ok, self.ranking_ok, self.attribute_ok))
29
+
30
+
31
def _resolve_alias_to_uri(alias: str, fields: dict[str, AacQueryField | str]) -> str:
    """Look up *alias* in *fields* and return the URI it stands for.

    An AacQueryField entry yields its ``using`` value, a plain entry is
    returned as-is, and an alias with no entry is returned unchanged.
    """
    entry = fields.get(alias)
    if entry is None:
        return alias
    return entry.using if isinstance(entry, AacQueryField) else entry
39
+
40
+
41
def _resolve_bucket_to_uri_set(bucket: list[AacBucketRef | str], fields: dict[str, AacQueryField | str]) -> set[str]:
    """Resolve every reference in *bucket* to a URI and return the distinct URIs.

    An AacBucketRef contributes its ``field`` alias; any other entry is used
    as the alias directly.
    """
    return {
        _resolve_alias_to_uri(entry.field if isinstance(entry, AacBucketRef) else entry, fields)
        for entry in bucket
    }
47
+
48
+
49
def get_metric_uri_set(viz: CreatedVisualization) -> set[str]:
    """Return the distinct URIs referenced by the visualization's metrics bucket."""
    field_lookup = viz.query.fields
    return _resolve_bucket_to_uri_set(viz.metrics, field_lookup)
51
+
52
+
53
def get_dimension_uri_set(viz: CreatedVisualization) -> set[str]:
    """Return the distinct URIs used across view_by, segment_by, rows and columns."""
    combined = viz.view_by + viz.segment_by
    combined = combined + viz.rows
    combined = combined + viz.columns
    return _resolve_bucket_to_uri_set(combined, viz.query.fields)
56
+
57
+
58
def uri_to_display_name(uri: str) -> str:
    """Convert 'metric/net_sales' -> 'net sales', 'label/date.month' -> 'date - month'.

    Everything up to and including the first '/' is dropped; a URI without a
    '/' is used whole.
    """
    _prefix, slash, remainder = uri.partition("/")
    name = remainder if slash else uri
    return name.translate(str.maketrans({".": " - ", "_": " "}))
62
+
63
+
64
def validate_cross_references(viz: CreatedVisualization) -> tuple[bool, list[str]]:
    """Check that each ranking filter's references resolve to the right URI kind.

    For every filter of type "ranking_filter", ``using`` must resolve to a
    ``metric/`` or ``fact/`` URI and, when present, ``attribute`` must resolve
    to a ``label/`` or ``attribute/`` URI. Other filter types are ignored.

    Returns:
        (ok, errors): ok is True when no problem was found; errors holds one
        message per violation.
    """
    problems: list[str] = []
    lookup = viz.query.fields
    for name, spec in viz.query.filter_by.items():
        if spec.get("type") == "ranking_filter":
            raw_using = spec.get("using", "")
            resolved_using = _resolve_alias_to_uri(raw_using, lookup)
            is_measure = resolved_using.startswith("metric/") or resolved_using.startswith("fact/")
            if not is_measure:
                problems.append(
                    f"ranking filter '{name}': using='{raw_using}' "
                    f"resolves to '{resolved_using}' — expected a metric/ or fact/ URI"
                )
            if "attribute" not in spec:
                continue
            raw_attr = spec["attribute"]
            resolved_attr = _resolve_alias_to_uri(raw_attr, lookup)
            is_dimension = resolved_attr.startswith("label/") or resolved_attr.startswith("attribute/")
            if not is_dimension:
                problems.append(
                    f"ranking filter '{name}': attribute='{raw_attr}' "
                    f"resolves to '{resolved_attr}' — expected a label/ or attribute/ URI"
                )
    return not problems, problems
87
+
88
+
89
+ def _normalize_date_filter(filter_dict: dict, _fields: dict) -> dict:
90
+ return {
91
+ "type": "date_filter",
92
+ "dataset_uri": filter_dict.get("using", ""),
93
+ "from": filter_dict.get("from"),
94
+ "to": filter_dict.get("to"),
95
+ "granularity": filter_dict.get("granularity"),
96
+ }
97
+
98
+
99
def _normalize_ranking_filter(filter_dict: dict, fields: dict[str, AacQueryField | str]) -> dict:
    """Reduce a ranking filter to the keys that are compared.

    ``using`` and ``attribute`` (each defaulting to "") are resolved through
    *fields*; ``top`` and ``bottom`` are carried over only when present.
    """
    normalized: dict = {
        "type": "ranking_filter",
        "metric_uri": _resolve_alias_to_uri(filter_dict.get("using", ""), fields),
        "dim_uri": _resolve_alias_to_uri(filter_dict.get("attribute", ""), fields),
    }
    for limit_key in ("top", "bottom"):
        if limit_key in filter_dict:
            normalized[limit_key] = filter_dict[limit_key]
    return normalized
110
+
111
+
112
+ def _normalize_attribute_filter(filter_dict: dict, _fields: dict) -> dict:
113
+ raw_state = filter_dict.get("state") or {}
114
+ state = {k: v for k, v in raw_state.items() if v}
115
+ return {
116
+ "type": "attribute_filter",
117
+ "field_uri": filter_dict.get("using", ""),
118
+ "state": state,
119
+ }
120
+
121
+
122
def _split_and_normalize_filters(viz: CreatedVisualization) -> tuple[set[str], set[str], set[str]]:
    """Group the visualization's filters by type in normalized, comparable form.

    Each date, ranking and attribute filter is normalized and serialized to
    JSON with sorted keys. Filters of any other type are ignored.

    Returns:
        (date_set, ranking_set, attr_set) of the serialized filters.
    """
    date_set: set[str] = set()
    ranking_set: set[str] = set()
    attr_set: set[str] = set()
    handlers = (
        ("date_filter", _normalize_date_filter, date_set),
        ("ranking_filter", _normalize_ranking_filter, ranking_set),
        ("attribute_filter", _normalize_attribute_filter, attr_set),
    )
    fields = viz.query.fields
    for spec in viz.query.filter_by.values():
        filter_type = spec.get("type")
        for type_name, normalize, collected in handlers:
            if filter_type == type_name:
                collected.add(json.dumps(normalize(spec, fields), sort_keys=True))
                break
    return date_set, ranking_set, attr_set
136
+
137
+
138
def check_filters(expected: CreatedVisualization, actual: CreatedVisualization) -> FilterScores:
    """Compare the normalized filters of *actual* against *expected*, per category.

    A category passes only when both sides hold exactly the same set of
    normalized filters.
    """
    expected_sets = _split_and_normalize_filters(expected)
    actual_sets = _split_and_normalize_filters(actual)
    date_match, ranking_match, attribute_match = (
        got == wanted for got, wanted in zip(actual_sets, expected_sets)
    )
    return FilterScores(date_ok=date_match, ranking_ok=ranking_match, attribute_ok=attribute_match)
146
+
147
+
148
def _normalize_viz_type(raw_type: str) -> str:
    """Map a chart-type name to its canonical upper-case token.

    Matching is case-insensitive, so "line_chart", "LINE_CHART" and "LINE" all
    normalize to "LINE". Names missing from _AAC_TYPE_MAP fall back to the name
    with "_chart" removed, upper-cased. A missing (None or empty) type
    normalizes to "" instead of raising.
    """
    # Lower-case first: both the map keys and the "_chart" suffix are
    # lower-case, so an upper-case spelling would otherwise slip past both.
    lowered = (raw_type or "").lower()
    canonical = _AAC_TYPE_MAP.get(lowered)
    if canonical is not None:
        return canonical
    return lowered.replace("_chart", "").upper()
150
+
151
+
152
def check_viz_type(expected: CreatedVisualization, actual: CreatedVisualization) -> bool:
    """Return whether *actual* has the chart type that *expected* asks for.

    When *expected* specifies no type, any actual type is accepted.
    """
    if expected.type:
        wanted = _normalize_viz_type(expected.type)
        got = _normalize_viz_type(actual.type)
        return wanted == got
    return True
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation
@@ -0,0 +1,54 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """HTTP client for the dedicated dashboard-summary endpoint.
3
+
4
+ Unlike the conversational chat skill, this endpoint executes the AFM for each
5
+ visualization server-side and returns a plain synchronous JSON summary — no SSE
6
+ stream and no client-side ``result_id`` wrangling. The response is adapted into
7
+ a ``ChatResult`` (summary text -> ``text_response``) so the existing
8
+ LLM-as-judge evaluators can score it unchanged.
9
+
10
+ Endpoint (gen-ai service):
11
+ POST /api/v1/ai/workspaces/{workspace_id}/summary
12
+
13
+ If the route is ever renamed (e.g. to ``/summarize``), change ``_PATH`` only.
14
+ """
15
+
16
+ import httpx
17
+
18
+ from gooddata_eval.core.models import ChatResult, DatasetItem
19
+
20
+ _PATH = "/api/v1/ai/workspaces/{workspace_id}/summary"
21
+
22
+
23
class SummaryClient:
    """Single-shot client for the dashboard-summary endpoint.

    The client owns an ``httpx.Client``. Release it with ``close()`` or by
    using the instance as a context manager::

        with SummaryClient(host, token, workspace_id) as client:
            result = client.ask(item)
    """

    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
        """Prepare the endpoint URL, the bearer-token header and the HTTP client.

        Args:
            host: Base URL of the service; trailing slashes are ignored.
            token: API token sent as a Bearer credential.
            workspace_id: Workspace whose summary endpoint is called.
            timeout: Timeout passed to the underlying httpx client.
        """
        self._url = f"{host.rstrip('/')}{_PATH.format(workspace_id=workspace_id)}"
        self._auth = {"Authorization": f"Bearer {token}"}
        self._client = httpx.Client(timeout=timeout)

    def __enter__(self) -> "SummaryClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying HTTP client; exceptions are not suppressed."""
        self.close()

    def ask(self, item: DatasetItem) -> ChatResult:
        """Request a summary for one dataset item and adapt it to a ChatResult.

        Raises:
            ValueError: If the item has no ``summary_input``.
            httpx.HTTPStatusError: If the endpoint answers with an error status.
        """
        si = item.summary_input
        if si is None:
            raise ValueError(f"dashboard_summary item '{item.id}' is missing required 'summary_input'.")

        # Only dashboardId is always sent; optional inputs are included solely
        # when the dataset item provides them.
        body: dict = {"dashboardId": si.dashboard_id}
        optional_fields = (
            ("visualizations", si.visualizations),
            ("filterContext", si.filter_context),
            ("tabId", si.tab_id),
            ("formatHint", si.format_hint),
        )
        for key, value in optional_fields:
            if value is not None:
                body[key] = value

        resp = self._client.post(self._url, json=body, headers={**self._auth, "Content-Type": "application/json"})
        resp.raise_for_status()
        data = resp.json()
        # A missing or empty summary is passed on as an empty text response.
        summary = data.get("summary") or ""
        return ChatResult.model_validate({"textResponse": summary})

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()