gooddata-eval 1.68.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gooddata_eval/__init__.py +6 -0
  2. gooddata_eval/_version.py +7 -0
  3. gooddata_eval/cli/__init__.py +1 -0
  4. gooddata_eval/cli/main.py +382 -0
  5. gooddata_eval/core/__init__.py +1 -0
  6. gooddata_eval/core/chat/__init__.py +1 -0
  7. gooddata_eval/core/chat/sse_client.py +181 -0
  8. gooddata_eval/core/config.py +20 -0
  9. gooddata_eval/core/connection.py +33 -0
  10. gooddata_eval/core/dataset/__init__.py +1 -0
  11. gooddata_eval/core/dataset/langfuse_source.py +123 -0
  12. gooddata_eval/core/dataset/local.py +39 -0
  13. gooddata_eval/core/evaluators/__init__.py +67 -0
  14. gooddata_eval/core/evaluators/_deep_subset.py +35 -0
  15. gooddata_eval/core/evaluators/_llm_judge.py +66 -0
  16. gooddata_eval/core/evaluators/_text_utils.py +11 -0
  17. gooddata_eval/core/evaluators/alert_skill.py +128 -0
  18. gooddata_eval/core/evaluators/base.py +24 -0
  19. gooddata_eval/core/evaluators/general_question.py +34 -0
  20. gooddata_eval/core/evaluators/guardrail.py +52 -0
  21. gooddata_eval/core/evaluators/metric_skill.py +58 -0
  22. gooddata_eval/core/evaluators/search_tool.py +40 -0
  23. gooddata_eval/core/evaluators/summary.py +96 -0
  24. gooddata_eval/core/evaluators/visualization.py +156 -0
  25. gooddata_eval/core/langfuse/__init__.py +1 -0
  26. gooddata_eval/core/langfuse/sink.py +178 -0
  27. gooddata_eval/core/models.py +116 -0
  28. gooddata_eval/core/reporting/__init__.py +1 -0
  29. gooddata_eval/core/reporting/console.py +117 -0
  30. gooddata_eval/core/reporting/json_report.py +81 -0
  31. gooddata_eval/core/runner.py +214 -0
  32. gooddata_eval/core/scoring.py +155 -0
  33. gooddata_eval/core/summary/__init__.py +1 -0
  34. gooddata_eval/core/summary/http_client.py +54 -0
  35. gooddata_eval/core/workspace.py +262 -0
  36. gooddata_eval-1.68.0.dist-info/METADATA +275 -0
  37. gooddata_eval-1.68.0.dist-info/RECORD +40 -0
  38. gooddata_eval-1.68.0.dist-info/WHEEL +4 -0
  39. gooddata_eval-1.68.0.dist-info/entry_points.txt +2 -0
  40. gooddata_eval-1.68.0.dist-info/licenses/LICENSE.txt +3252 -0
@@ -0,0 +1,40 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Evaluator for search_tool: agent must call the catalog search with expected parameters."""
3
+
4
+ from gooddata_eval.core.evaluators.base import ItemEvaluation
5
+ from gooddata_eval.core.models import ChatResult, DatasetItem
6
+
7
+
8
+ def _args_match(actual_args: dict, expected_args: dict) -> bool:
9
+ if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
10
+ return False
11
+ if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
12
+ return False
13
+ if actual_args.get("limit") != expected_args.get("limit"):
14
+ return False
15
+ return actual_args.get("emit_widget") == expected_args.get("emit_widget")
16
+
17
+
18
class SearchToolEvaluator:
    """Checks that the agent called the expected search tool.

    The expected call is read from ``expected_output["tool_call"]``; the
    function name defaults to ``search_objects`` when it is not given.
    """

    test_kind = "search_tool"

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Score one item from the tool-call events recorded in the chat result.

        Calling the expected function at all decides pass/fail; whether any of
        those calls also carried the expected arguments is recorded and used
        only as the secondary ranking component.
        """
        spec = (item.expected_output or {}).get("tool_call", {})
        wanted_name = spec.get("function_name", "search_objects")
        wanted_args = spec.get("function_arguments", {})

        candidates = [call for call in chat_result.tool_call_events if call.function_name == wanted_name]
        tool_selection = bool(candidates)

        tool_correctness = False
        for call in candidates:
            if _args_match(call.parsed_arguments(), wanted_args):
                tool_correctness = True
                break

        # Selection is the hard gate; argument correctness only affects ranking.
        detail = {
            "tool_selection": tool_selection,
            "tool_correctness": tool_correctness,
            "expected_function": wanted_name,
            "calls_found": len(candidates),
        }
        return ItemEvaluation(
            passed=tool_selection,
            rank_key=(int(tool_selection), int(tool_correctness)),
            detail=detail,
        )
@@ -0,0 +1,96 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.
3
+
4
+ Summaries are free text, so we do not match strings. Instead, `expected_output`
5
+ is a rubric of checkable criteria:
6
+
7
+ {
8
+ "must_include": ["...facts a good summary must contain..."],
9
+ "must_not_include": ["...things a good summary must avoid (hallucinations)..."],
10
+ "rubric": ["...soft quality dimensions..."]
11
+ }
12
+
13
+ Each criterion is scored independently by the judge (True/False), so the
14
+ runner's `quality_score` becomes the fraction of satisfied criteria. The item
15
+ *passes* only when every `must_include` is satisfied and no `must_not_include`
16
+ is violated; `rubric` items contribute to quality but do not gate pass/fail.
17
+
18
+ As a fallback, a non-dict `expected_output` is treated as a single gating
19
+ `must_include` criterion (same pass/fail semantics as `general_question`).
20
+ """
21
+
22
+ from typing import Any
23
+
24
+ from gooddata_eval.core.evaluators._llm_judge import LLMJudge
25
+ from gooddata_eval.core.evaluators._text_utils import extract_text
26
+ from gooddata_eval.core.evaluators.base import ItemEvaluation
27
+ from gooddata_eval.core.models import ChatResult, DatasetItem
28
+
29
# Evaluation steps handed to the LLMJudge that scores criteria the summary is
# expected to satisfy (the `must_include` and `rubric` entries).
_POSITIVE_STEPS = [
    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
]
35
+
36
+ # For must_not_include we ask the judge a plain presence question and invert the
37
+ # result in code. Scoring "does the summary AVOID X?" via a field labelled
38
+ # EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
39
+ # desired and flips the verdict. Detecting presence (no negation, no
40
+ # contradictory label) is far more robust.
41
# Evaluation steps handed to the LLMJudge applied to `must_not_include`
# entries; a score of 1 means the forbidden characteristic IS present.
_VIOLATION_STEPS = [
    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly exhibits the described characteristic.",
    "Score 0 if it does not exhibit it.",
]
47
+
48
+
49
class DashboardSummaryEvaluator:
    """LLM-as-judge evaluator for the ``dashboard_summary`` test kind.

    Every criterion is judged on its own. ``must_include`` and
    ``must_not_include`` criteria gate pass/fail; ``rubric`` criteria are only
    recorded and counted towards the quality fraction.
    """

    test_kind = "dashboard_summary"

    def __init__(self):
        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)

    @staticmethod
    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
        """Split ``expected_output`` into (must_include, must_not_include, rubric).

        Within a dict, a section that is missing or ``None`` counts as empty and
        a section given as a bare string counts as one criterion (iterating it
        would otherwise yield one criterion per character). If the value is not
        a dict, or all three sections are empty, the whole value is stringified
        into a single gating ``must_include`` criterion.
        """

        def as_criteria(section: Any) -> list[str]:
            if section is None:
                return []
            if isinstance(section, str):
                return [section] if section else []
            return [str(entry) for entry in section]

        if isinstance(expected_output, dict):
            must_include = as_criteria(expected_output.get("must_include"))
            must_not_include = as_criteria(expected_output.get("must_not_include"))
            rubric = as_criteria(expected_output.get("rubric"))
            if must_include or must_not_include or rubric:
                return must_include, must_not_include, rubric
        # Fallback: treat the whole expected_output as a single gating criterion
        # (same pass/fail semantics as general_question).
        return [str(expected_output)], [], []

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Judge the generated summary against every criterion of the item.

        ``detail`` holds the summary text plus, per criterion, a verdict under
        ``include_<i>`` / ``exclude_<i>`` / ``rubric_<i>`` and the judge's
        explanation under the matching ``*_reason`` key. The rank key is
        ``(int(passed), quality)``, where quality is the fraction of bool-valued
        detail entries that are True (0.0 when there are none).
        """
        actual = extract_text(chat_result)
        must_include, must_not_include, rubric = self._criteria(item.expected_output)

        detail: dict[str, Any] = {"actual_output": actual}
        passed = True

        for i, criterion in enumerate(must_include):
            ok, reason = self._positive_judge.score(item.question, criterion, actual)
            detail[f"include_{i}"] = ok
            detail[f"include_{i}_reason"] = reason
            passed = passed and ok

        for i, criterion in enumerate(must_not_include):
            # The judge answers "is the characteristic present?"; invert it here.
            violated, reason = self._violation_judge.score(item.question, criterion, actual)
            ok = not violated  # True == characteristic absent == correctly avoided
            detail[f"exclude_{i}"] = ok
            detail[f"exclude_{i}_reason"] = reason
            passed = passed and ok

        for i, criterion in enumerate(rubric):
            # Rubric verdicts feed quality only; they never change `passed`.
            ok, reason = self._positive_judge.score(item.question, criterion, actual)
            detail[f"rubric_{i}"] = ok
            detail[f"rubric_{i}_reason"] = reason

        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0

        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
@@ -0,0 +1,156 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Agentic visualization evaluator — ported from gdc-nas tavern-e2e app/vis_agentic.py."""
3
+
4
+ from dataclasses import dataclass
5
+
6
+ from gooddata_eval.core.evaluators.base import ItemEvaluation
7
+ from gooddata_eval.core.models import ChatResult, CreatedVisualization, DatasetItem
8
+ from gooddata_eval.core.scoring import (
9
+ check_filters,
10
+ check_viz_type,
11
+ get_dimension_uri_set,
12
+ get_metric_uri_set,
13
+ validate_cross_references,
14
+ )
15
+
16
+
17
@dataclass
class EvaluationResult:
    """Outcome of comparing one actual visualization with one expected candidate."""

    # Hard checks (together they decide `strict_pass`).
    visualization_created: bool
    cross_ref_valid: bool
    metrics_correct: bool
    dimensions_correct: bool
    filters_correct: bool
    viz_type_hard: bool
    # Per-filter-kind results, reported for diagnostics.
    filter_date_score: bool
    filter_ranking_score: bool
    filter_attribute_score: bool
    # Supporting data for reports.
    cross_ref_errors: list[str]
    expected_metric_uris: set[str]
    actual_metric_uris: set[str]
    expected_dim_uris: set[str]
    actual_dim_uris: set[str]

    @property
    def strict_pass(self) -> bool:
        """True only when a visualization exists and every hard check holds."""
        gates = (
            self.visualization_created,
            self.cross_ref_valid,
            self.metrics_correct,
            self.dimensions_correct,
            self.filters_correct,
            self.viz_type_hard,
        )
        return all(gates)

    @property
    def strict_checks_passed_count(self) -> int:
        """Number of hard checks that hold, not counting `visualization_created`."""
        checks = (
            self.cross_ref_valid,
            self.metrics_correct,
            self.dimensions_correct,
            self.filters_correct,
            self.viz_type_hard,
        )
        return sum(1 for held in checks if held)
56
+
57
+
58
def _evaluate_visualization(expected: CreatedVisualization, actual: CreatedVisualization | None) -> EvaluationResult:
    """Compare `actual` with one `expected` candidate and collect every check.

    When no visualization was produced, every flag is False and the expected
    URI sets are still reported so the failure remains diagnosable.
    """
    wanted_metrics = get_metric_uri_set(expected)
    wanted_dims = get_dimension_uri_set(expected)

    if actual is None:
        all_failed = dict.fromkeys(
            (
                "visualization_created",
                "cross_ref_valid",
                "metrics_correct",
                "dimensions_correct",
                "filters_correct",
                "viz_type_hard",
                "filter_date_score",
                "filter_ranking_score",
                "filter_attribute_score",
            ),
            False,
        )
        return EvaluationResult(
            **all_failed,
            cross_ref_errors=["No visualization was created"],
            expected_metric_uris=wanted_metrics,
            actual_metric_uris=set(),
            expected_dim_uris=wanted_dims,
            actual_dim_uris=set(),
        )

    refs_ok, ref_errors = validate_cross_references(actual)
    got_metrics = get_metric_uri_set(actual)
    got_dims = get_dimension_uri_set(actual)
    filters = check_filters(expected, actual)
    type_ok = check_viz_type(expected, actual)

    return EvaluationResult(
        visualization_created=True,
        cross_ref_valid=refs_ok,
        # Metric and dimension sets must match exactly, not merely overlap.
        metrics_correct=got_metrics == wanted_metrics,
        dimensions_correct=got_dims == wanted_dims,
        filters_correct=filters.all_ok,
        viz_type_hard=type_ok,
        filter_date_score=filters.date_ok,
        filter_ranking_score=filters.ranking_ok,
        filter_attribute_score=filters.attribute_ok,
        cross_ref_errors=ref_errors,
        expected_metric_uris=wanted_metrics,
        actual_metric_uris=got_metrics,
        expected_dim_uris=wanted_dims,
        actual_dim_uris=got_dims,
    )
98
+
99
+
100
def _evaluate_against_candidates(
    expected_outputs: list[CreatedVisualization], actual: CreatedVisualization | None
) -> tuple[EvaluationResult, CreatedVisualization]:
    """Score `actual` against every candidate and return the best (result, candidate).

    Candidates are ranked by strict pass first, then by the number of hard
    checks passed; on a tie the earliest candidate wins.
    """

    def rank(scored):
        outcome, _candidate = scored
        return (outcome.strict_pass, outcome.strict_checks_passed_count)

    scored_candidates = []
    for candidate in expected_outputs:
        scored_candidates.append((_evaluate_visualization(candidate, actual), candidate))
    return max(scored_candidates, key=rank)
106
+
107
+
108
+ def _parse_expected(expected_output: dict) -> list[CreatedVisualization]:
109
+ if not isinstance(expected_output, dict):
110
+ raise ValueError("'expected_output' must be a JSON object")
111
+ raw_viz = expected_output.get("visualization")
112
+ if raw_viz is None:
113
+ raise ValueError("'expected_output.visualization' is required")
114
+ if isinstance(raw_viz, list):
115
+ if not raw_viz:
116
+ raise ValueError("'expected_output.visualization' array must not be empty")
117
+ return [CreatedVisualization.model_validate(v) for v in raw_viz]
118
+ if isinstance(raw_viz, dict):
119
+ return [CreatedVisualization.model_validate(raw_viz)]
120
+ raise ValueError("'expected_output.visualization' must be a JSON object or non-empty array")
121
+
122
+
123
+ def _extract_actual(chat_result: ChatResult) -> CreatedVisualization | None:
124
+ cv = chat_result.created_visualizations
125
+ if cv is None or not cv.objects:
126
+ return None
127
+ return cv.objects[0]
128
+
129
+
130
class VisualizationEvaluator:
    """Evaluator for the ``visualization`` test kind."""

    test_kind = "visualization"

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Score the first created visualization against the best-matching candidate.

        The item passes only on a strict pass; the rank key additionally
        carries the number of hard checks that held.
        """
        outcome, _matched = _evaluate_against_candidates(
            _parse_expected(item.expected_output),
            _extract_actual(chat_result),
        )

        copied_as_is = (
            "visualization_created",
            "cross_ref_valid",
            "cross_ref_errors",
            "metrics_correct",
            "dimensions_correct",
            "filters_correct",
            "filter_date_score",
            "filter_ranking_score",
            "filter_attribute_score",
            "viz_type_hard",
        )
        detail = {name: getattr(outcome, name) for name in copied_as_is}
        # URI sets are emitted as sorted lists.
        for name in ("expected_metric_uris", "actual_metric_uris", "expected_dim_uris", "actual_dim_uris"):
            detail[name] = sorted(getattr(outcome, name))

        return ItemEvaluation(
            passed=outcome.strict_pass,
            rank_key=(outcome.strict_pass, outcome.strict_checks_passed_count),
            detail=detail,
        )
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation
@@ -0,0 +1,178 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Langfuse scoring sink — posts evaluation results via the Langfuse REST API."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import base64
7
+ import os
8
+ import sys
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from typing import TYPE_CHECKING, Any
12
+
13
+ import httpx
14
+
15
# Average latency (seconds) at or beyond which the speed component of
# value_score is 0.
_MAX_LATENCY_S = 60.0
# Weights of the quality and speed components of value_score.
# NOTE(review): the two weights sum to 0.8, so value_score tops out at 0.8 for
# non-negative latencies — confirm this is intended.
_QUALITY_WEIGHT = 0.6
_SPEED_WEIGHT = 0.2
18
+
19
+ if TYPE_CHECKING:
20
+ from gooddata_eval.core.runner import ItemReport
21
+
22
+
23
def compute_scores(
    pass_at_k: bool,
    avg_latency_s: float,
    best_detail: dict[str, Any],
) -> dict[str, float]:
    """Compute the Langfuse score values for one evaluated item.

    Returns a dict with keys: pass_at_k, quality_score, value_score, latency_s.
    quality_score is the fraction of bool-valued entries in best_detail that
    are True; when there are no bool entries it is 1.0 if pass_at_k else 0.0.
    value_score is the weighted sum of quality and a speed term that falls
    linearly with latency and is floored at 0.
    """
    flags = [entry for entry in best_detail.values() if isinstance(entry, bool)]
    if flags:
        quality = flags.count(True) / len(flags)
    elif pass_at_k:
        quality = 1.0
    else:
        quality = 0.0

    remaining_budget = 1.0 - avg_latency_s / _MAX_LATENCY_S
    speed = max(0.0, remaining_budget)
    value = _QUALITY_WEIGHT * quality + _SPEED_WEIGHT * speed

    scores = {"pass_at_k": 1 if pass_at_k else 0}
    scores["quality_score"] = round(quality, 4)
    scores["value_score"] = round(value, 4)
    scores["latency_s"] = round(avg_latency_s, 3)
    return scores
46
+
47
+
48
class LangfuseSink:
    """Posts evaluation results to Langfuse via the ingestion REST API.

    The server comes from ``LANGFUSE_HOST`` (default
    ``https://cloud.langfuse.com``) and the HTTP Basic credentials from
    ``LANGFUSE_PUBLIC_KEY`` / ``LANGFUSE_SECRET_KEY``.
    """

    def __init__(self, dataset_name: str, run_name: str, model_id: str = "", provider_type: str = ""):
        """Read the Langfuse settings from the environment.

        Raises:
            RuntimeError: if either Langfuse key is unset or empty.
        """
        self._dataset_name = dataset_name
        self._run_name = run_name
        self._model_id = model_id
        self._provider_type = provider_type
        # An empty or blank LANGFUSE_HOST is treated like an unset one, the same
        # way empty keys are treated below; otherwise the base URL would be "".
        host = os.environ.get("LANGFUSE_HOST", "").strip().rstrip("/") or "https://cloud.langfuse.com"
        pub = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
        sec = os.environ.get("LANGFUSE_SECRET_KEY", "")
        if not pub or not sec:
            raise RuntimeError(
                "Langfuse credentials not set. Export LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY to use --langfuse."
            )
        creds = base64.b64encode(f"{pub}:{sec}".encode()).decode()
        self._host = host
        self._auth_header = f"Basic {creds}"

    def log_item(self, report: ItemReport, *, dataset_item_id: str) -> None:
        """Send trace + dataset-run-item + scores for one evaluated item.

        Swallows all errors — Langfuse failures never abort the eval run; they
        are reported as warnings on stderr instead.
        """
        trace_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc).isoformat()
        scores = compute_scores(
            pass_at_k=report.pass_at_k,
            avg_latency_s=report.avg_latency_s,
            best_detail=report.best_detail,
        )

        # Each ingestion event needs a top-level id (dedup) and timestamp
        # in addition to the body-level id/timestamp for the trace/score itself.
        def _event(event_type: str, body: dict[str, Any]) -> dict[str, Any]:
            return {"id": str(uuid.uuid4()), "timestamp": now, "type": event_type, "body": body}

        batch: list[dict[str, Any]] = [
            _event(
                "trace-create",
                {
                    "id": trace_id,
                    "timestamp": now,
                    "name": f"gd-eval: {report.question[:80]}",
                    # Expose the model on a first-class trace field so Langfuse
                    # dashboards can filter / break down by it ("Version"); trace
                    # metadata is not available as a breakdown dimension.
                    "version": self._model_id or None,
                    "input": {"question": report.question},
                    "output": report.best_detail,
                    "metadata": {
                        "dataset_name": report.dataset_name,
                        "test_kind": report.test_kind,
                        "item_id": report.id,
                        "model": self._model_id,
                        "provider_type": self._provider_type,
                    },
                    "tags": [t for t in [report.test_kind, self._provider_type] if t],
                },
            ),
        ]

        score_defs = [
            ("pass_at_k", scores["pass_at_k"], "BOOLEAN"),
            ("quality_score", scores["quality_score"], "NUMERIC"),
            ("value_score", scores["value_score"], "NUMERIC"),
            ("latency_s", scores["latency_s"], "NUMERIC"),
        ]
        for name, value, data_type in score_defs:
            batch.append(
                _event(
                    "score-create",
                    {
                        "id": str(uuid.uuid4()),
                        "traceId": trace_id,
                        "name": name,
                        "value": value,
                        "dataType": data_type,
                    },
                )
            )

        try:
            with httpx.Client(
                base_url=self._host,
                headers={"Authorization": self._auth_header},
                timeout=10,
            ) as client:
                resp = client.post("/api/public/ingestion", json={"batch": batch})
                resp.raise_for_status()
                # The ingestion endpoint returns HTTP 200 even when individual events
                # fail — per-event errors are in the response body.
                body = resp.json()
                errors = body.get("errors") or []
                for err in errors:
                    print(
                        f"warning: Langfuse event failed for item '{report.id}': "
                        f"type={err.get('error')} status={err.get('status')} id={err.get('id')}",
                        file=sys.stderr,
                    )
        except Exception as exc:  # deliberate best-effort boundary
            print(f"warning: Langfuse ingestion failed for item '{report.id}': {exc}", file=sys.stderr)

        # Link trace to dataset run via the dedicated endpoint (simpler than ingestion —
        # does not require datasetId/runId; creates the run by name if absent).
        # This is attempted even when the ingestion call above failed.
        try:
            with httpx.Client(
                base_url=self._host,
                headers={"Authorization": self._auth_header},
                timeout=10,
            ) as client:
                r = client.post(
                    "/api/public/dataset-run-items",
                    json={
                        "runName": self._run_name,
                        "runDescription": (
                            f"{self._provider_type}/{self._model_id}"
                            if self._provider_type and self._model_id
                            else self._model_id or ""
                        ),
                        "metadata": {
                            "model": self._model_id,
                            "provider_type": self._provider_type,
                        },
                        "datasetItemId": dataset_item_id,
                        "traceId": trace_id,
                    },
                )
                r.raise_for_status()
        except Exception as exc:  # deliberate best-effort boundary
            print(f"warning: Langfuse dataset-run-item failed for item '{report.id}': {exc}", file=sys.stderr)
@@ -0,0 +1,116 @@
1
+ # (C) 2026 GoodData Corporation
2
+ """Pydantic models for the eval dataset envelope and the agent's AAC output.
3
+
4
+ Ported from gdc-nas tavern-e2e app/llm_as_judge/schemas/chat.py.
5
+ """
6
+
7
+ import json
8
+ from typing import Any
9
+
10
+ from pydantic import BaseModel, ConfigDict, Field
11
+
12
+
13
class AacQueryField(BaseModel):
    """Object form of one entry in an AAC query's `fields` mapping.

    Keys not declared here are kept on the model (`extra="allow"`).
    """

    model_config = ConfigDict(extra="allow")

    # Required. Presumably identifies the catalog object the field is built on — TODO confirm.
    using: str
    title: str | None = None
    aggregation: str | None = None
19
+
20
+
21
class AacBucketRef(BaseModel):
    """Object form of a bucket entry; bucket lists also accept a plain string.

    Keys not declared here are kept on the model (`extra="allow"`).
    """

    model_config = ConfigDict(extra="allow")

    # Required. Presumably the key of an entry in the query's `fields` — TODO confirm.
    field: str
25
+
26
+
27
class AacQuery(BaseModel):
    """Query part of a visualization: named fields plus optional filters."""

    # Required. Each value is either a structured field or a plain string.
    fields: dict[str, AacQueryField | str]
    # Filter definitions keyed by name; empty when omitted.
    filter_by: dict[str, dict] = Field(default_factory=dict)
30
+
31
+
32
class CreatedVisualization(BaseModel):
    """Visualization in the AAC format (agent output and dataset expected output)."""

    # Undeclared keys in the input are dropped rather than rejected.
    model_config = ConfigDict(extra="ignore")

    id: str
    title: str | None = None
    type: str
    query: AacQuery
    # Bucket lists: each entry is a structured reference or a plain string;
    # every bucket defaults to an empty list when omitted.
    metrics: list[AacBucketRef | str] = Field(default_factory=list)
    view_by: list[AacBucketRef | str] = Field(default_factory=list)
    segment_by: list[AacBucketRef | str] = Field(default_factory=list)
    rows: list[AacBucketRef | str] = Field(default_factory=list)
    columns: list[AacBucketRef | str] = Field(default_factory=list)
    config: dict | None = None
47
+
48
+
49
class CreatedVisualizations(BaseModel):
    """Container for the visualizations produced in one chat response."""

    # Undeclared keys in the input are dropped rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Empty list when the response carries no objects.
    objects: list[CreatedVisualization] = Field(default_factory=list)
    reasoning: str = ""
54
+
55
+
56
class ToolCallEvent(BaseModel):
    """One tool invocation reported in a chat response.

    Fields can be populated by their camelCase aliases or by their snake_case
    names (`populate_by_name=True`). Arguments and result are kept as the raw
    JSON strings; use the `parsed_*` helpers to decode them.
    """

    model_config = ConfigDict(populate_by_name=True)

    function_name: str = Field(alias="functionName")
    function_arguments: str = Field(alias="functionArguments")
    result: str | None = None

    def parsed_arguments(self) -> dict[str, Any]:
        """Decode `function_arguments` into a dict.

        Returns an empty dict when the string is empty, is not valid JSON, or
        decodes to something other than a JSON object, so callers can always
        use dict methods on the result.
        """
        if not self.function_arguments:
            return {}
        try:
            decoded = json.loads(self.function_arguments)
        except json.JSONDecodeError:
            return {}
        return decoded if isinstance(decoded, dict) else {}

    def parsed_result(self) -> dict[str, Any] | None:
        """Decode `result`, or return None when it is empty or not valid JSON.

        NOTE(review): valid JSON that is not an object (array, string, number)
        is returned as decoded despite the annotation — confirm whether callers
        rely on that before narrowing it.
        """
        if not self.result:
            return None
        try:
            return json.loads(self.result)
        except json.JSONDecodeError:
            return None
76
+
77
+
78
class ChatResult(BaseModel):
    """Subset of the agent chat response needed for Phase 1 evaluation."""

    # Undeclared keys are dropped; fields accept either the camelCase alias or
    # the snake_case name.
    model_config = ConfigDict(extra="ignore", populate_by_name=True)

    text_response: str | None = Field(default=None, alias="textResponse")
    created_visualizations: CreatedVisualizations | None = Field(default=None, alias="createdVisualizations")
    # Empty list when the response reports no tool calls.
    tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
86
+
87
+
88
class SummaryInput(BaseModel):
    """Structured input for the `dashboard_summary` test kind.

    Maps onto the dedicated summary endpoint's request body
    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
    dataset; the SummaryClient maps it to the endpoint's camelCase fields.
    """

    # Undeclared keys in the input are dropped rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # The only required field; all others default to None.
    dashboard_id: str
    visualizations: list[str] | None = None
    filter_context: list[dict] | None = None
    tab_id: str | None = None
    format_hint: str | None = None
103
+
104
+
105
class DatasetItem(BaseModel):
    """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""

    # Undeclared keys in the input are dropped rather than rejected.
    model_config = ConfigDict(extra="ignore")

    id: str
    dataset_name: str
    test_kind: str
    question: str
    # Deliberately untyped: its shape depends on `test_kind`.
    expected_output: Any
    # Only used by the `dashboard_summary` test kind; ignored by all others.
    summary_input: SummaryInput | None = None
@@ -0,0 +1 @@
1
+ # (C) 2026 GoodData Corporation