evalvault-1.62.1-py3-none-any.whl → evalvault-1.63.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +43 -2
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/llm/__init__.py +5 -43
  10. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  11. evalvault/adapters/outbound/llm/factory.py +103 -0
  12. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  13. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  14. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  15. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  16. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  17. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  18. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  19. evalvault/adapters/outbound/storage/base_sql.py +527 -21
  20. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  21. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  22. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  23. evalvault/debug_ragas.py +7 -1
  24. evalvault/debug_ragas_real.py +5 -1
  25. evalvault/domain/entities/__init__.py +10 -0
  26. evalvault/domain/entities/prompt_suggestion.py +50 -0
  27. evalvault/domain/services/__init__.py +6 -0
  28. evalvault/domain/services/evaluator.py +191 -103
  29. evalvault/domain/services/holdout_splitter.py +67 -0
  30. evalvault/domain/services/intent_classifier.py +73 -0
  31. evalvault/domain/services/pipeline_template_registry.py +3 -0
  32. evalvault/domain/services/prompt_candidate_service.py +117 -0
  33. evalvault/domain/services/prompt_registry.py +40 -2
  34. evalvault/domain/services/prompt_scoring_service.py +286 -0
  35. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  36. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  37. evalvault/ports/inbound/learning_hook_port.py +4 -1
  38. evalvault/ports/outbound/__init__.py +2 -0
  39. evalvault/ports/outbound/llm_factory_port.py +13 -0
  40. evalvault/ports/outbound/llm_port.py +34 -2
  41. evalvault/ports/outbound/storage_port.py +38 -0
  42. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
  43. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/storage/base_sql.py

@@ -4,10 +4,11 @@ from __future__ import annotations
 
 import json
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
-from contextlib import contextmanager
+from collections.abc import Mapping, Sequence
+from contextlib import AbstractContextManager, closing
 from datetime import datetime
-from typing import Any
+from pathlib import Path
+from typing import Any, cast
 
 from evalvault.domain.entities import (
     EvaluationRun,
@@ -181,24 +182,28 @@ class BaseSQLStorageAdapter(ABC):
     # Connection helpers -------------------------------------------------
 
     @abstractmethod
-    def _connect(self):
+    def _connect(self) -> Any:
         """Return a new DB-API compatible connection."""
+        raise NotImplementedError
 
-    @contextmanager
-    def _get_connection(self):
+    def _get_connection(self) -> AbstractContextManager[Any]:
         conn = self._connect()
-        try:
-            yield conn
-        finally:
-            conn.close()
+        if conn is None:
+            raise RuntimeError("Database connection not available")
+        return closing(conn)
 
     def _fetch_lastrowid(self, cursor) -> int:
         return cursor.lastrowid
 
-    def _execute(self, conn, query: str, params: Sequence[Any] | None = None):
+    def _execute(
+        self,
+        conn: Any,
+        query: str,
+        params: Sequence[object] | Mapping[str, object] | None = None,
+    ) -> Any:
         if params is None:
-            params = ()
-        return conn.execute(query, tuple(params))
+            return conn.execute(query)
+        return conn.execute(query, params)
 
     # CRUD helpers -------------------------------------------------------
 
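The connection helpers now hand back a plain contextlib.closing wrapper instead of a generator-based context manager, and _execute forwards mapping parameters as-is rather than coercing everything to a tuple. A minimal sketch of how a concrete adapter plugs into these hooks, assuming a hypothetical sqlite3-backed subclass (names are illustrative; the package's real SQLite adapter lives in sqlite_adapter.py and may differ):

import sqlite3

class DemoSQLiteAdapter(BaseSQLStorageAdapter):
    # Hypothetical subclass used only to illustrate the hooks above;
    # it omits the other abstract members the real adapters implement.
    def __init__(self, db_path: str) -> None:
        self._db_path = db_path

    def _connect(self) -> sqlite3.Connection:
        # _get_connection() wraps whatever this returns in contextlib.closing(),
        # so the connection is closed when the with-block exits.
        conn = sqlite3.connect(self._db_path)
        conn.row_factory = sqlite3.Row
        return conn

# Positional and named parameters both pass straight through _execute:
#   adapter._execute(conn, "SELECT 1 WHERE ? = 1", (1,))
#   adapter._execute(conn, "SELECT 1 WHERE :x = 1", {"x": 1})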
@@ -244,7 +249,7 @@ class BaseSQLStorageAdapter(ABC):
             dataset_name=run_row["dataset_name"],
             dataset_version=run_row["dataset_version"],
             model_name=run_row["model_name"],
-            started_at=self._deserialize_datetime(run_row["started_at"]),
+            started_at=self._deserialize_datetime(run_row["started_at"]) or datetime.now(),
             finished_at=self._deserialize_datetime(run_row["finished_at"]),
             total_tokens=run_row["total_tokens"],
             total_cost_usd=self._maybe_float(run_row["total_cost_usd"]),
@@ -285,7 +290,7 @@ class BaseSQLStorageAdapter(ABC):
     def delete_run(self, run_id: str) -> bool:
         with self._get_connection() as conn:
             cursor = self._execute(conn, self.queries.delete_run(), (run_id,))
-            deleted = cursor.rowcount > 0
+            deleted = (cursor.rowcount or 0) > 0
             conn.commit()
             return deleted
 
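The delete_run change defends against DB-API cursors whose rowcount is unreported; comparing None directly would raise a TypeError. A tiny sketch of the difference, using a stand-in cursor object rather than a real driver:

class _StubCursor:
    # Stand-in for a cursor that cannot report how many rows were deleted.
    rowcount = None

cursor = _StubCursor()
deleted = (cursor.rowcount or 0) > 0  # False, no exception raised
# cursor.rowcount > 0 would raise TypeError: '>' not supported between 'NoneType' and 'int'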
@@ -428,8 +433,33 @@ class BaseSQLStorageAdapter(ABC):
 
     def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
         feedbacks = self.list_feedback(run_id)
-        scores = [f.satisfaction_score for f in feedbacks if f.satisfaction_score is not None]
-        thumbs = [f.thumb_feedback for f in feedbacks if f.thumb_feedback in {"up", "down"}]
+        latest: dict[tuple[str, str | None], SatisfactionFeedback] = {}
+        for feedback in feedbacks:
+            key = (feedback.test_case_id, feedback.rater_id)
+            current = latest.get(key)
+            if current is None:
+                latest[key] = feedback
+                continue
+            current_time = current.created_at or datetime.min
+            feedback_time = feedback.created_at or datetime.min
+            if feedback_time >= current_time:
+                latest[key] = feedback
+
+        effective = [
+            feedback
+            for feedback in latest.values()
+            if feedback.satisfaction_score is not None or feedback.thumb_feedback in {"up", "down"}
+        ]
+        scores = [
+            feedback.satisfaction_score
+            for feedback in effective
+            if feedback.satisfaction_score is not None
+        ]
+        thumbs = [
+            feedback.thumb_feedback
+            for feedback in effective
+            if feedback.thumb_feedback in {"up", "down"}
+        ]
         avg_score = sum(scores) / len(scores) if scores else None
         thumb_up_rate = None
         if thumbs:
@@ -437,7 +467,7 @@
         return FeedbackSummary(
             avg_satisfaction_score=avg_score,
             thumb_up_rate=thumb_up_rate,
-            total_feedback=len(feedbacks),
+            total_feedback=len(effective),
         )
 
     # Serialization helpers --------------------------------------------
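get_feedback_summary now collapses repeated feedback to the latest entry per (test_case_id, rater_id) pair, so a rater who revises a score is counted once and total_feedback reflects the deduplicated set. A small self-contained illustration of that keep-latest rule, using a stand-in dataclass instead of the package's SatisfactionFeedback entity:

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Feedback:  # stand-in with only the fields the logic above touches
    test_case_id: str
    rater_id: str | None
    satisfaction_score: float | None
    thumb_feedback: str | None
    created_at: datetime | None

feedbacks = [
    Feedback("tc-1", "alice", 2.0, None, datetime(2024, 1, 1)),
    Feedback("tc-1", "alice", 4.0, None, datetime(2024, 1, 2)),  # later revision wins
    Feedback("tc-1", "bob", None, "up", datetime(2024, 1, 1)),
]

latest: dict[tuple[str, str | None], Feedback] = {}
for fb in feedbacks:
    key = (fb.test_case_id, fb.rater_id)
    current = latest.get(key)
    if current is None or (fb.created_at or datetime.min) >= (current.created_at or datetime.min):
        latest[key] = fb

scores = [fb.satisfaction_score for fb in latest.values() if fb.satisfaction_score is not None]
print(len(latest), sum(scores) / len(scores))  # 2 effective feedbacks, average score 4.0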
@@ -495,7 +525,7 @@ class BaseSQLStorageAdapter(ABC):
             latency_ms=row["latency_ms"],
             cost_usd=self._maybe_float(row["cost_usd"]),
             trace_id=row["trace_id"],
-            started_at=self._deserialize_datetime(row["started_at"]),
+            started_at=self._deserialize_datetime(row["started_at"]) or datetime.now(),
             finished_at=self._deserialize_datetime(row["finished_at"]),
             question=row["question"],
             answer=row["answer"],
@@ -525,8 +555,8 @@ class BaseSQLStorageAdapter(ABC):
         return [
             MetricScore(
                 name=self._resolve_metric_name(row, metric_column),
-                score=self._maybe_float(self._row_value(row, "score")),
-                threshold=self._maybe_float(self._row_value(row, "threshold")),
+                score=self._maybe_float(self._row_value(row, "score")) or 0.0,
+                threshold=self._maybe_float(self._row_value(row, "threshold")) or 0.7,
                 reason=self._row_value(row, "reason"),
             )
             for row in rows
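Metric rows with NULL scores or thresholds now fall back to 0.0 and 0.7 instead of propagating None into MetricScore. Because the fallback uses Python's or, any falsy value is replaced, so an explicitly stored 0.0 threshold also becomes 0.7; a quick sketch with a stand-in helper:

def maybe_float(value):  # stand-in for the adapter's _maybe_float helper
    return float(value) if value is not None else None

print(maybe_float(None) or 0.0)  # 0.0 -> missing score defaults to 0.0
print(maybe_float(None) or 0.7)  # 0.7 -> missing threshold defaults to 0.7
print(maybe_float(0.0) or 0.7)   # 0.7 -> a stored 0.0 threshold is replaced as well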
@@ -585,3 +615,479 @@ class BaseSQLStorageAdapter(ABC):
                 return row[key]
             except (KeyError, TypeError, IndexError):
                 return None
+
+    def _row_to_mapping(self, row: Any) -> dict[str, Any]:
+        if row is None:
+            return {}
+        if isinstance(row, dict):
+            return dict(row)
+        if hasattr(row, "keys"):
+            return {key: row[key] for key in row}
+        try:
+            return dict(row)
+        except Exception:
+            return {}
+
+    def _coerce_excel_value(self, value: Any, *, force_json: bool = False) -> Any:
+        if force_json:
+            payload = self._deserialize_json(value)
+            if payload is None:
+                return None
+            return json.dumps(payload, ensure_ascii=False)
+        if isinstance(value, (dict, list)):
+            return json.dumps(value, ensure_ascii=False)
+        if isinstance(value, datetime):
+            return value.isoformat()
+        if isinstance(value, bytes):
+            return value.decode("utf-8", errors="replace")
+        return value
+
+    def _normalize_rows(
+        self,
+        rows: Sequence[Any],
+        *,
+        json_columns: set[str] | None = None,
+    ) -> list[dict[str, Any]]:
+        json_columns = json_columns or set()
+        normalized: list[dict[str, Any]] = []
+        for row in rows:
+            payload = self._row_to_mapping(row)
+            for key, value in payload.items():
+                payload[key] = self._coerce_excel_value(
+                    value,
+                    force_json=key in json_columns,
+                )
+            normalized.append(payload)
+        return normalized
+
+    def export_run_to_excel(self, run_id: str, output_path) -> Path:
+        from openpyxl import Workbook
+
+        output = Path(output_path)
+        output.parent.mkdir(parents=True, exist_ok=True)
+
+        placeholder = self.queries.placeholder
+
+        with self._get_connection() as conn:
+            run_row = self._execute(conn, self.queries.select_run(), (run_id,)).fetchone()
+            if not run_row:
+                raise KeyError(f"Run not found: {run_id}")
+
+            run_rows = self._normalize_rows(
+                [run_row],
+                json_columns={"metrics_evaluated", "thresholds", "metadata", "retrieval_metadata"},
+            )
+
+            test_case_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, test_case_id, tokens_used, latency_ms, cost_usd, trace_id, "
+                    "started_at, finished_at, question, answer, contexts, ground_truth "
+                    f"FROM test_case_results WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            test_case_payloads = self._normalize_rows(
+                test_case_rows,
+                json_columns={"contexts"},
+            )
+
+            metric_rows = self._execute(
+                conn,
+                (
+                    "SELECT m.result_id, t.test_case_id, m."
+                    f"{self.queries.metric_name_column} AS metric_name, m.score, m.threshold, m.reason "
+                    "FROM metric_scores m JOIN test_case_results t ON m.result_id = t.id "
+                    f"WHERE t.run_id = {placeholder} ORDER BY m.id"
+                ),
+                (run_id,),
+            ).fetchall()
+            metric_payloads = self._normalize_rows(metric_rows)
+
+            run_prompt_rows = self._execute(
+                conn,
+                (
+                    "SELECT run_id, prompt_set_id, created_at FROM run_prompt_sets "
+                    f"WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            run_prompt_payloads = self._normalize_rows(run_prompt_rows)
+            prompt_set_ids = [row.get("prompt_set_id") for row in run_prompt_payloads if row]
+
+            prompt_sets_payloads: list[dict[str, Any]] = []
+            prompt_set_item_payloads: list[dict[str, Any]] = []
+            prompt_payloads: list[dict[str, Any]] = []
+
+            if prompt_set_ids:
+                placeholders = ", ".join([placeholder] * len(prompt_set_ids))
+                prompt_set_rows = self._execute(
+                    conn,
+                    (
+                        "SELECT prompt_set_id, name, description, metadata, created_at "
+                        f"FROM prompt_sets WHERE prompt_set_id IN ({placeholders})"
+                    ),
+                    prompt_set_ids,
+                ).fetchall()
+                prompt_sets_payloads = self._normalize_rows(
+                    prompt_set_rows,
+                    json_columns={"metadata"},
+                )
+
+                item_rows = self._execute(
+                    conn,
+                    (
+                        "SELECT id, prompt_set_id, prompt_id, role, item_order, metadata "
+                        f"FROM prompt_set_items WHERE prompt_set_id IN ({placeholders})"
+                    ),
+                    prompt_set_ids,
+                ).fetchall()
+                prompt_set_item_payloads = self._normalize_rows(
+                    item_rows,
+                    json_columns={"metadata"},
+                )
+
+                prompt_ids = [row.get("prompt_id") for row in prompt_set_item_payloads if row]
+                if prompt_ids:
+                    prompt_placeholders = ", ".join([placeholder] * len(prompt_ids))
+                    prompt_rows = self._execute(
+                        conn,
+                        (
+                            "SELECT prompt_id, name, kind, content, checksum, source, notes, metadata, created_at "
+                            f"FROM prompts WHERE prompt_id IN ({prompt_placeholders})"
+                        ),
+                        prompt_ids,
+                    ).fetchall()
+                    prompt_payloads = self._normalize_rows(
+                        prompt_rows,
+                        json_columns={"metadata"},
+                    )
+
+            feedback_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at "
+                    f"FROM satisfaction_feedback WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            feedback_payloads = self._normalize_rows(feedback_rows)
+
+            cluster_rows = self._execute(
+                conn,
+                (
+                    "SELECT run_id, map_id, test_case_id, cluster_id, source, metadata, created_at "
+                    f"FROM run_cluster_maps WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            cluster_payloads = self._normalize_rows(cluster_rows, json_columns={"metadata"})
+
+            stage_event_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, stage_id, parent_stage_id, stage_type, stage_name, status, "
+                    "attempt, started_at, finished_at, duration_ms, input_ref, output_ref, attributes, "
+                    "metadata, trace_id, span_id FROM stage_events "
+                    f"WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            stage_event_payloads = self._normalize_rows(
+                stage_event_rows,
+                json_columns={"attributes", "metadata"},
+            )
+
+            stage_metric_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, stage_id, metric_name, score, threshold, evidence "
+                    f"FROM stage_metrics WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            stage_metric_payloads = self._normalize_rows(
+                stage_metric_rows, json_columns={"evidence"}
+            )
+
+            report_rows = self._execute(
+                conn,
+                (
+                    "SELECT report_id, run_id, experiment_id, report_type, format, content, metadata, created_at "
+                    f"FROM analysis_reports WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            report_payloads = self._normalize_rows(report_rows, json_columns={"metadata"})
+
+            pipeline_rows = self._execute(
+                conn,
+                (
+                    "SELECT result_id, intent, query, run_id, pipeline_id, profile, tags, metadata, "
+                    "is_complete, duration_ms, final_output, node_results, started_at, finished_at, created_at "
+                    f"FROM pipeline_results WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            pipeline_payloads = self._normalize_rows(
+                pipeline_rows,
+                json_columns={"tags", "metadata", "final_output", "node_results"},
+            )
+
+        summary_rows: list[dict[str, Any]] = []
+        run_payload = run_rows[0] if run_rows else {}
+        prompt_set_id = None
+        prompt_set_name = None
+        if run_prompt_payloads:
+            prompt_set_id = run_prompt_payloads[0].get("prompt_set_id")
+        if prompt_sets_payloads:
+            prompt_set_name = prompt_sets_payloads[0].get("name")
+        summary_rows.append(
+            {
+                "run_id": run_payload.get("run_id"),
+                "dataset_name": run_payload.get("dataset_name"),
+                "model_name": run_payload.get("model_name"),
+                "started_at": run_payload.get("started_at"),
+                "finished_at": run_payload.get("finished_at"),
+                "total_test_cases": len(test_case_payloads),
+                "total_tokens": run_payload.get("total_tokens"),
+                "total_cost_usd": run_payload.get("total_cost_usd"),
+                "pass_rate": run_payload.get("pass_rate"),
+                "metrics_evaluated": run_payload.get("metrics_evaluated"),
+                "prompt_set_id": prompt_set_id,
+                "prompt_set_name": prompt_set_name,
+            }
+        )
+
+        metric_summary_rows: list[dict[str, Any]] = []
+        metrics_index: dict[str, dict[str, Any]] = {}
+        for row in metric_payloads:
+            metric_name = row.get("metric_name")
+            if not metric_name:
+                continue
+            entry = metrics_index.setdefault(
+                metric_name,
+                {"metric_name": metric_name, "count": 0, "score_sum": 0.0, "pass_count": 0},
+            )
+            score = row.get("score")
+            threshold = row.get("threshold")
+            if isinstance(score, (int, float)):
+                entry["count"] += 1
+                entry["score_sum"] += float(score)
+                if isinstance(threshold, (int, float)) and score >= threshold:
+                    entry["pass_count"] += 1
+
+        for entry in metrics_index.values():
+            count = entry["count"] or 0
+            metric_summary_rows.append(
+                {
+                    "metric_name": entry["metric_name"],
+                    "avg_score": (entry["score_sum"] / count) if count else None,
+                    "pass_rate": (entry["pass_count"] / count) if count else None,
+                    "samples": count,
+                }
+            )
+
+        sheet_order: list[tuple[str, list[dict[str, Any]], list[str]]] = [
+            (
+                "Summary",
+                summary_rows,
+                [
+                    "run_id",
+                    "dataset_name",
+                    "model_name",
+                    "started_at",
+                    "finished_at",
+                    "total_test_cases",
+                    "total_tokens",
+                    "total_cost_usd",
+                    "pass_rate",
+                    "metrics_evaluated",
+                    "prompt_set_id",
+                    "prompt_set_name",
+                ],
+            ),
+            (
+                "Run",
+                run_rows,
+                [
+                    "run_id",
+                    "dataset_name",
+                    "dataset_version",
+                    "model_name",
+                    "started_at",
+                    "finished_at",
+                    "total_tokens",
+                    "total_cost_usd",
+                    "pass_rate",
+                    "metrics_evaluated",
+                    "thresholds",
+                    "langfuse_trace_id",
+                    "metadata",
+                    "retrieval_metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "TestCases",
+                test_case_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "test_case_id",
+                    "tokens_used",
+                    "latency_ms",
+                    "cost_usd",
+                    "trace_id",
+                    "started_at",
+                    "finished_at",
+                    "question",
+                    "answer",
+                    "contexts",
+                    "ground_truth",
+                ],
+            ),
+            (
+                "MetricScores",
+                metric_payloads,
+                ["result_id", "test_case_id", "metric_name", "score", "threshold", "reason"],
+            ),
+            (
+                "MetricsSummary",
+                metric_summary_rows,
+                ["metric_name", "avg_score", "pass_rate", "samples"],
+            ),
+            (
+                "RunPromptSets",
+                run_prompt_payloads,
+                ["run_id", "prompt_set_id", "created_at"],
+            ),
+            (
+                "PromptSets",
+                prompt_sets_payloads,
+                ["prompt_set_id", "name", "description", "metadata", "created_at"],
+            ),
+            (
+                "PromptSetItems",
+                prompt_set_item_payloads,
+                ["id", "prompt_set_id", "prompt_id", "role", "item_order", "metadata"],
+            ),
+            (
+                "Prompts",
+                prompt_payloads,
+                [
+                    "prompt_id",
+                    "name",
+                    "kind",
+                    "content",
+                    "checksum",
+                    "source",
+                    "notes",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "Feedback",
+                feedback_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "test_case_id",
+                    "satisfaction_score",
+                    "thumb_feedback",
+                    "comment",
+                    "rater_id",
+                    "created_at",
+                ],
+            ),
+            (
+                "ClusterMaps",
+                cluster_payloads,
+                [
+                    "run_id",
+                    "map_id",
+                    "test_case_id",
+                    "cluster_id",
+                    "source",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "StageEvents",
+                stage_event_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "stage_id",
+                    "parent_stage_id",
+                    "stage_type",
+                    "stage_name",
+                    "status",
+                    "attempt",
+                    "started_at",
+                    "finished_at",
+                    "duration_ms",
+                    "input_ref",
+                    "output_ref",
+                    "attributes",
+                    "metadata",
+                    "trace_id",
+                    "span_id",
+                ],
+            ),
+            (
+                "StageMetrics",
+                stage_metric_payloads,
+                ["id", "run_id", "stage_id", "metric_name", "score", "threshold", "evidence"],
+            ),
+            (
+                "AnalysisReports",
+                report_payloads,
+                [
+                    "report_id",
+                    "run_id",
+                    "experiment_id",
+                    "report_type",
+                    "format",
+                    "content",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "PipelineResults",
+                pipeline_payloads,
+                [
+                    "result_id",
+                    "intent",
+                    "query",
+                    "run_id",
+                    "pipeline_id",
+                    "profile",
+                    "tags",
+                    "metadata",
+                    "is_complete",
+                    "duration_ms",
+                    "final_output",
+                    "node_results",
+                    "started_at",
+                    "finished_at",
+                    "created_at",
+                ],
+            ),
+        ]
+
+        workbook = Workbook()
+        default_sheet = workbook.active
+        if default_sheet is not None:
+            workbook.remove(default_sheet)
+        for sheet_name, rows, columns in sheet_order:
+            worksheet = cast(Any, workbook.create_sheet(title=sheet_name))
+            worksheet.append(columns)
+            for row in rows:
+                worksheet.append([row.get(column) for column in columns])
+
+        workbook.save(output)
+        return output
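export_run_to_excel gathers every run-scoped table into a single workbook, one sheet per table plus the derived Summary and MetricsSummary sheets. A minimal usage sketch, assuming a configured storage adapter and an existing run id (both illustrative); openpyxl must be installed because the method imports it lazily:

from pathlib import Path

def export_run(storage, run_id: str, out_dir: str = "exports") -> Path:
    # storage is any concrete BaseSQLStorageAdapter (SQLite or Postgres).
    target = Path(out_dir) / f"{run_id}.xlsx"
    return storage.export_run_to_excel(run_id, target)

# Sheets written, in order: Summary, Run, TestCases, MetricScores, MetricsSummary,
# RunPromptSets, PromptSets, PromptSetItems, Prompts, Feedback, ClusterMaps,
# StageEvents, StageMetrics, AnalysisReports, PipelineResults.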