evalvault-1.62.1-py3-none-any.whl → evalvault-1.63.1-py3-none-any.whl
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +88 -5
- evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
- evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +528 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
--- evalvault/adapters/outbound/storage/base_sql.py (1.62.1)
+++ evalvault/adapters/outbound/storage/base_sql.py (1.63.1)
@@ -4,10 +4,11 @@ from __future__ import annotations
 
 import json
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
-from contextlib import
+from collections.abc import Mapping, Sequence
+from contextlib import AbstractContextManager, closing
 from datetime import datetime
-from
+from pathlib import Path
+from typing import Any, cast
 
 from evalvault.domain.entities import (
     EvaluationRun,
@@ -181,24 +182,28 @@ class BaseSQLStorageAdapter(ABC):
     # Connection helpers -------------------------------------------------
 
     @abstractmethod
-    def _connect(self):
+    def _connect(self) -> Any:
         """Return a new DB-API compatible connection."""
+        raise NotImplementedError
 
-
-    def _get_connection(self):
+    def _get_connection(self) -> AbstractContextManager[Any]:
         conn = self._connect()
-
-
-
-        conn.close()
+        if conn is None:
+            raise RuntimeError("Database connection not available")
+        return closing(conn)
 
     def _fetch_lastrowid(self, cursor) -> int:
         return cursor.lastrowid
 
-    def _execute(
+    def _execute(
+        self,
+        conn: Any,
+        query: str,
+        params: Sequence[object] | Mapping[str, object] | None = None,
+    ) -> Any:
         if params is None:
-
-            return conn.execute(query,
+            return conn.execute(query)
+        return conn.execute(query, params)
 
     # CRUD helpers -------------------------------------------------------
 
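For readers unfamiliar with the pattern, `contextlib.closing` is what makes the new `_get_connection()` return value usable in a `with` statement: it wraps any object with a `close()` method and calls it on exit, even if the body raises. A minimal standalone sketch (using `sqlite3` purely for illustration, not the adapter's actual wiring):

```python
import sqlite3
from contextlib import closing
from typing import Any


def get_connection() -> Any:
    # Mirrors the diff: fail fast if the driver produced no connection,
    # otherwise return a context manager that closes it on exit.
    conn = sqlite3.connect(":memory:")
    if conn is None:
        raise RuntimeError("Database connection not available")
    return closing(conn)


with get_connection() as conn:
    conn.execute("SELECT 1")  # conn.close() runs even if this raises
```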
@@ -244,7 +249,7 @@ class BaseSQLStorageAdapter(ABC):
             dataset_name=run_row["dataset_name"],
             dataset_version=run_row["dataset_version"],
             model_name=run_row["model_name"],
-            started_at=self._deserialize_datetime(run_row["started_at"]),
+            started_at=self._deserialize_datetime(run_row["started_at"]) or datetime.now(),
             finished_at=self._deserialize_datetime(run_row["finished_at"]),
             total_tokens=run_row["total_tokens"],
             total_cost_usd=self._maybe_float(run_row["total_cost_usd"]),
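The `or datetime.now()` fallback above (repeated for test-case rows later in the diff) keeps `started_at` non-optional even when the stored value is NULL or unparseable. A sketch of the behavior, with a simplified stand-in for `_deserialize_datetime` (the real helper's parsing rules are not shown in this diff):

```python
from datetime import datetime


def deserialize_datetime(value: str | None) -> datetime | None:
    # Simplified stand-in; the real helper may accept more formats.
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        return None


started_at = deserialize_datetime(None) or datetime.now()  # falls back to "now"
```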
@@ -285,7 +290,7 @@ class BaseSQLStorageAdapter(ABC):
     def delete_run(self, run_id: str) -> bool:
         with self._get_connection() as conn:
             cursor = self._execute(conn, self.queries.delete_run(), (run_id,))
-            deleted = cursor.rowcount > 0
+            deleted = (cursor.rowcount or 0) > 0
             conn.commit()
             return deleted
 
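The `(cursor.rowcount or 0) > 0` guard matters because DB-API 2.0 allows `rowcount` to be -1 when the affected-row count is unknown, and some drivers report `None`; comparing `None` to an int raises a `TypeError` in Python 3. A minimal sketch of the guard:

```python
rowcount = None  # e.g. a driver that cannot report affected rows
deleted = (rowcount or 0) > 0  # False, instead of TypeError: '>' not supported
```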
@@ -428,8 +433,33 @@ class BaseSQLStorageAdapter(ABC):
 
     def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
         feedbacks = self.list_feedback(run_id)
-
-
+        latest: dict[tuple[str, str | None], SatisfactionFeedback] = {}
+        for feedback in feedbacks:
+            key = (feedback.test_case_id, feedback.rater_id)
+            current = latest.get(key)
+            if current is None:
+                latest[key] = feedback
+                continue
+            current_time = current.created_at or datetime.min
+            feedback_time = feedback.created_at or datetime.min
+            if feedback_time >= current_time:
+                latest[key] = feedback
+
+        effective = [
+            feedback
+            for feedback in latest.values()
+            if feedback.satisfaction_score is not None or feedback.thumb_feedback in {"up", "down"}
+        ]
+        scores = [
+            feedback.satisfaction_score
+            for feedback in effective
+            if feedback.satisfaction_score is not None
+        ]
+        thumbs = [
+            feedback.thumb_feedback
+            for feedback in effective
+            if feedback.thumb_feedback in {"up", "down"}
+        ]
         avg_score = sum(scores) / len(scores) if scores else None
         thumb_up_rate = None
         if thumbs:
@@ -437,7 +467,7 @@ class BaseSQLStorageAdapter(ABC):
         return FeedbackSummary(
             avg_satisfaction_score=avg_score,
             thumb_up_rate=thumb_up_rate,
-            total_feedback=len(
+            total_feedback=len(effective),
         )
 
     # Serialization helpers --------------------------------------------
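Putting the two feedback hunks together: the summary now deduplicates to the latest feedback per `(test_case_id, rater_id)` pair before aggregating, so a rater who revises a rating no longer counts twice. A self-contained sketch of the same aggregation (the dataclass is illustrative, and the thumb-up formula is an assumption; the hunk only shows that `thumb_up_rate` is computed when `thumbs` is non-empty):

```python
from dataclasses import dataclass
from datetime import datetime


@dataclass
class Feedback:  # illustrative stand-in for SatisfactionFeedback
    test_case_id: str
    rater_id: str | None
    satisfaction_score: float | None
    thumb_feedback: str | None
    created_at: datetime | None


def summarize(feedbacks: list[Feedback]) -> tuple[float | None, float | None, int]:
    # Keep only the newest feedback per (test_case_id, rater_id) pair.
    latest: dict[tuple[str, str | None], Feedback] = {}
    for fb in feedbacks:
        key = (fb.test_case_id, fb.rater_id)
        cur = latest.get(key)
        if cur is None or (fb.created_at or datetime.min) >= (cur.created_at or datetime.min):
            latest[key] = fb
    effective = [
        f for f in latest.values()
        if f.satisfaction_score is not None or f.thumb_feedback in {"up", "down"}
    ]
    scores = [f.satisfaction_score for f in effective if f.satisfaction_score is not None]
    thumbs = [f.thumb_feedback for f in effective if f.thumb_feedback in {"up", "down"}]
    avg_score = sum(scores) / len(scores) if scores else None
    thumb_up_rate = thumbs.count("up") / len(thumbs) if thumbs else None  # assumed formula
    return avg_score, thumb_up_rate, len(effective)
```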
@@ -495,7 +525,7 @@ class BaseSQLStorageAdapter(ABC):
             latency_ms=row["latency_ms"],
             cost_usd=self._maybe_float(row["cost_usd"]),
             trace_id=row["trace_id"],
-            started_at=self._deserialize_datetime(row["started_at"]),
+            started_at=self._deserialize_datetime(row["started_at"]) or datetime.now(),
             finished_at=self._deserialize_datetime(row["finished_at"]),
             question=row["question"],
             answer=row["answer"],
@@ -525,8 +555,8 @@ class BaseSQLStorageAdapter(ABC):
         return [
             MetricScore(
                 name=self._resolve_metric_name(row, metric_column),
-                score=self._maybe_float(self._row_value(row, "score")),
-                threshold=self._maybe_float(self._row_value(row, "threshold")),
+                score=self._maybe_float(self._row_value(row, "score")) or 0.0,
+                threshold=self._maybe_float(self._row_value(row, "threshold")) or 0.7,
                 reason=self._row_value(row, "reason"),
             )
             for row in rows
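One subtlety in the coalescing above: because `or` tests truthiness rather than `is None`, a stored score of `0.0` and a NULL both land on the `0.0` default (harmless here), while a NULL threshold becomes `0.7`, presumably the project's default passing bar. A sketch of the behavior:

```python
def maybe_float(value):
    # Stand-in for _maybe_float: None passes through, everything else coerces.
    return None if value is None else float(value)


score = maybe_float(None) or 0.0      # -> 0.0
threshold = maybe_float(None) or 0.7  # -> 0.7
threshold = maybe_float(0.0) or 0.7   # -> 0.7 as well: `or` is truthiness-based
```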
@@ -585,3 +615,480 @@ class BaseSQLStorageAdapter(ABC):
             return row[key]
         except (KeyError, TypeError, IndexError):
             return None
+
+    def _row_to_mapping(self, row: Any) -> dict[str, Any]:
+        if row is None:
+            return {}
+        if isinstance(row, dict):
+            return dict(row)
+        if hasattr(row, "keys"):
+            keys = row.keys()
+            return {key: row[key] for key in keys}
+        try:
+            return dict(row)
+        except Exception:
+            return {}
+
+    def _coerce_excel_value(self, value: Any, *, force_json: bool = False) -> Any:
+        if force_json:
+            payload = self._deserialize_json(value)
+            if payload is None:
+                return None
+            return json.dumps(payload, ensure_ascii=False)
+        if isinstance(value, (dict, list)):
+            return json.dumps(value, ensure_ascii=False)
+        if isinstance(value, datetime):
+            return value.isoformat()
+        if isinstance(value, bytes):
+            return value.decode("utf-8", errors="replace")
+        return value
+
+    def _normalize_rows(
+        self,
+        rows: Sequence[Any],
+        *,
+        json_columns: set[str] | None = None,
+    ) -> list[dict[str, Any]]:
+        json_columns = json_columns or set()
+        normalized: list[dict[str, Any]] = []
+        for row in rows:
+            payload = self._row_to_mapping(row)
+            for key, value in payload.items():
+                payload[key] = self._coerce_excel_value(
+                    value,
+                    force_json=key in json_columns,
+                )
+            normalized.append(payload)
+        return normalized
+
+    def export_run_to_excel(self, run_id: str, output_path) -> Path:
+        from openpyxl import Workbook
+
+        output = Path(output_path)
+        output.parent.mkdir(parents=True, exist_ok=True)
+
+        placeholder = self.queries.placeholder
+
+        with self._get_connection() as conn:
+            run_row = self._execute(conn, self.queries.select_run(), (run_id,)).fetchone()
+            if not run_row:
+                raise KeyError(f"Run not found: {run_id}")
+
+            run_rows = self._normalize_rows(
+                [run_row],
+                json_columns={"metrics_evaluated", "thresholds", "metadata", "retrieval_metadata"},
+            )
+
+            test_case_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, test_case_id, tokens_used, latency_ms, cost_usd, trace_id, "
+                    "started_at, finished_at, question, answer, contexts, ground_truth "
+                    f"FROM test_case_results WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            test_case_payloads = self._normalize_rows(
+                test_case_rows,
+                json_columns={"contexts"},
+            )
+
+            metric_rows = self._execute(
+                conn,
+                (
+                    "SELECT m.result_id, t.test_case_id, m."
+                    f"{self.queries.metric_name_column} AS metric_name, m.score, m.threshold, m.reason "
+                    "FROM metric_scores m JOIN test_case_results t ON m.result_id = t.id "
+                    f"WHERE t.run_id = {placeholder} ORDER BY m.id"
+                ),
+                (run_id,),
+            ).fetchall()
+            metric_payloads = self._normalize_rows(metric_rows)
+
+            run_prompt_rows = self._execute(
+                conn,
+                (
+                    "SELECT run_id, prompt_set_id, created_at FROM run_prompt_sets "
+                    f"WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            run_prompt_payloads = self._normalize_rows(run_prompt_rows)
+            prompt_set_ids = [row.get("prompt_set_id") for row in run_prompt_payloads if row]
+
+            prompt_sets_payloads: list[dict[str, Any]] = []
+            prompt_set_item_payloads: list[dict[str, Any]] = []
+            prompt_payloads: list[dict[str, Any]] = []
+
+            if prompt_set_ids:
+                placeholders = ", ".join([placeholder] * len(prompt_set_ids))
+                prompt_set_rows = self._execute(
+                    conn,
+                    (
+                        "SELECT prompt_set_id, name, description, metadata, created_at "
+                        f"FROM prompt_sets WHERE prompt_set_id IN ({placeholders})"
+                    ),
+                    prompt_set_ids,
+                ).fetchall()
+                prompt_sets_payloads = self._normalize_rows(
+                    prompt_set_rows,
+                    json_columns={"metadata"},
+                )
+
+                item_rows = self._execute(
+                    conn,
+                    (
+                        "SELECT id, prompt_set_id, prompt_id, role, item_order, metadata "
+                        f"FROM prompt_set_items WHERE prompt_set_id IN ({placeholders})"
+                    ),
+                    prompt_set_ids,
+                ).fetchall()
+                prompt_set_item_payloads = self._normalize_rows(
+                    item_rows,
+                    json_columns={"metadata"},
+                )
+
+                prompt_ids = [row.get("prompt_id") for row in prompt_set_item_payloads if row]
+                if prompt_ids:
+                    prompt_placeholders = ", ".join([placeholder] * len(prompt_ids))
+                    prompt_rows = self._execute(
+                        conn,
+                        (
+                            "SELECT prompt_id, name, kind, content, checksum, source, notes, metadata, created_at "
+                            f"FROM prompts WHERE prompt_id IN ({prompt_placeholders})"
+                        ),
+                        prompt_ids,
+                    ).fetchall()
+                    prompt_payloads = self._normalize_rows(
+                        prompt_rows,
+                        json_columns={"metadata"},
+                    )
+
+            feedback_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at "
+                    f"FROM satisfaction_feedback WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            feedback_payloads = self._normalize_rows(feedback_rows)
+
+            cluster_rows = self._execute(
+                conn,
+                (
+                    "SELECT run_id, map_id, test_case_id, cluster_id, source, metadata, created_at "
+                    f"FROM run_cluster_maps WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            cluster_payloads = self._normalize_rows(cluster_rows, json_columns={"metadata"})
+
+            stage_event_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, stage_id, parent_stage_id, stage_type, stage_name, status, "
+                    "attempt, started_at, finished_at, duration_ms, input_ref, output_ref, attributes, "
+                    "metadata, trace_id, span_id FROM stage_events "
+                    f"WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            stage_event_payloads = self._normalize_rows(
+                stage_event_rows,
+                json_columns={"attributes", "metadata"},
+            )
+
+            stage_metric_rows = self._execute(
+                conn,
+                (
+                    "SELECT id, run_id, stage_id, metric_name, score, threshold, evidence "
+                    f"FROM stage_metrics WHERE run_id = {placeholder} ORDER BY id"
+                ),
+                (run_id,),
+            ).fetchall()
+            stage_metric_payloads = self._normalize_rows(
+                stage_metric_rows, json_columns={"evidence"}
+            )
+
+            report_rows = self._execute(
+                conn,
+                (
+                    "SELECT report_id, run_id, experiment_id, report_type, format, content, metadata, created_at "
+                    f"FROM analysis_reports WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            report_payloads = self._normalize_rows(report_rows, json_columns={"metadata"})
+
+            pipeline_rows = self._execute(
+                conn,
+                (
+                    "SELECT result_id, intent, query, run_id, pipeline_id, profile, tags, metadata, "
+                    "is_complete, duration_ms, final_output, node_results, started_at, finished_at, created_at "
+                    f"FROM pipeline_results WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            pipeline_payloads = self._normalize_rows(
+                pipeline_rows,
+                json_columns={"tags", "metadata", "final_output", "node_results"},
+            )
+
+        summary_rows: list[dict[str, Any]] = []
+        run_payload = run_rows[0] if run_rows else {}
+        prompt_set_id = None
+        prompt_set_name = None
+        if run_prompt_payloads:
+            prompt_set_id = run_prompt_payloads[0].get("prompt_set_id")
+        if prompt_sets_payloads:
+            prompt_set_name = prompt_sets_payloads[0].get("name")
+        summary_rows.append(
+            {
+                "run_id": run_payload.get("run_id"),
+                "dataset_name": run_payload.get("dataset_name"),
+                "model_name": run_payload.get("model_name"),
+                "started_at": run_payload.get("started_at"),
+                "finished_at": run_payload.get("finished_at"),
+                "total_test_cases": len(test_case_payloads),
+                "total_tokens": run_payload.get("total_tokens"),
+                "total_cost_usd": run_payload.get("total_cost_usd"),
+                "pass_rate": run_payload.get("pass_rate"),
+                "metrics_evaluated": run_payload.get("metrics_evaluated"),
+                "prompt_set_id": prompt_set_id,
+                "prompt_set_name": prompt_set_name,
+            }
+        )
+
+        metric_summary_rows: list[dict[str, Any]] = []
+        metrics_index: dict[str, dict[str, Any]] = {}
+        for row in metric_payloads:
+            metric_name = row.get("metric_name")
+            if not metric_name:
+                continue
+            entry = metrics_index.setdefault(
+                metric_name,
+                {"metric_name": metric_name, "count": 0, "score_sum": 0.0, "pass_count": 0},
+            )
+            score = row.get("score")
+            threshold = row.get("threshold")
+            if isinstance(score, (int, float)):
+                entry["count"] += 1
+                entry["score_sum"] += float(score)
+                if isinstance(threshold, (int, float)) and score >= threshold:
+                    entry["pass_count"] += 1
+
+        for entry in metrics_index.values():
+            count = entry["count"] or 0
+            metric_summary_rows.append(
+                {
+                    "metric_name": entry["metric_name"],
+                    "avg_score": (entry["score_sum"] / count) if count else None,
+                    "pass_rate": (entry["pass_count"] / count) if count else None,
+                    "samples": count,
+                }
+            )
+
+        sheet_order: list[tuple[str, list[dict[str, Any]], list[str]]] = [
+            (
+                "Summary",
+                summary_rows,
+                [
+                    "run_id",
+                    "dataset_name",
+                    "model_name",
+                    "started_at",
+                    "finished_at",
+                    "total_test_cases",
+                    "total_tokens",
+                    "total_cost_usd",
+                    "pass_rate",
+                    "metrics_evaluated",
+                    "prompt_set_id",
+                    "prompt_set_name",
+                ],
+            ),
+            (
+                "Run",
+                run_rows,
+                [
+                    "run_id",
+                    "dataset_name",
+                    "dataset_version",
+                    "model_name",
+                    "started_at",
+                    "finished_at",
+                    "total_tokens",
+                    "total_cost_usd",
+                    "pass_rate",
+                    "metrics_evaluated",
+                    "thresholds",
+                    "langfuse_trace_id",
+                    "metadata",
+                    "retrieval_metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "TestCases",
+                test_case_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "test_case_id",
+                    "tokens_used",
+                    "latency_ms",
+                    "cost_usd",
+                    "trace_id",
+                    "started_at",
+                    "finished_at",
+                    "question",
+                    "answer",
+                    "contexts",
+                    "ground_truth",
+                ],
+            ),
+            (
+                "MetricScores",
+                metric_payloads,
+                ["result_id", "test_case_id", "metric_name", "score", "threshold", "reason"],
+            ),
+            (
+                "MetricsSummary",
+                metric_summary_rows,
+                ["metric_name", "avg_score", "pass_rate", "samples"],
+            ),
+            (
+                "RunPromptSets",
+                run_prompt_payloads,
+                ["run_id", "prompt_set_id", "created_at"],
+            ),
+            (
+                "PromptSets",
+                prompt_sets_payloads,
+                ["prompt_set_id", "name", "description", "metadata", "created_at"],
+            ),
+            (
+                "PromptSetItems",
+                prompt_set_item_payloads,
+                ["id", "prompt_set_id", "prompt_id", "role", "item_order", "metadata"],
+            ),
+            (
+                "Prompts",
+                prompt_payloads,
+                [
+                    "prompt_id",
+                    "name",
+                    "kind",
+                    "content",
+                    "checksum",
+                    "source",
+                    "notes",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "Feedback",
+                feedback_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "test_case_id",
+                    "satisfaction_score",
+                    "thumb_feedback",
+                    "comment",
+                    "rater_id",
+                    "created_at",
+                ],
+            ),
+            (
+                "ClusterMaps",
+                cluster_payloads,
+                [
+                    "run_id",
+                    "map_id",
+                    "test_case_id",
+                    "cluster_id",
+                    "source",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "StageEvents",
+                stage_event_payloads,
+                [
+                    "id",
+                    "run_id",
+                    "stage_id",
+                    "parent_stage_id",
+                    "stage_type",
+                    "stage_name",
+                    "status",
+                    "attempt",
+                    "started_at",
+                    "finished_at",
+                    "duration_ms",
+                    "input_ref",
+                    "output_ref",
+                    "attributes",
+                    "metadata",
+                    "trace_id",
+                    "span_id",
+                ],
+            ),
+            (
+                "StageMetrics",
+                stage_metric_payloads,
+                ["id", "run_id", "stage_id", "metric_name", "score", "threshold", "evidence"],
+            ),
+            (
+                "AnalysisReports",
+                report_payloads,
+                [
+                    "report_id",
+                    "run_id",
+                    "experiment_id",
+                    "report_type",
+                    "format",
+                    "content",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "PipelineResults",
+                pipeline_payloads,
+                [
+                    "result_id",
+                    "intent",
+                    "query",
+                    "run_id",
+                    "pipeline_id",
+                    "profile",
+                    "tags",
+                    "metadata",
+                    "is_complete",
+                    "duration_ms",
+                    "final_output",
+                    "node_results",
+                    "started_at",
+                    "finished_at",
+                    "created_at",
+                ],
+            ),
+        ]
+
+        workbook = Workbook()
+        default_sheet = workbook.active
+        if default_sheet is not None:
+            workbook.remove(default_sheet)
+        for sheet_name, rows, columns in sheet_order:
+            worksheet = cast(Any, workbook.create_sheet(title=sheet_name))
+            worksheet.append(columns)
+            for row in rows:
+                worksheet.append([row.get(column) for column in columns])
+
+        workbook.save(output)
+        return output
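A hypothetical end-to-end use of the new export. The adapter class name and constructor arguments below are assumptions inferred from the changed file names (`sqlite_adapter.py` appears in the diff, but its class and signature do not), so treat this purely as a sketch:

```python
# Assumed names: SQLiteStorageAdapter and db_path are not confirmed by the diff.
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

adapter = SQLiteStorageAdapter(db_path="evalvault.db")
workbook_path = adapter.export_run_to_excel("run-123", "exports/run-123.xlsx")
# One sheet per table: Summary, Run, TestCases, MetricScores, MetricsSummary,
# RunPromptSets, PromptSets, PromptSetItems, Prompts, Feedback, ClusterMaps,
# StageEvents, StageMetrics, AnalysisReports, PipelineResults.
print(workbook_path)
```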