evalvault 1.62.1__py3-none-any.whl → 1.63.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +88 -5
- evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
- evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +528 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -27,6 +27,7 @@ from evalvault.domain.entities.analysis import (
|
|
|
27
27
|
)
|
|
28
28
|
from evalvault.domain.entities.experiment import Experiment
|
|
29
29
|
from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
|
|
30
|
+
from evalvault.domain.entities.stage import StageEvent, StageMetric
|
|
30
31
|
|
|
31
32
|
|
|
32
33
|
class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
@@ -823,6 +824,56 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
823
824
|
)
|
|
824
825
|
conn.commit()
|
|
825
826
|
|
|
827
|
+
def save_analysis_report(
|
|
828
|
+
self,
|
|
829
|
+
*,
|
|
830
|
+
report_id: str | None,
|
|
831
|
+
run_id: str | None,
|
|
832
|
+
experiment_id: str | None,
|
|
833
|
+
report_type: str,
|
|
834
|
+
format: str,
|
|
835
|
+
content: str | None,
|
|
836
|
+
metadata: dict[str, Any] | None = None,
|
|
837
|
+
created_at: str | None = None,
|
|
838
|
+
) -> str:
|
|
839
|
+
report_id = report_id or str(uuid.uuid4())
|
|
840
|
+
if created_at is None:
|
|
841
|
+
created_at_value = datetime.now(UTC)
|
|
842
|
+
else:
|
|
843
|
+
created_at_value = (
|
|
844
|
+
datetime.fromisoformat(created_at) if isinstance(created_at, str) else created_at
|
|
845
|
+
)
|
|
846
|
+
|
|
847
|
+
with self._get_connection() as conn:
|
|
848
|
+
conn.execute(
|
|
849
|
+
"""
|
|
850
|
+
INSERT INTO analysis_reports (
|
|
851
|
+
report_id, run_id, experiment_id, report_type, format, content, metadata, created_at
|
|
852
|
+
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
|
853
|
+
ON CONFLICT (report_id) DO UPDATE SET
|
|
854
|
+
run_id = EXCLUDED.run_id,
|
|
855
|
+
experiment_id = EXCLUDED.experiment_id,
|
|
856
|
+
report_type = EXCLUDED.report_type,
|
|
857
|
+
format = EXCLUDED.format,
|
|
858
|
+
content = EXCLUDED.content,
|
|
859
|
+
metadata = EXCLUDED.metadata,
|
|
860
|
+
created_at = EXCLUDED.created_at
|
|
861
|
+
""",
|
|
862
|
+
(
|
|
863
|
+
report_id,
|
|
864
|
+
run_id,
|
|
865
|
+
experiment_id,
|
|
866
|
+
report_type,
|
|
867
|
+
format,
|
|
868
|
+
content,
|
|
869
|
+
self._serialize_pipeline_json(metadata),
|
|
870
|
+
created_at_value,
|
|
871
|
+
),
|
|
872
|
+
)
|
|
873
|
+
conn.commit()
|
|
874
|
+
|
|
875
|
+
return report_id
|
|
876
|
+
|
|
826
877
|
def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
|
|
827
878
|
"""파이프라인 분석 결과 목록을 조회합니다."""
|
|
828
879
|
query = """
|
|
@@ -837,6 +888,164 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
837
888
|
rows = conn.execute(query, (limit,)).fetchall()
|
|
838
889
|
return [self._deserialize_pipeline_result(row, include_payload=False) for row in rows]
|
|
839
890
|
|
|
891
|
+
def save_stage_events(self, events: list[StageEvent]) -> int:
    """Upsert a batch of stage events into ``stage_events``.

    Every event is turned into a parameter tuple by ``_serialize_stage_event``
    and written through one ``executemany`` call. A row that collides on
    ``(run_id, stage_id)`` has all of its other columns overwritten.

    Args:
        events: Stage events to persist. An empty list is a no-op.

    Returns:
        ``len(events)``, or 0 when there was nothing to write.
    """
    if not events:
        return 0

    upsert_sql = """
        INSERT INTO stage_events (
            run_id, stage_id, parent_stage_id, stage_type, stage_name,
            status, attempt, started_at, finished_at, duration_ms,
            input_ref, output_ref, attributes, metadata, trace_id, span_id
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (run_id, stage_id) DO UPDATE SET
            parent_stage_id = EXCLUDED.parent_stage_id,
            stage_type = EXCLUDED.stage_type,
            stage_name = EXCLUDED.stage_name,
            status = EXCLUDED.status,
            attempt = EXCLUDED.attempt,
            started_at = EXCLUDED.started_at,
            finished_at = EXCLUDED.finished_at,
            duration_ms = EXCLUDED.duration_ms,
            input_ref = EXCLUDED.input_ref,
            output_ref = EXCLUDED.output_ref,
            attributes = EXCLUDED.attributes,
            metadata = EXCLUDED.metadata,
            trace_id = EXCLUDED.trace_id,
            span_id = EXCLUDED.span_id
    """
    with self._get_connection() as conn:
        # Serialize only after the connection is open, as the batch is built
        # as an argument of the executemany call.
        parameter_rows = [self._serialize_stage_event(item) for item in events]
        conn.executemany(upsert_sql, parameter_rows)
        conn.commit()
    return len(events)
|
|
922
|
+
|
|
923
|
+
def list_stage_events(
    self,
    run_id: str,
    *,
    stage_type: str | None = None,
) -> list[StageEvent]:
    """Return the stage events stored for a run, ordered by row ``id``.

    Args:
        run_id: Run whose events are fetched.
        stage_type: When truthy, only events with this ``stage_type`` are
            returned; ``None`` or an empty string disables the filter.

    Returns:
        One ``_deserialize_stage_event`` result per matching row.
    """
    conditions = ["run_id = %s"]
    params: list[Any] = [run_id]
    if stage_type:
        conditions.append("stage_type = %s")
        params.append(stage_type)

    sql = (
        "SELECT run_id, stage_id, parent_stage_id, stage_type, stage_name, status, attempt, "
        "started_at, finished_at, duration_ms, input_ref, output_ref, attributes, metadata, "
        "trace_id, span_id FROM stage_events WHERE "
        + " AND ".join(conditions)
        + " ORDER BY id"
    )

    with self._get_connection() as conn:
        fetched = conn.execute(sql, params).fetchall()
    return [self._deserialize_stage_event(record) for record in fetched]
|
|
942
|
+
|
|
943
|
+
def save_stage_metrics(self, metrics: list[StageMetric]) -> int:
    """Insert a batch of stage metrics into ``stage_metrics``.

    Rows are plain inserts (no conflict clause), written with a single
    ``executemany`` call and committed afterwards.

    Args:
        metrics: Stage metrics to persist. An empty list is a no-op.

    Returns:
        ``len(metrics)``, or 0 when there was nothing to write.
    """
    if not metrics:
        return 0

    insert_sql = """
        INSERT INTO stage_metrics (
            run_id, stage_id, metric_name, score, threshold, evidence
        ) VALUES (%s, %s, %s, %s, %s, %s)
    """
    with self._get_connection() as conn:
        parameter_rows = [self._serialize_stage_metric(item) for item in metrics]
        conn.executemany(insert_sql, parameter_rows)
        conn.commit()
    return len(metrics)
|
|
957
|
+
|
|
958
|
+
def list_stage_metrics(
    self,
    run_id: str,
    *,
    stage_id: str | None = None,
    metric_name: str | None = None,
) -> list[StageMetric]:
    """Return the stage metrics stored for a run, ordered by row ``id``.

    Args:
        run_id: Run whose metrics are fetched.
        stage_id: When truthy, restrict results to this stage.
        metric_name: When truthy, restrict results to this metric name.

    Returns:
        One ``_deserialize_stage_metric`` result per matching row.
    """
    conditions = ["run_id = %s"]
    params: list[Any] = [run_id]
    # Optional filters are appended in a fixed order: stage_id, then metric_name.
    for column, wanted in (("stage_id", stage_id), ("metric_name", metric_name)):
        if wanted:
            conditions.append(f"{column} = %s")
            params.append(wanted)

    sql = (
        "SELECT run_id, stage_id, metric_name, score, threshold, evidence "
        "FROM stage_metrics WHERE " + " AND ".join(conditions) + " ORDER BY id"
    )

    with self._get_connection() as conn:
        fetched = conn.execute(sql, params).fetchall()
    return [self._deserialize_stage_metric(record) for record in fetched]
|
|
980
|
+
|
|
981
|
+
def _serialize_stage_event(self, event: StageEvent) -> tuple[Any, ...]:
|
|
982
|
+
return (
|
|
983
|
+
event.run_id,
|
|
984
|
+
event.stage_id,
|
|
985
|
+
event.parent_stage_id,
|
|
986
|
+
event.stage_type,
|
|
987
|
+
event.stage_name,
|
|
988
|
+
event.status,
|
|
989
|
+
event.attempt,
|
|
990
|
+
event.started_at,
|
|
991
|
+
event.finished_at,
|
|
992
|
+
event.duration_ms,
|
|
993
|
+
self._serialize_payload_ref(event.input_ref),
|
|
994
|
+
self._serialize_payload_ref(event.output_ref),
|
|
995
|
+
self._serialize_pipeline_json(event.attributes),
|
|
996
|
+
self._serialize_pipeline_json(event.metadata),
|
|
997
|
+
event.trace_id,
|
|
998
|
+
event.span_id,
|
|
999
|
+
)
|
|
1000
|
+
|
|
1001
|
+
def _serialize_stage_metric(self, metric: StageMetric) -> tuple[Any, ...]:
|
|
1002
|
+
return (
|
|
1003
|
+
metric.run_id,
|
|
1004
|
+
metric.stage_id,
|
|
1005
|
+
metric.metric_name,
|
|
1006
|
+
metric.score,
|
|
1007
|
+
metric.threshold,
|
|
1008
|
+
self._serialize_pipeline_json(metric.evidence),
|
|
1009
|
+
)
|
|
1010
|
+
|
|
1011
|
+
def _serialize_payload_ref(self, ref: Any) -> str | None:
|
|
1012
|
+
if ref is None:
|
|
1013
|
+
return None
|
|
1014
|
+
payload = ref.to_dict() if hasattr(ref, "to_dict") else ref
|
|
1015
|
+
return self._serialize_pipeline_json(payload)
|
|
1016
|
+
|
|
1017
|
+
def _deserialize_stage_event(self, row: dict[str, Any]) -> StageEvent:
    """Rebuild a ``StageEvent`` from one ``stage_events`` row.

    Scalar columns are copied with ``row.get``. The four JSON columns are
    passed through ``_ensure_json``; ``attributes`` and ``metadata`` fall back
    to an empty dict when that yields a falsy value. The assembled mapping is
    handed to ``StageEvent.from_dict``.
    """
    scalar_columns = (
        "run_id",
        "stage_id",
        "parent_stage_id",
        "stage_type",
        "stage_name",
        "status",
        "attempt",
        "started_at",
        "finished_at",
        "duration_ms",
    )
    payload = {column: row.get(column) for column in scalar_columns}

    for ref_column in ("input_ref", "output_ref"):
        payload[ref_column] = self._ensure_json(row.get(ref_column))
    for mapping_column in ("attributes", "metadata"):
        payload[mapping_column] = self._ensure_json(row.get(mapping_column)) or {}

    payload["trace_id"] = row.get("trace_id")
    payload["span_id"] = row.get("span_id")
    return StageEvent.from_dict(payload)
|
|
1037
|
+
|
|
1038
|
+
def _deserialize_stage_metric(self, row: dict[str, Any]) -> StageMetric:
    """Rebuild a ``StageMetric`` from one ``stage_metrics`` row.

    All columns are copied with ``row.get`` except ``evidence``, which is
    passed through ``_ensure_json`` before the mapping is handed to
    ``StageMetric.from_dict``.
    """
    copied_columns = ("run_id", "stage_id", "metric_name", "score", "threshold")
    payload = {column: row.get(column) for column in copied_columns}
    payload["evidence"] = self._ensure_json(row.get("evidence"))
    return StageMetric.from_dict(payload)
|
|
1048
|
+
|
|
840
1049
|
def get_pipeline_result(self, result_id: str) -> dict[str, Any]:
|
|
841
1050
|
"""저장된 파이프라인 분석 결과를 조회합니다."""
|
|
842
1051
|
with self._get_connection() as conn:
|
|
@@ -206,3 +206,41 @@ CREATE INDEX IF NOT EXISTS idx_pipeline_results_intent
|
|
|
206
206
|
ON pipeline_results(intent);
|
|
207
207
|
CREATE INDEX IF NOT EXISTS idx_pipeline_results_run_id
|
|
208
208
|
ON pipeline_results(run_id);
|
|
209
|
+
|
|
210
|
+
-- Stage event rows; each row references an evaluation run and is removed
-- together with that run (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS stage_events (
    id              BIGSERIAL PRIMARY KEY,
    run_id          UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
    stage_id        TEXT NOT NULL,
    parent_stage_id TEXT,
    stage_type      TEXT NOT NULL,
    stage_name      TEXT,
    status          TEXT,
    attempt         INTEGER DEFAULT 1,
    started_at      TIMESTAMP WITH TIME ZONE,
    finished_at     TIMESTAMP WITH TIME ZONE,
    duration_ms     DOUBLE PRECISION,
    input_ref       JSONB,
    output_ref      JSONB,
    attributes      JSONB,
    metadata        JSONB,
    trace_id        TEXT,
    span_id         TEXT
);

-- At most one row per (run_id, stage_id); an INSERT ... ON CONFLICT (run_id, stage_id)
-- upsert needs a unique index on exactly these columns.
CREATE UNIQUE INDEX IF NOT EXISTS idx_stage_events_run_stage_id
    ON stage_events(run_id, stage_id);
-- Lookup indexes for filtering by run and by stage type.
CREATE INDEX IF NOT EXISTS idx_stage_events_run_id ON stage_events(run_id);
CREATE INDEX IF NOT EXISTS idx_stage_events_stage_type ON stage_events(stage_type);
|
|
234
|
+
|
|
235
|
+
-- Stage metric rows; each row references an evaluation run and is removed
-- together with that run (ON DELETE CASCADE). No uniqueness constraint is
-- declared beyond the surrogate primary key.
CREATE TABLE IF NOT EXISTS stage_metrics (
    id          BIGSERIAL PRIMARY KEY,
    run_id      UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
    stage_id    TEXT NOT NULL,
    metric_name TEXT NOT NULL,
    score       DOUBLE PRECISION NOT NULL,
    threshold   DOUBLE PRECISION,
    evidence    JSONB
);

-- Lookup indexes for filtering by run and by stage.
CREATE INDEX IF NOT EXISTS idx_stage_metrics_run_id ON stage_metrics(run_id);
CREATE INDEX IF NOT EXISTS idx_stage_metrics_stage_id ON stage_metrics(stage_id);
|
|
@@ -4,10 +4,12 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import sqlite3
|
|
7
|
+
import uuid
|
|
8
|
+
from contextlib import AbstractContextManager, closing
|
|
7
9
|
from dataclasses import asdict
|
|
8
10
|
from datetime import datetime
|
|
9
11
|
from pathlib import Path
|
|
10
|
-
from typing import TYPE_CHECKING, Any
|
|
12
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
11
13
|
|
|
12
14
|
from evalvault.adapters.outbound.storage.base_sql import BaseSQLStorageAdapter, SQLQueries
|
|
13
15
|
from evalvault.domain.entities.analysis import (
|
|
@@ -61,15 +63,20 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
61
63
|
conn.commit()
|
|
62
64
|
conn.close()
|
|
63
65
|
|
|
64
|
-
def _connect(self) ->
|
|
66
|
+
def _connect(self) -> Any:
|
|
65
67
|
"""Create a DB-API connection with the expected options."""
|
|
66
68
|
conn = sqlite3.connect(self.db_path)
|
|
67
69
|
conn.row_factory = sqlite3.Row
|
|
68
70
|
conn.execute("PRAGMA foreign_keys = ON")
|
|
69
71
|
return conn
|
|
70
72
|
|
|
71
|
-
def
|
|
73
|
+
def _get_connection(self) -> AbstractContextManager[sqlite3.Connection]:
|
|
74
|
+
conn = self._connect()
|
|
75
|
+
return closing(cast(sqlite3.Connection, conn))
|
|
76
|
+
|
|
77
|
+
def _apply_migrations(self, conn: Any) -> None:
|
|
72
78
|
"""Apply schema migrations for legacy databases."""
|
|
79
|
+
conn = cast(Any, conn)
|
|
73
80
|
cursor = conn.execute("PRAGMA table_info(evaluation_runs)")
|
|
74
81
|
columns = {row[1] for row in cursor.fetchall()}
|
|
75
82
|
if "metadata" not in columns:
|
|
@@ -180,6 +187,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
180
187
|
def save_prompt_set(self, bundle: PromptSetBundle) -> None:
|
|
181
188
|
"""Save prompt set, prompts, and join items."""
|
|
182
189
|
with self._get_connection() as conn:
|
|
190
|
+
conn = cast(Any, conn)
|
|
183
191
|
cursor = conn.cursor()
|
|
184
192
|
cursor.execute(
|
|
185
193
|
"""
|
|
@@ -241,6 +249,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
241
249
|
def link_prompt_set_to_run(self, run_id: str, prompt_set_id: str) -> None:
|
|
242
250
|
"""Attach a prompt set to a run."""
|
|
243
251
|
with self._get_connection() as conn:
|
|
252
|
+
conn = cast(Any, conn)
|
|
244
253
|
conn.execute(
|
|
245
254
|
"""
|
|
246
255
|
INSERT OR REPLACE INTO run_prompt_sets (
|
|
@@ -258,6 +267,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
258
267
|
def get_prompt_set(self, prompt_set_id: str) -> PromptSetBundle:
|
|
259
268
|
"""Load a prompt set bundle by ID."""
|
|
260
269
|
with self._get_connection() as conn:
|
|
270
|
+
conn = cast(Any, conn)
|
|
261
271
|
cursor = conn.execute(
|
|
262
272
|
"""
|
|
263
273
|
SELECT prompt_set_id, name, description, metadata, created_at
|
|
@@ -270,12 +280,17 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
270
280
|
if not row:
|
|
271
281
|
raise KeyError(f"Prompt set not found: {prompt_set_id}")
|
|
272
282
|
|
|
283
|
+
created_at = self._deserialize_datetime(row["created_at"])
|
|
284
|
+
if created_at is None:
|
|
285
|
+
created_at = datetime.now()
|
|
286
|
+
assert created_at is not None
|
|
287
|
+
|
|
273
288
|
prompt_set = PromptSet(
|
|
274
289
|
prompt_set_id=row["prompt_set_id"],
|
|
275
290
|
name=row["name"],
|
|
276
291
|
description=row["description"] or "",
|
|
277
292
|
metadata=json.loads(row["metadata"]) if row["metadata"] else {},
|
|
278
|
-
created_at=
|
|
293
|
+
created_at=created_at,
|
|
279
294
|
)
|
|
280
295
|
|
|
281
296
|
item_rows = conn.execute(
|
|
@@ -313,6 +328,11 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
313
328
|
tuple(prompt_ids),
|
|
314
329
|
).fetchall()
|
|
315
330
|
for prompt_row in prompt_rows:
|
|
331
|
+
created_at = self._deserialize_datetime(prompt_row["created_at"])
|
|
332
|
+
if created_at is None:
|
|
333
|
+
created_at = datetime.now()
|
|
334
|
+
assert created_at is not None
|
|
335
|
+
|
|
316
336
|
prompts.append(
|
|
317
337
|
Prompt(
|
|
318
338
|
prompt_id=prompt_row["prompt_id"],
|
|
@@ -325,7 +345,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
325
345
|
metadata=json.loads(prompt_row["metadata"])
|
|
326
346
|
if prompt_row["metadata"]
|
|
327
347
|
else {},
|
|
328
|
-
created_at=
|
|
348
|
+
created_at=created_at,
|
|
329
349
|
)
|
|
330
350
|
)
|
|
331
351
|
|
|
@@ -334,6 +354,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
334
354
|
def get_prompt_set_for_run(self, run_id: str) -> PromptSetBundle | None:
|
|
335
355
|
"""Load the prompt set linked to a run."""
|
|
336
356
|
with self._get_connection() as conn:
|
|
357
|
+
conn = cast(Any, conn)
|
|
337
358
|
row = conn.execute(
|
|
338
359
|
"""
|
|
339
360
|
SELECT prompt_set_id
|
|
@@ -360,6 +381,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
360
381
|
저장된 experiment의 ID
|
|
361
382
|
"""
|
|
362
383
|
with self._get_connection() as conn:
|
|
384
|
+
conn = cast(Any, conn)
|
|
363
385
|
cursor = conn.cursor()
|
|
364
386
|
# Insert or replace experiment
|
|
365
387
|
cursor.execute(
|
|
@@ -425,6 +447,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
425
447
|
KeyError: 실험을 찾을 수 없는 경우
|
|
426
448
|
"""
|
|
427
449
|
with self._get_connection() as conn:
|
|
450
|
+
conn = cast(Any, conn)
|
|
428
451
|
cursor = conn.cursor()
|
|
429
452
|
# Fetch experiment
|
|
430
453
|
cursor.execute(
|
|
@@ -503,6 +526,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
503
526
|
Experiment 객체 리스트
|
|
504
527
|
"""
|
|
505
528
|
with self._get_connection() as conn:
|
|
529
|
+
conn = cast(Any, conn)
|
|
506
530
|
cursor = conn.cursor()
|
|
507
531
|
query = "SELECT experiment_id FROM experiments WHERE 1=1"
|
|
508
532
|
params = []
|
|
@@ -539,6 +563,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
539
563
|
저장된 analysis의 ID
|
|
540
564
|
"""
|
|
541
565
|
with self._get_connection() as conn:
|
|
566
|
+
conn = cast(Any, conn)
|
|
542
567
|
cursor = conn.cursor()
|
|
543
568
|
# Serialize analysis to JSON
|
|
544
569
|
result_data = self._serialize_analysis(analysis)
|
|
@@ -574,6 +599,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
574
599
|
KeyError: 분석을 찾을 수 없는 경우
|
|
575
600
|
"""
|
|
576
601
|
with self._get_connection() as conn:
|
|
602
|
+
conn = cast(Any, conn)
|
|
577
603
|
cursor = conn.cursor()
|
|
578
604
|
cursor.execute(
|
|
579
605
|
"""
|
|
@@ -612,6 +638,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
612
638
|
StatisticalAnalysis 리스트
|
|
613
639
|
"""
|
|
614
640
|
with self._get_connection() as conn:
|
|
641
|
+
conn = cast(Any, conn)
|
|
615
642
|
cursor = conn.cursor()
|
|
616
643
|
query = """
|
|
617
644
|
SELECT analysis_id, run_id, analysis_type, result_data, created_at
|
|
@@ -650,6 +677,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
650
677
|
삭제 성공 여부
|
|
651
678
|
"""
|
|
652
679
|
with self._get_connection() as conn:
|
|
680
|
+
conn = cast(Any, conn)
|
|
653
681
|
cursor = conn.cursor()
|
|
654
682
|
cursor.execute(
|
|
655
683
|
"DELETE FROM analysis_results WHERE analysis_id = ?",
|
|
@@ -726,6 +754,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
726
754
|
import uuid
|
|
727
755
|
|
|
728
756
|
with self._get_connection() as conn:
|
|
757
|
+
conn = cast(Any, conn)
|
|
729
758
|
cursor = conn.cursor()
|
|
730
759
|
analysis_id = f"nlp-{analysis.run_id}-{uuid.uuid4().hex[:8]}"
|
|
731
760
|
result_data = self._serialize_nlp_analysis(analysis)
|
|
@@ -761,6 +790,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
761
790
|
KeyError: 분석을 찾을 수 없는 경우
|
|
762
791
|
"""
|
|
763
792
|
with self._get_connection() as conn:
|
|
793
|
+
conn = cast(Any, conn)
|
|
764
794
|
cursor = conn.cursor()
|
|
765
795
|
cursor.execute(
|
|
766
796
|
"""
|
|
@@ -788,6 +818,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
788
818
|
NLPAnalysis 또는 None (분석 결과가 없는 경우)
|
|
789
819
|
"""
|
|
790
820
|
with self._get_connection() as conn:
|
|
821
|
+
conn = cast(Any, conn)
|
|
791
822
|
cursor = conn.cursor()
|
|
792
823
|
cursor.execute(
|
|
793
824
|
"""
|
|
@@ -891,6 +922,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
891
922
|
is_complete = 1 if record.get("is_complete", False) else 0
|
|
892
923
|
|
|
893
924
|
with self._get_connection() as conn:
|
|
925
|
+
conn = cast(Any, conn)
|
|
894
926
|
conn.execute(
|
|
895
927
|
"""
|
|
896
928
|
INSERT OR REPLACE INTO pipeline_results (
|
|
@@ -920,6 +952,44 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
920
952
|
)
|
|
921
953
|
conn.commit()
|
|
922
954
|
|
|
955
|
+
def save_analysis_report(
    self,
    *,
    report_id: str | None,
    run_id: str | None,
    experiment_id: str | None,
    report_type: str,
    format: str,
    content: str | None,
    metadata: dict[str, Any] | None = None,
    created_at: str | None = None,
) -> str:
    """Write one row of ``analysis_reports`` (INSERT OR REPLACE) and return its ID.

    Args:
        report_id: Report identifier; a random UUID4 string is generated when falsy.
        run_id: Value for the ``run_id`` column (may be ``None``).
        experiment_id: Value for the ``experiment_id`` column (may be ``None``).
        report_type: Value for the ``report_type`` column.
        format: Value for the ``format`` column.
        content: Report body (may be ``None``).
        metadata: Optional mapping, serialized with ``_serialize_json``.
        created_at: Timestamp string stored as given; when falsy, the current
            local time from ``datetime.now().isoformat()`` is used.

    Returns:
        The report ID that was written.
    """
    if not report_id:
        report_id = str(uuid.uuid4())
    if not created_at:
        created_at = datetime.now().isoformat()

    replace_sql = """
        INSERT OR REPLACE INTO analysis_reports (
            report_id, run_id, experiment_id, report_type, format, content, metadata, created_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """
    with self._get_connection() as raw_conn:
        db = cast(Any, raw_conn)
        row_values = (
            report_id,
            run_id,
            experiment_id,
            report_type,
            format,
            content,
            self._serialize_json(metadata),
            created_at,
        )
        db.execute(replace_sql, row_values)
        db.commit()

    return report_id
|
|
992
|
+
|
|
923
993
|
def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
|
|
924
994
|
"""파이프라인 분석 결과 목록을 조회합니다."""
|
|
925
995
|
query = """
|
|
@@ -931,12 +1001,14 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
931
1001
|
LIMIT ?
|
|
932
1002
|
"""
|
|
933
1003
|
with self._get_connection() as conn:
|
|
1004
|
+
conn = cast(Any, conn)
|
|
934
1005
|
rows = conn.execute(query, (limit,)).fetchall()
|
|
935
1006
|
return [self._deserialize_pipeline_result(row, include_payload=False) for row in rows]
|
|
936
1007
|
|
|
937
1008
|
def get_pipeline_result(self, result_id: str) -> dict[str, Any]:
|
|
938
1009
|
"""저장된 파이프라인 분석 결과를 조회합니다."""
|
|
939
1010
|
with self._get_connection() as conn:
|
|
1011
|
+
conn = cast(Any, conn)
|
|
940
1012
|
row = conn.execute(
|
|
941
1013
|
"""
|
|
942
1014
|
SELECT result_id, intent, query, run_id, pipeline_id,
|
|
@@ -983,6 +1055,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
983
1055
|
def save_stage_event(self, event: StageEvent) -> str:
|
|
984
1056
|
"""단계 이벤트를 저장합니다."""
|
|
985
1057
|
with self._get_connection() as conn:
|
|
1058
|
+
conn = cast(Any, conn)
|
|
986
1059
|
conn.execute(
|
|
987
1060
|
"""
|
|
988
1061
|
INSERT OR REPLACE INTO stage_events (
|
|
@@ -1001,6 +1074,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1001
1074
|
if not events:
|
|
1002
1075
|
return 0
|
|
1003
1076
|
with self._get_connection() as conn:
|
|
1077
|
+
conn = cast(Any, conn)
|
|
1004
1078
|
conn.executemany(
|
|
1005
1079
|
"""
|
|
1006
1080
|
INSERT OR REPLACE INTO stage_events (
|
|
@@ -1034,6 +1108,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1034
1108
|
params.append(stage_type)
|
|
1035
1109
|
query += " ORDER BY id"
|
|
1036
1110
|
with self._get_connection() as conn:
|
|
1111
|
+
conn = cast(Any, conn)
|
|
1037
1112
|
cursor = conn.execute(query, params)
|
|
1038
1113
|
rows = cursor.fetchall()
|
|
1039
1114
|
return [self._deserialize_stage_event(row) for row in rows]
|
|
@@ -1043,6 +1118,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1043
1118
|
if not metrics:
|
|
1044
1119
|
return 0
|
|
1045
1120
|
with self._get_connection() as conn:
|
|
1121
|
+
conn = cast(Any, conn)
|
|
1046
1122
|
conn.executemany(
|
|
1047
1123
|
"""
|
|
1048
1124
|
INSERT INTO stage_metrics (
|
|
@@ -1076,6 +1152,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1076
1152
|
params.append(metric_name)
|
|
1077
1153
|
query += " ORDER BY id"
|
|
1078
1154
|
with self._get_connection() as conn:
|
|
1155
|
+
conn = cast(Any, conn)
|
|
1079
1156
|
cursor = conn.execute(query, params)
|
|
1080
1157
|
rows = cursor.fetchall()
|
|
1081
1158
|
return [self._deserialize_stage_metric(row) for row in rows]
|
|
@@ -1155,6 +1232,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1155
1232
|
|
|
1156
1233
|
def save_benchmark_run(self, run: BenchmarkRun) -> str:
|
|
1157
1234
|
with self._get_connection() as conn:
|
|
1235
|
+
conn = cast(Any, conn)
|
|
1158
1236
|
task_scores_json = json.dumps(
|
|
1159
1237
|
[
|
|
1160
1238
|
{
|
|
@@ -1208,6 +1286,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1208
1286
|
)
|
|
1209
1287
|
|
|
1210
1288
|
with self._get_connection() as conn:
|
|
1289
|
+
conn = cast(Any, conn)
|
|
1211
1290
|
cursor = conn.execute(
|
|
1212
1291
|
"""
|
|
1213
1292
|
SELECT run_id, benchmark_type, model_name, backend, tasks,
|
|
@@ -1288,6 +1367,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1288
1367
|
params.append(limit)
|
|
1289
1368
|
|
|
1290
1369
|
with self._get_connection() as conn:
|
|
1370
|
+
conn = cast(Any, conn)
|
|
1291
1371
|
cursor = conn.execute(query, params)
|
|
1292
1372
|
run_ids = [row["run_id"] for row in cursor.fetchall()]
|
|
1293
1373
|
|
|
@@ -1295,6 +1375,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
1295
1375
|
|
|
1296
1376
|
def delete_benchmark_run(self, run_id: str) -> bool:
|
|
1297
1377
|
with self._get_connection() as conn:
|
|
1378
|
+
conn = cast(Any, conn)
|
|
1298
1379
|
cursor = conn.execute(
|
|
1299
1380
|
"DELETE FROM benchmark_runs WHERE run_id = ?",
|
|
1300
1381
|
(run_id,),
|
evalvault/debug_ragas.py
CHANGED
|
@@ -5,6 +5,9 @@ from unittest.mock import MagicMock
|
|
|
5
5
|
from ragas import SingleTurnSample
|
|
6
6
|
from ragas.metrics import AnswerRelevancy, Faithfulness
|
|
7
7
|
|
|
8
|
+
from evalvault.adapters.outbound.llm import SettingsLLMFactory
|
|
9
|
+
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
|
|
10
|
+
from evalvault.config.settings import Settings
|
|
8
11
|
from evalvault.domain.services.evaluator import RagasEvaluator
|
|
9
12
|
from evalvault.ports.outbound.llm_port import LLMPort
|
|
10
13
|
|
|
@@ -29,7 +32,10 @@ async def debug_ragas():
|
|
|
29
32
|
# Actually, Ragas metrics execute validation on `score` or `ascore`.
|
|
30
33
|
# Failing at LLM call (e.g. no auth) is different from failing at argument passing.
|
|
31
34
|
|
|
32
|
-
|
|
35
|
+
settings = Settings()
|
|
36
|
+
llm_factory = SettingsLLMFactory(settings)
|
|
37
|
+
korean_toolkit = try_create_korean_toolkit()
|
|
38
|
+
evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
|
|
33
39
|
|
|
34
40
|
# Create sample similar to what we observed
|
|
35
41
|
sample = SingleTurnSample(
|
evalvault/debug_ragas_real.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import logging
|
|
3
3
|
|
|
4
|
+
from evalvault.adapters.outbound.llm import SettingsLLMFactory
|
|
4
5
|
from evalvault.adapters.outbound.llm.openai_adapter import OpenAIAdapter
|
|
6
|
+
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
|
|
5
7
|
from evalvault.config.settings import get_settings
|
|
6
8
|
from evalvault.domain.entities.dataset import Dataset, TestCase
|
|
7
9
|
from evalvault.domain.services.evaluator import RagasEvaluator
|
|
@@ -25,7 +27,9 @@ async def debug_ragas_real():
|
|
|
25
27
|
print(f"Using Model: {settings.openai_model}")
|
|
26
28
|
|
|
27
29
|
llm = OpenAIAdapter(settings)
|
|
28
|
-
|
|
30
|
+
llm_factory = SettingsLLMFactory(settings)
|
|
31
|
+
korean_toolkit = try_create_korean_toolkit()
|
|
32
|
+
evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
|
|
29
33
|
|
|
30
34
|
# Manual Dataset
|
|
31
35
|
test_case = TestCase(
|
|
@@ -37,6 +37,12 @@ from evalvault.domain.entities.improvement import (
|
|
|
37
37
|
from evalvault.domain.entities.kg import EntityModel, RelationModel
|
|
38
38
|
from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
|
|
39
39
|
from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
|
|
40
|
+
from evalvault.domain.entities.prompt_suggestion import (
|
|
41
|
+
PromptCandidate,
|
|
42
|
+
PromptCandidateSampleScore,
|
|
43
|
+
PromptCandidateScore,
|
|
44
|
+
PromptSuggestionResult,
|
|
45
|
+
)
|
|
40
46
|
from evalvault.domain.entities.rag_trace import (
|
|
41
47
|
GenerationData,
|
|
42
48
|
RAGTraceData,
|
|
@@ -110,6 +116,10 @@ __all__ = [
|
|
|
110
116
|
"PromptSet",
|
|
111
117
|
"PromptSetBundle",
|
|
112
118
|
"PromptSetItem",
|
|
119
|
+
"PromptCandidate",
|
|
120
|
+
"PromptCandidateSampleScore",
|
|
121
|
+
"PromptCandidateScore",
|
|
122
|
+
"PromptSuggestionResult",
|
|
113
123
|
# RAG Trace
|
|
114
124
|
"GenerationData",
|
|
115
125
|
"RAGTraceData",
|