evalvault 1.72.0__py3-none-any.whl → 1.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/routers/pipeline.py +6 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +40 -1
- evalvault/adapters/inbound/cli/commands/pipeline.py +100 -0
- evalvault/adapters/inbound/cli/commands/regress.py +96 -0
- evalvault/adapters/inbound/cli/commands/stage.py +217 -24
- evalvault/adapters/outbound/analysis/__init__.py +4 -0
- evalvault/adapters/outbound/analysis/dataset_feature_analyzer_module.py +458 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +1 -0
- evalvault/adapters/outbound/analysis/statistical_adapter.py +12 -6
- evalvault/adapters/outbound/improvement/pattern_detector.py +4 -0
- evalvault/adapters/outbound/storage/base_sql.py +160 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +132 -8
- evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
- evalvault/adapters/outbound/storage/schema.sql +18 -1
- evalvault/adapters/outbound/storage/sqlite_adapter.py +115 -1
- evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +23 -1
- evalvault/config/settings.py +2 -1
- evalvault/domain/entities/analysis.py +1 -0
- evalvault/domain/entities/analysis_pipeline.py +1 -0
- evalvault/domain/entities/stage.py +13 -0
- evalvault/domain/services/intent_classifier.py +13 -0
- evalvault/domain/services/pipeline_template_registry.py +22 -0
- evalvault/ports/outbound/storage_port.py +32 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/METADATA +2 -1
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/RECORD +28 -27
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/WHEEL +0 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/storage/base_sql.py
CHANGED

```diff
@@ -249,6 +249,17 @@ class SQLQueries:
     def list_runs_ordering(self) -> str:
         return f" ORDER BY started_at DESC LIMIT {self.placeholder}"
 
+    def upsert_regression_baseline(self) -> str:
+        raise NotImplementedError("Override in subclass")
+
+    def select_regression_baseline(self) -> str:
+        return f"""
+            SELECT baseline_key, run_id, dataset_name, branch, commit_sha, metadata,
+                   created_at, updated_at
+            FROM regression_baselines
+            WHERE baseline_key = {self.placeholder}
+        """
+
 
 class BaseSQLStorageAdapter(ABC):
     """Shared serialization and SQL helpers for DB-API based adapters."""
```
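Both query builders lean on the existing `placeholder` mechanism: `select_regression_baseline` is shared verbatim across engines, while `upsert_regression_baseline` must be overridden because upsert syntax is dialect-specific. A quick sketch of how the shared template renders (assuming the base class defaults to the `?` marker that the SQLite adapter appears to rely on):

```python
# Sketch only: SQLiteQueries is defined in the sqlite_adapter hunk further
# down; the '?' default placeholder is an assumption, not confirmed here.
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteQueries

sql = SQLiteQueries().select_regression_baseline()
assert "WHERE baseline_key = ?" in sql  # same template, dialect-specific marker
```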
```diff
@@ -589,6 +600,54 @@ class BaseSQLStorageAdapter(ABC):
             total_feedback=len(effective),
         )
 
+    def set_regression_baseline(
+        self,
+        baseline_key: str,
+        run_id: str,
+        *,
+        dataset_name: str | None = None,
+        branch: str | None = None,
+        commit_sha: str | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        now = self._serialize_datetime(datetime.now())
+        with self._get_connection() as conn:
+            self._execute(
+                conn,
+                self.queries.upsert_regression_baseline(),
+                (
+                    baseline_key,
+                    run_id,
+                    dataset_name,
+                    branch,
+                    commit_sha,
+                    self._serialize_json(metadata),
+                    now,
+                    now,
+                ),
+            )
+            conn.commit()
+
+    def get_regression_baseline(self, baseline_key: str) -> dict[str, Any] | None:
+        with self._get_connection() as conn:
+            row = self._execute(
+                conn,
+                self.queries.select_regression_baseline(),
+                (baseline_key,),
+            ).fetchone()
+            if not row:
+                return None
+            return {
+                "baseline_key": self._row_value(row, "baseline_key"),
+                "run_id": str(self._row_value(row, "run_id")),
+                "dataset_name": self._row_value(row, "dataset_name"),
+                "branch": self._row_value(row, "branch"),
+                "commit_sha": self._row_value(row, "commit_sha"),
+                "metadata": self._deserialize_json(self._row_value(row, "metadata")),
+                "created_at": self._row_value(row, "created_at"),
+                "updated_at": self._row_value(row, "updated_at"),
+            }
+
     # Serialization helpers --------------------------------------------
 
     def _run_params(self, run: EvaluationRun) -> Sequence[Any]:
```
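Together these give every DB-API adapter a small key-value baseline store, presumably backing the new `commands/regress.py` listed above. A hypothetical CI usage sketch; the constructor call and the key scheme are assumptions, only the two method names come from this hunk:

```python
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

storage = SQLiteStorageAdapter(db_path="data/db/evalvault.db")
run_id = "0f3c..."  # must reference an existing row in evaluation_runs (FK)

# Pin the current run as the baseline for this dataset/branch pair.
storage.set_regression_baseline(
    "insurance-qa:main",
    run_id,
    dataset_name="insurance-qa",
    branch="main",
    commit_sha="abc1234",
    metadata={"approved_by": "ci"},
)

baseline = storage.get_regression_baseline("insurance-qa:main")
if baseline is not None:  # returns None rather than raising when absent
    print(baseline["run_id"], baseline["updated_at"])
```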
```diff
@@ -1302,6 +1361,107 @@ class BaseSQLStorageAdapter(ABC):
         workbook.save(output)
         return output
 
+    def export_analysis_results_to_excel(self, run_id: str, output_path) -> Path:
+        from openpyxl import Workbook
+
+        output = Path(output_path)
+        output.parent.mkdir(parents=True, exist_ok=True)
+        placeholder = self.queries.placeholder
+
+        with self._get_connection() as conn:
+            analysis_rows = self._execute(
+                conn,
+                (
+                    "SELECT analysis_id, run_id, analysis_type, result_data, created_at "
+                    f"FROM analysis_results WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            analysis_payloads = self._normalize_rows(
+                analysis_rows,
+                json_columns={"result_data"},
+            )
+
+            report_rows = self._execute(
+                conn,
+                (
+                    "SELECT report_id, run_id, experiment_id, report_type, format, content, metadata, created_at "
+                    f"FROM analysis_reports WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            report_payloads = self._normalize_rows(report_rows, json_columns={"metadata"})
+
+            pipeline_rows = self._execute(
+                conn,
+                (
+                    "SELECT result_id, intent, query, run_id, pipeline_id, profile, tags, metadata, "
+                    "is_complete, duration_ms, final_output, node_results, started_at, finished_at, created_at "
+                    f"FROM pipeline_results WHERE run_id = {placeholder} ORDER BY created_at DESC"
+                ),
+                (run_id,),
+            ).fetchall()
+            pipeline_payloads = self._normalize_rows(
+                pipeline_rows,
+                json_columns={"tags", "metadata", "final_output", "node_results"},
+            )
+
+        sheet_order: list[tuple[str, list[dict[str, Any]], list[str]]] = [
+            (
+                "AnalysisResults",
+                analysis_payloads,
+                ["analysis_id", "run_id", "analysis_type", "result_data", "created_at"],
+            ),
+            (
+                "AnalysisReports",
+                report_payloads,
+                [
+                    "report_id",
+                    "run_id",
+                    "experiment_id",
+                    "report_type",
+                    "format",
+                    "content",
+                    "metadata",
+                    "created_at",
+                ],
+            ),
+            (
+                "PipelineResults",
+                pipeline_payloads,
+                [
+                    "result_id",
+                    "intent",
+                    "query",
+                    "run_id",
+                    "pipeline_id",
+                    "profile",
+                    "tags",
+                    "metadata",
+                    "is_complete",
+                    "duration_ms",
+                    "final_output",
+                    "node_results",
+                    "started_at",
+                    "finished_at",
+                    "created_at",
+                ],
+            ),
+        ]
+
+        workbook = Workbook()
+        default_sheet = workbook.active
+        if default_sheet is not None:
+            workbook.remove(default_sheet)
+        for sheet_name, rows, columns in sheet_order:
+            worksheet = cast(Any, workbook.create_sheet(title=sheet_name))
+            worksheet.append(columns)
+            for row in rows:
+                worksheet.append([row.get(column) for column in columns])
+
+        workbook.save(output)
+        return output
+
     def export_multiturn_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook
 
```
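The new export mirrors the existing Excel exporters: one worksheet per table, a header row from the column list, then one row per record with JSON columns already normalized. Continuing the earlier sketch (sheet names are taken verbatim from `sheet_order`):

```python
from openpyxl import load_workbook

path = storage.export_analysis_results_to_excel(run_id, "reports/analysis.xlsx")
workbook = load_workbook(path)
print(workbook.sheetnames)  # ['AnalysisResults', 'AnalysisReports', 'PipelineResults']
```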
evalvault/adapters/outbound/storage/postgres_adapter.py
CHANGED

```diff
@@ -11,6 +11,7 @@ from typing import Any
 import psycopg
 from psycopg.rows import dict_row
 
+from evalvault.adapters.outbound.analysis.pipeline_helpers import to_serializable
 from evalvault.adapters.outbound.storage.base_sql import BaseSQLStorageAdapter, SQLQueries
 from evalvault.domain.entities.analysis import (
     AnalysisType,
```
```diff
@@ -30,6 +31,31 @@ from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle,
 from evalvault.domain.entities.stage import StageEvent, StageMetric
 
 
+class PostgresQueries(SQLQueries):
+    def __init__(self) -> None:
+        super().__init__(
+            placeholder="%s",
+            metric_name_column="name",
+            test_case_returning_clause="RETURNING id",
+            feedback_returning_clause="RETURNING id",
+        )
+
+    def upsert_regression_baseline(self) -> str:
+        return """
+            INSERT INTO regression_baselines (
+                baseline_key, run_id, dataset_name, branch, commit_sha, metadata,
+                created_at, updated_at
+            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+            ON CONFLICT (baseline_key) DO UPDATE SET
+                run_id = EXCLUDED.run_id,
+                dataset_name = EXCLUDED.dataset_name,
+                branch = EXCLUDED.branch,
+                commit_sha = EXCLUDED.commit_sha,
+                metadata = EXCLUDED.metadata,
+                updated_at = EXCLUDED.updated_at
+            """
+
+
 class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
     """PostgreSQL-based storage adapter for evaluation results.
 
```
```diff
@@ -56,14 +82,7 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             password: Database password
             connection_string: Full connection string (overrides other params if provided)
         """
-        super().__init__(
-            SQLQueries(
-                placeholder="%s",
-                metric_name_column="name",
-                test_case_returning_clause="RETURNING id",
-                feedback_returning_clause="RETURNING id",
-            )
-        )
+        super().__init__(PostgresQueries())
         if connection_string:
             self._conn_string = connection_string
         else:
```
```diff
@@ -304,6 +323,29 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             conn.execute(
                 "CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name)"
             )
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS regression_baselines (
+                    baseline_key TEXT PRIMARY KEY,
+                    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+                    dataset_name VARCHAR(255),
+                    branch TEXT,
+                    commit_sha VARCHAR(64),
+                    metadata JSONB,
+                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+                    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+                )
+                """
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_baselines_run_id ON regression_baselines(run_id)"
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_baselines_dataset ON regression_baselines(dataset_name)"
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_baselines_updated_at ON regression_baselines(updated_at DESC)"
+            )
 
     # Prompt set methods
 
```
```diff
@@ -720,6 +762,39 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             conn.commit()
         return analysis.analysis_id
 
+    def save_analysis_result(
+        self,
+        *,
+        run_id: str,
+        analysis_type: str,
+        result_data: dict[str, Any],
+        analysis_id: str | None = None,
+    ) -> str:
+        """Save an analysis result as JSON."""
+        analysis_id = analysis_id or f"analysis-{analysis_type}-{run_id}-{uuid.uuid4().hex[:8]}"
+        payload = to_serializable(result_data)
+
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                INSERT INTO analysis_results (
+                    analysis_id, run_id, analysis_type, result_data, created_at
+                ) VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (analysis_id) DO UPDATE SET
+                    result_data = EXCLUDED.result_data,
+                    created_at = EXCLUDED.created_at
+                """,
+                (
+                    analysis_id,
+                    run_id,
+                    analysis_type,
+                    json.dumps(payload, ensure_ascii=False),
+                    datetime.now(UTC),
+                ),
+            )
+            conn.commit()
+        return analysis_id
+
     def get_analysis(self, analysis_id: str) -> StatisticalAnalysis:
         """Retrieve an analysis result."""
         with self._get_connection() as conn:
```
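Since the generated id carries a random `uuid4` suffix, repeated saves insert fresh rows by default; the `ON CONFLICT (analysis_id) DO UPDATE` clause only behaves as an overwrite when the caller pins the id. A hypothetical idempotent re-run, reusing the adapter sketch from earlier:

```python
# analysis_id pinned by the caller -> a second save updates the same row.
aid = storage.save_analysis_result(
    run_id=run_id,
    analysis_type="statistical",
    result_data={"mean_score": 0.82, "p_value": 0.03},  # illustrative payload
    analysis_id=f"analysis-statistical-{run_id}",
)
```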
```diff
@@ -816,6 +891,55 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             conn.commit()
         return analysis_id
 
+    def save_dataset_feature_analysis(
+        self,
+        *,
+        run_id: str,
+        result_data: dict[str, Any],
+        analysis_id: str | None = None,
+    ) -> str:
+        """Save a dataset feature analysis result."""
+        analysis_id = analysis_id or f"dataset-features-{run_id}-{uuid.uuid4().hex[:8]}"
+
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                INSERT INTO analysis_results (
+                    analysis_id, run_id, analysis_type, result_data, created_at
+                ) VALUES (%s, %s, %s, %s, %s)
+                ON CONFLICT (analysis_id) DO UPDATE SET
+                    result_data = EXCLUDED.result_data,
+                    created_at = EXCLUDED.created_at
+                """,
+                (
+                    analysis_id,
+                    run_id,
+                    AnalysisType.DATASET_FEATURES.value,
+                    json.dumps(result_data, ensure_ascii=False),
+                    datetime.now(UTC),
+                ),
+            )
+            conn.commit()
+        return analysis_id
+
+    def get_dataset_feature_analysis(self, analysis_id: str) -> dict[str, Any]:
+        """Retrieve a dataset feature analysis result."""
+        with self._get_connection() as conn:
+            cursor = conn.execute(
+                """
+                SELECT result_data
+                FROM analysis_results
+                WHERE analysis_id = %s AND analysis_type = %s
+                """,
+                (analysis_id, AnalysisType.DATASET_FEATURES.value),
+            )
+            row = cursor.fetchone()
+
+            if not row:
+                raise KeyError(f"Dataset feature analysis not found: {analysis_id}")
+
+            return self._ensure_json(row["result_data"])
+
     def get_nlp_analysis(self, analysis_id: str) -> NLPAnalysis:
         """Retrieve an NLP analysis result."""
         with self._get_connection() as conn:
```
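Note the asymmetry with the baseline helpers in `base_sql.py`: `get_regression_baseline` returns `None` for a missing key, while `get_dataset_feature_analysis` raises `KeyError`. Callers that treat absence as normal need the exception form:

```python
try:
    features = storage.get_dataset_feature_analysis(analysis_id)
except KeyError:
    features = None  # no dataset feature analysis stored under this id
```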
evalvault/adapters/outbound/storage/postgres_schema.sql
CHANGED

```diff
@@ -304,3 +304,18 @@ CREATE TABLE IF NOT EXISTS stage_metrics (
 
 CREATE INDEX IF NOT EXISTS idx_stage_metrics_run_id ON stage_metrics(run_id);
 CREATE INDEX IF NOT EXISTS idx_stage_metrics_stage_id ON stage_metrics(stage_id);
+
+CREATE TABLE IF NOT EXISTS regression_baselines (
+    baseline_key TEXT PRIMARY KEY,
+    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+    dataset_name VARCHAR(255),
+    branch TEXT,
+    commit_sha VARCHAR(64),
+    metadata JSONB,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_baselines_run_id ON regression_baselines(run_id);
+CREATE INDEX IF NOT EXISTS idx_baselines_dataset ON regression_baselines(dataset_name);
+CREATE INDEX IF NOT EXISTS idx_baselines_updated_at ON regression_baselines(updated_at DESC);
```
evalvault/adapters/outbound/storage/schema.sql
CHANGED

```diff
@@ -245,7 +245,7 @@ CREATE INDEX IF NOT EXISTS idx_group_runs_group_id ON experiment_group_runs(group_id);
 CREATE TABLE IF NOT EXISTS analysis_results (
     analysis_id TEXT PRIMARY KEY,
     run_id TEXT NOT NULL,
-    analysis_type TEXT NOT NULL,  -- 'statistical', 'nlp', 'causal', 'data_quality'
+    analysis_type TEXT NOT NULL,  -- 'statistical', 'nlp', 'causal', 'data_quality', 'dataset_features'
     result_data TEXT NOT NULL,  -- JSON serialized analysis result
     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
     FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
```
```diff
@@ -360,3 +360,20 @@ CREATE TABLE IF NOT EXISTS benchmark_runs (
 CREATE INDEX IF NOT EXISTS idx_benchmark_runs_type ON benchmark_runs(benchmark_type);
 CREATE INDEX IF NOT EXISTS idx_benchmark_runs_model ON benchmark_runs(model_name);
 CREATE INDEX IF NOT EXISTS idx_benchmark_runs_created_at ON benchmark_runs(created_at DESC);
+
+-- Regression baselines table for CI/CD integration
+CREATE TABLE IF NOT EXISTS regression_baselines (
+    baseline_key TEXT PRIMARY KEY,
+    run_id TEXT NOT NULL,
+    dataset_name TEXT,
+    branch TEXT,
+    commit_sha TEXT,
+    metadata TEXT,  -- JSON metadata
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
+);
+
+CREATE INDEX IF NOT EXISTS idx_baselines_run_id ON regression_baselines(run_id);
+CREATE INDEX IF NOT EXISTS idx_baselines_dataset ON regression_baselines(dataset_name);
+CREATE INDEX IF NOT EXISTS idx_baselines_updated_at ON regression_baselines(updated_at DESC);
```
evalvault/adapters/outbound/storage/sqlite_adapter.py
CHANGED

```diff
@@ -11,6 +11,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast
 
+from evalvault.adapters.outbound.analysis.pipeline_helpers import to_serializable
 from evalvault.adapters.outbound.storage.base_sql import BaseSQLStorageAdapter, SQLQueries
 from evalvault.domain.entities.analysis import (
     AnalysisType,
```
```diff
@@ -33,6 +34,16 @@ if TYPE_CHECKING:
     from evalvault.domain.entities.benchmark_run import BenchmarkRun
 
 
+class SQLiteQueries(SQLQueries):
+    def upsert_regression_baseline(self) -> str:
+        return """
+            INSERT OR REPLACE INTO regression_baselines (
+                baseline_key, run_id, dataset_name, branch, commit_sha, metadata,
+                created_at, updated_at
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            """
+
+
 class SQLiteStorageAdapter(BaseSQLStorageAdapter):
     """SQLite-based storage adapter for evaluation results.
 
```
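Unlike the Postgres `ON CONFLICT ... DO UPDATE` above, which leaves `created_at` untouched on conflict, SQLite's `INSERT OR REPLACE` deletes and rewrites the whole row; this is why the shared `set_regression_baseline` supplies both timestamps on every call, and why `created_at` effectively resets on re-baselining under SQLite. A self-contained demonstration:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (k TEXT PRIMARY KEY, v TEXT, created_at TEXT)")
conn.execute("INSERT OR REPLACE INTO t VALUES ('a', 'v1', '2024-01-01')")
conn.execute("INSERT OR REPLACE INTO t VALUES ('a', 'v2', '2024-06-01')")
print(conn.execute("SELECT * FROM t").fetchall())  # [('a', 'v2', '2024-06-01')]
```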
```diff
@@ -45,7 +56,7 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
         Args:
             db_path: Path to SQLite database file (default: data/db/evalvault.db)
         """
-        super().__init__(
+        super().__init__(SQLiteQueries())
         self.db_path = Path(db_path)
         self.db_path.parent.mkdir(parents=True, exist_ok=True)
         self._init_db()
```
```diff
@@ -247,6 +258,28 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
                 """
             )
 
+            baseline_cursor = conn.execute("PRAGMA table_info(regression_baselines)")
+            baseline_columns = {row[1] for row in baseline_cursor.fetchall()}
+            if not baseline_columns:
+                conn.executescript(
+                    """
+                    CREATE TABLE IF NOT EXISTS regression_baselines (
+                        baseline_key TEXT PRIMARY KEY,
+                        run_id TEXT NOT NULL,
+                        dataset_name TEXT,
+                        branch TEXT,
+                        commit_sha TEXT,
+                        metadata TEXT,
+                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                        updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                        FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
+                    );
+                    CREATE INDEX IF NOT EXISTS idx_baselines_run_id ON regression_baselines(run_id);
+                    CREATE INDEX IF NOT EXISTS idx_baselines_dataset ON regression_baselines(dataset_name);
+                    CREATE INDEX IF NOT EXISTS idx_baselines_updated_at ON regression_baselines(updated_at DESC);
+                    """
+                )
+
     # Prompt set methods
 
     def save_prompt_set(self, bundle: PromptSetBundle) -> None:
```
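`PRAGMA table_info` yields zero rows for a nonexistent table, so the empty column set doubles as a cheap existence probe before running the creation script:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
columns = {row[1] for row in conn.execute("PRAGMA table_info(regression_baselines)")}
print(columns)  # set() -> table is absent, safe to run the CREATE script
```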
```diff
@@ -651,6 +684,38 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             conn.commit()
         return analysis.analysis_id
 
+    def save_analysis_result(
+        self,
+        *,
+        run_id: str,
+        analysis_type: str,
+        result_data: dict[str, Any],
+        analysis_id: str | None = None,
+    ) -> str:
+        """Save an analysis result as JSON."""
+        analysis_id = analysis_id or f"analysis-{analysis_type}-{run_id}-{uuid.uuid4().hex[:8]}"
+        payload = to_serializable(result_data)
+
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                INSERT OR REPLACE INTO analysis_results (
+                    analysis_id, run_id, analysis_type, result_data, created_at
+                ) VALUES (?, ?, ?, ?, ?)
+                """,
+                (
+                    analysis_id,
+                    run_id,
+                    analysis_type,
+                    json.dumps(payload, ensure_ascii=False),
+                    datetime.now().isoformat(),
+                ),
+            )
+            conn.commit()
+        return analysis_id
+
     def get_analysis(self, analysis_id: str) -> StatisticalAnalysis:
         """Retrieve an analysis result.
 
```
```diff
@@ -842,6 +907,55 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             conn.commit()
         return analysis_id
 
+    def save_dataset_feature_analysis(
+        self,
+        *,
+        run_id: str,
+        result_data: dict[str, Any],
+        analysis_id: str | None = None,
+    ) -> str:
+        """Save a dataset feature analysis result."""
+        analysis_id = analysis_id or f"dataset-features-{run_id}-{uuid.uuid4().hex[:8]}"
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                INSERT OR REPLACE INTO analysis_results (
+                    analysis_id, run_id, analysis_type, result_data, created_at
+                ) VALUES (?, ?, ?, ?, ?)
+                """,
+                (
+                    analysis_id,
+                    run_id,
+                    AnalysisType.DATASET_FEATURES.value,
+                    json.dumps(result_data, ensure_ascii=False),
+                    datetime.now().isoformat(),
+                ),
+            )
+            conn.commit()
+        return analysis_id
+
+    def get_dataset_feature_analysis(self, analysis_id: str) -> dict[str, Any]:
+        """Retrieve a dataset feature analysis result."""
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                SELECT analysis_id, result_data
+                FROM analysis_results
+                WHERE analysis_id = ? AND analysis_type = ?
+                """,
+                (analysis_id, AnalysisType.DATASET_FEATURES.value),
+            )
+            row = cursor.fetchone()
+
+            if not row:
+                raise KeyError(f"Dataset feature analysis not found: {analysis_id}")
+
+            return json.loads(row[1])
+
     def get_nlp_analysis(self, analysis_id: str) -> NLPAnalysis:
         """Retrieve an NLP analysis result."""
         with self._get_connection() as conn:
```
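A round trip through the two new methods; the payload shape here is illustrative (the real structure comes from the new `dataset_feature_analyzer_module.py`, which this excerpt does not show):

```python
aid = storage.save_dataset_feature_analysis(
    run_id=run_id,
    result_data={"n_samples": 120, "question_length": {"mean": 54.2}},  # assumed shape
)
features = storage.get_dataset_feature_analysis(aid)
assert features["n_samples"] == 120
```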
evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py
CHANGED

```diff
@@ -21,6 +21,18 @@ class OpenRagTraceConfig:
     spec_version: str = "0.1"
     module_attribute: str = "rag.module"
     custom_prefix: str = "custom."
+    allowed_modules: tuple[str, ...] = (
+        "ingest",
+        "chunk",
+        "embed",
+        "retrieve",
+        "rerank",
+        "prompt",
+        "llm",
+        "postprocess",
+        "eval",
+        "cache",
+    )
 
 
 class _NoOpSpan:
@@ -126,10 +138,20 @@ class OpenRagTraceAdapter:
         attributes: Mapping[str, Any] | None,
     ) -> None:
         span.set_attribute("spec.version", self._config.spec_version)
-        span.set_attribute(self._config.module_attribute, module)
+        span.set_attribute(self._config.module_attribute, self._normalize_module(module))
         if attributes:
             self.set_span_attributes(span, attributes)
 
+    def _normalize_module(self, module: str) -> str:
+        normalized = str(module).strip().lower()
+        if not normalized:
+            return f"{self._config.custom_prefix}unknown"
+        if normalized in self._config.allowed_modules:
+            return normalized
+        if normalized.startswith(self._config.custom_prefix):
+            return normalized
+        return f"{self._config.custom_prefix}{normalized}"
+
 
 def _coerce_attribute_value(value: Any) -> Any:
     if value is None:
```
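The normalizer funnels arbitrary module names into the OpenRAG module vocabulary: known names pass through lowercased, already-prefixed names are kept, and anything else lands under `custom.`. A standalone mirror of the branch logic, for illustration only:

```python
ALLOWED = {"ingest", "chunk", "embed", "retrieve", "rerank",
           "prompt", "llm", "postprocess", "eval", "cache"}

def normalize(module: str, prefix: str = "custom.") -> str:
    # Mirror of _normalize_module, outside the adapter, for demonstration.
    normalized = str(module).strip().lower()
    if not normalized:
        return f"{prefix}unknown"
    if normalized in ALLOWED or normalized.startswith(prefix):
        return normalized
    return f"{prefix}{normalized}"

assert normalize("Retrieve ") == "retrieve"           # trimmed, lowered, allowed
assert normalize("custom.router") == "custom.router"  # already namespaced
assert normalize("guardrail") == "custom.guardrail"   # unknown -> prefixed
assert normalize("") == "custom.unknown"              # empty -> fallback
```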
evalvault/config/settings.py
CHANGED
```diff
@@ -428,6 +428,7 @@ def apply_profile(settings: Settings, profile_name: str) -> Settings:
     if not normalized:
         return settings
 
+    model_config = None
     try:
         model_config = get_model_config()
         profile = model_config.get_profile(normalized)
@@ -459,7 +460,7 @@ def apply_profile(settings: Settings, profile_name: str) -> Settings:
             f"to use profile '{normalized}'."
         ) from exc
     except KeyError as exc:
-        available = ", ".join(sorted(model_config.profiles.keys()))
+        available = ", ".join(sorted(model_config.profiles.keys())) if model_config else ""
        raise ValueError(
            f"Unknown profile '{normalized}'. Available profiles: {available}"
        ) from exc
```
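These two hunks close a latent `UnboundLocalError`: when `get_model_config()` itself raises `KeyError`, `model_config` was never bound, so the old handler crashed while formatting its error message. A minimal reproduction of the pattern being fixed (illustrative names, not EvalVault code):

```python
def apply(name: str):
    try:
        cfg = load_config()           # hypothetical; may raise KeyError itself
        return cfg.get_profile(name)  # may also raise KeyError
    except KeyError:
        # If load_config() raised, `cfg` was never assigned, so this line
        # raises UnboundLocalError instead of the intended ValueError.
        raise ValueError(f"unknown profile; available: {cfg.profiles}")
```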
evalvault/domain/entities/analysis_pipeline.py
CHANGED

```diff
@@ -41,6 +41,7 @@ class AnalysisIntent(str, Enum):
     ANALYZE_TRENDS = "analyze_trends"
     ANALYZE_STATISTICAL = "analyze_statistical"
     ANALYZE_NLP = "analyze_nlp"
+    ANALYZE_DATASET_FEATURES = "analyze_dataset_features"
     ANALYZE_CAUSAL = "analyze_causal"
     ANALYZE_NETWORK = "analyze_network"
     ANALYZE_PLAYBOOK = "analyze_playbook"
```
evalvault/domain/entities/stage.py
CHANGED

```diff
@@ -54,6 +54,19 @@ class StageEvent:
     span_id: str | None = None
 
     def __post_init__(self) -> None:
+        if not isinstance(self.attributes, dict):
+            raise ValueError("StageEvent requires attributes dict")
+        if not isinstance(self.metadata, dict):
+            raise ValueError("StageEvent requires metadata dict")
+        self.stage_type = str(self.stage_type).strip().lower()
+        if not self.stage_type:
+            raise ValueError("StageEvent requires non-empty 'stage_type'")
+        if self.attempt < 1:
+            raise ValueError("StageEvent requires attempt >= 1")
+        if self.duration_ms is not None and self.duration_ms < 0:
+            raise ValueError("StageEvent requires non-negative duration_ms")
+        if self.started_at and self.finished_at and self.finished_at < self.started_at:
+            raise ValueError("StageEvent requires finished_at >= started_at")
         if self.duration_ms is None and self.started_at and self.finished_at:
             delta = self.finished_at - self.started_at
             self.duration_ms = delta.total_seconds() * 1000
```
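The checks run before the existing duration back-fill, so malformed events fail fast and `stage_type` is canonicalized much like module names in the tracer above. A stub with only the validated fields (the real `StageEvent` carries more, per the hunk context):

```python
from dataclasses import dataclass, field

@dataclass
class StageEventStub:  # stand-in with only the fields the new checks touch
    stage_type: str
    attempt: int = 1
    attributes: dict = field(default_factory=dict)
    metadata: dict = field(default_factory=dict)

    def __post_init__(self) -> None:
        self.stage_type = str(self.stage_type).strip().lower()
        if not self.stage_type:
            raise ValueError("StageEvent requires non-empty 'stage_type'")
        if self.attempt < 1:
            raise ValueError("StageEvent requires attempt >= 1")

assert StageEventStub("  Retrieve ").stage_type == "retrieve"
```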
evalvault/domain/services/intent_classifier.py
CHANGED

```diff
@@ -181,6 +181,19 @@ class IntentKeywordRegistry:
             "토픽",
            "topic",
        }
+        self._keywords[AnalysisIntent.ANALYZE_DATASET_FEATURES] = {
+            "데이터셋",
+            "dataset",
+            "특성",
+            "feature",
+            "features",
+            "분포",
+            "distribution",
+            "상관",
+            "correlation",
+            "중요도",
+            "importance",
+        }
        self._keywords[AnalysisIntent.ANALYZE_CAUSAL] = {
            "인과",
            "causal",
```