evalvault 1.73.2__py3-none-any.whl → 1.75.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. evalvault/adapters/inbound/api/adapter.py +66 -17
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +604 -37
  4. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  5. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  6. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  7. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  8. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  9. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  10. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  13. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  14. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  15. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  16. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  17. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  18. evalvault/adapters/inbound/cli/commands/method.py +1 -2
  19. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  20. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  22. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  23. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  24. evalvault/adapters/inbound/cli/commands/run.py +42 -31
  25. evalvault/adapters/inbound/cli/commands/run_helpers.py +24 -15
  26. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  27. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  28. evalvault/adapters/inbound/mcp/tools.py +11 -8
  29. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  30. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  31. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  32. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  33. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  35. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  36. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  37. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  38. evalvault/adapters/outbound/ops/__init__.py +5 -0
  39. evalvault/adapters/outbound/ops/report_renderer.py +159 -0
  40. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  41. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  42. evalvault/adapters/outbound/storage/factory.py +53 -0
  43. evalvault/adapters/outbound/storage/postgres_adapter.py +90 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
  45. evalvault/adapters/outbound/storage/schema.sql +14 -0
  46. evalvault/adapters/outbound/storage/sqlite_adapter.py +77 -0
  47. evalvault/config/settings.py +31 -7
  48. evalvault/domain/entities/ops_report.py +40 -0
  49. evalvault/domain/services/domain_learning_hook.py +2 -1
  50. evalvault/domain/services/ops_report_service.py +192 -0
  51. evalvault/ports/inbound/web_port.py +3 -1
  52. evalvault/ports/outbound/storage_port.py +2 -0
  53. evalvault-1.75.0.dist-info/METADATA +221 -0
  54. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/RECORD +57 -48
  55. evalvault-1.73.2.dist-info/METADATA +0 -585
  56. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/WHEEL +0 -0
  57. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/entry_points.txt +0 -0
  58. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter
7
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
8
+ from evalvault.config.settings import Settings
9
+ from evalvault.ports.outbound.storage_port import StoragePort
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def build_storage_adapter(
    *,
    settings: Settings | None = None,
    db_path: Path | None = None,
    fallback_to_sqlite: bool = True,
) -> StoragePort:
    """Select and construct the storage adapter for the configured backend.

    An explicit ``db_path`` always wins and forces SQLite. Otherwise the
    backend named in ``settings.db_backend`` decides; a PostgreSQL
    construction failure optionally falls back to SQLite.

    Args:
        settings: Application settings; a fresh ``Settings()`` is built when omitted.
        db_path: Explicit SQLite database file; bypasses backend selection.
        fallback_to_sqlite: When True, PostgreSQL errors degrade to SQLite
            instead of propagating.

    Raises:
        RuntimeError: SQLite backend selected but no database path configured.
    """
    cfg = settings if settings is not None else Settings()

    # Explicit path override: caller wants a file-backed SQLite store.
    if db_path is not None:
        return SQLiteStorageAdapter(db_path=db_path)

    if getattr(cfg, "db_backend", "postgres") == "sqlite":
        sqlite_path = cfg.evalvault_db_path
        if sqlite_path is None:
            raise RuntimeError("SQLite backend selected but evalvault_db_path is not set.")
        return SQLiteStorageAdapter(db_path=sqlite_path)

    # PostgreSQL: prefer the explicit connection string, otherwise assemble
    # one from the individual settings (libpq keyword/value format).
    dsn = cfg.postgres_connection_string
    if not dsn:
        host = cfg.postgres_host or "localhost"
        port = cfg.postgres_port
        database = cfg.postgres_database
        user = cfg.postgres_user or "postgres"
        password = cfg.postgres_password or ""
        dsn = f"host={host} port={port} dbname={database} user={user} password={password}"

    try:
        return PostgreSQLStorageAdapter(connection_string=dsn)
    except Exception as exc:
        if not fallback_to_sqlite:
            raise
        logger.warning("PostgreSQL adapter failed (%s). Falling back to SQLite.", exc)
        sqlite_path = cfg.evalvault_db_path
        if sqlite_path is None:
            raise  # re-raise the original PostgreSQL error
        return SQLiteStorageAdapter(db_path=sqlite_path)


__all__ = ["build_storage_adapter"]
@@ -1128,6 +1128,96 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
1128
1128
  )
1129
1129
  return reports
1130
1130
 
1131
+ def save_ops_report(
1132
+ self,
1133
+ *,
1134
+ report_id: str | None,
1135
+ run_id: str | None,
1136
+ report_type: str,
1137
+ format: str,
1138
+ content: str | None,
1139
+ metadata: dict[str, Any] | None = None,
1140
+ created_at: str | None = None,
1141
+ ) -> str:
1142
+ report_id = report_id or str(uuid.uuid4())
1143
+ if created_at is None:
1144
+ created_at_value = datetime.now(UTC)
1145
+ else:
1146
+ created_at_value = (
1147
+ datetime.fromisoformat(created_at) if isinstance(created_at, str) else created_at
1148
+ )
1149
+
1150
+ with self._get_connection() as conn:
1151
+ conn.execute(
1152
+ """
1153
+ INSERT INTO ops_reports (
1154
+ report_id, run_id, report_type, format, content, metadata, created_at
1155
+ ) VALUES (%s, %s, %s, %s, %s, %s, %s)
1156
+ ON CONFLICT (report_id) DO UPDATE SET
1157
+ run_id = EXCLUDED.run_id,
1158
+ report_type = EXCLUDED.report_type,
1159
+ format = EXCLUDED.format,
1160
+ content = EXCLUDED.content,
1161
+ metadata = EXCLUDED.metadata,
1162
+ created_at = EXCLUDED.created_at
1163
+ """,
1164
+ (
1165
+ report_id,
1166
+ run_id,
1167
+ report_type,
1168
+ format,
1169
+ content,
1170
+ self._serialize_pipeline_json(metadata),
1171
+ created_at_value,
1172
+ ),
1173
+ )
1174
+ conn.commit()
1175
+
1176
+ return report_id
1177
+
1178
+ def list_ops_reports(
1179
+ self,
1180
+ *,
1181
+ run_id: str,
1182
+ report_type: str | None = None,
1183
+ format: str | None = None,
1184
+ limit: int = 20,
1185
+ ) -> list[dict[str, Any]]:
1186
+ clauses = ["run_id = %s"]
1187
+ params: list[Any] = [run_id]
1188
+ if report_type:
1189
+ clauses.append("report_type = %s")
1190
+ params.append(report_type)
1191
+ if format:
1192
+ clauses.append("format = %s")
1193
+ params.append(format)
1194
+ params.append(limit)
1195
+
1196
+ query = (
1197
+ "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
1198
+ "FROM ops_reports WHERE " + " AND ".join(clauses) + " ORDER BY created_at DESC LIMIT %s"
1199
+ )
1200
+
1201
+ with self._get_connection() as conn:
1202
+ rows = conn.execute(query, tuple(params)).fetchall()
1203
+
1204
+ reports: list[dict[str, Any]] = []
1205
+ for row in rows:
1206
+ reports.append(
1207
+ {
1208
+ "report_id": row["report_id"],
1209
+ "run_id": row["run_id"],
1210
+ "report_type": row["report_type"],
1211
+ "format": row["format"],
1212
+ "content": row["content"],
1213
+ "metadata": self._deserialize_json(row["metadata"]),
1214
+ "created_at": row["created_at"].isoformat()
1215
+ if isinstance(row["created_at"], datetime)
1216
+ else row["created_at"],
1217
+ }
1218
+ )
1219
+ return reports
1220
+
1131
1221
  def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
1132
1222
  """파이프라인 분석 결과 목록을 조회합니다."""
1133
1223
  query = """
@@ -1,6 +1,8 @@
1
1
  -- EvalVault PostgreSQL Database Schema
2
2
  -- Stores evaluation runs, test case results, and metric scores
3
3
 
4
+ CREATE EXTENSION IF NOT EXISTS vector;
5
+
4
6
  -- Main evaluation runs table
5
7
  CREATE TABLE IF NOT EXISTS evaluation_runs (
6
8
  run_id UUID PRIMARY KEY,
@@ -241,6 +243,19 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
241
243
  CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
242
244
  CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);
243
245
 
246
-- Ops reports table: stores rendered operational reports for a run.
CREATE TABLE IF NOT EXISTS ops_reports (
    report_id UUID PRIMARY KEY,
    -- Deleting a run keeps its reports but detaches them (run_id -> NULL).
    run_id UUID REFERENCES evaluation_runs(run_id) ON DELETE SET NULL,
    report_type VARCHAR(50) NOT NULL,  -- e.g. 'ops_report', 'ops_snapshot'
    format VARCHAR(20) NOT NULL,       -- e.g. 'markdown', 'json'
    content TEXT,                      -- report body (markdown/json) or a file path
    metadata JSONB,                    -- arbitrary JSON metadata
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Lookups are by owning run (see list_ops_reports).
CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
258
+
244
259
  -- Analysis pipeline results table
245
260
  CREATE TABLE IF NOT EXISTS pipeline_results (
246
261
  result_id UUID PRIMARY KEY,
@@ -271,6 +271,20 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
271
271
  CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
272
272
  CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);
273
273
 
274
+ -- Ops reports table
275
+ CREATE TABLE IF NOT EXISTS ops_reports (
276
+ report_id TEXT PRIMARY KEY,
277
+ run_id TEXT,
278
+ report_type TEXT NOT NULL, -- 'ops_report', 'ops_snapshot'
279
+ format TEXT NOT NULL, -- 'markdown', 'json'
280
+ content TEXT, -- Report content (markdown/json) or file path
281
+ metadata TEXT, -- JSON metadata
282
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
283
+ FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE SET NULL
284
+ );
285
+
286
+ CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
287
+
274
288
  -- Analysis pipeline results table
275
289
  CREATE TABLE IF NOT EXISTS pipeline_results (
276
290
  result_id TEXT PRIMARY KEY,
@@ -1211,6 +1211,83 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
1211
1211
  )
1212
1212
  return reports
1213
1213
 
1214
+ def save_ops_report(
1215
+ self,
1216
+ *,
1217
+ report_id: str | None,
1218
+ run_id: str | None,
1219
+ report_type: str,
1220
+ format: str,
1221
+ content: str | None,
1222
+ metadata: dict[str, Any] | None = None,
1223
+ created_at: str | None = None,
1224
+ ) -> str:
1225
+ report_id = report_id or str(uuid.uuid4())
1226
+ created_at = created_at or datetime.now().isoformat()
1227
+
1228
+ with self._get_connection() as conn:
1229
+ conn = cast(Any, conn)
1230
+ conn.execute(
1231
+ """
1232
+ INSERT OR REPLACE INTO ops_reports (
1233
+ report_id, run_id, report_type, format, content, metadata, created_at
1234
+ ) VALUES (?, ?, ?, ?, ?, ?, ?)
1235
+ """,
1236
+ (
1237
+ report_id,
1238
+ run_id,
1239
+ report_type,
1240
+ format,
1241
+ content,
1242
+ self._serialize_json(metadata),
1243
+ created_at,
1244
+ ),
1245
+ )
1246
+ conn.commit()
1247
+
1248
+ return report_id
1249
+
1250
+ def list_ops_reports(
1251
+ self,
1252
+ *,
1253
+ run_id: str,
1254
+ report_type: str | None = None,
1255
+ format: str | None = None,
1256
+ limit: int = 20,
1257
+ ) -> list[dict[str, Any]]:
1258
+ query = (
1259
+ "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
1260
+ "FROM ops_reports WHERE run_id = ?"
1261
+ )
1262
+ params: list[Any] = [run_id]
1263
+ if report_type:
1264
+ query += " AND report_type = ?"
1265
+ params.append(report_type)
1266
+ if format:
1267
+ query += " AND format = ?"
1268
+ params.append(format)
1269
+ query += " ORDER BY created_at DESC LIMIT ?"
1270
+ params.append(limit)
1271
+
1272
+ with self._get_connection() as conn:
1273
+ conn = cast(Any, conn)
1274
+ rows = conn.execute(query, tuple(params)).fetchall()
1275
+
1276
+ reports: list[dict[str, Any]] = []
1277
+ for row in rows:
1278
+ reports.append(
1279
+ {
1280
+ "report_id": row["report_id"],
1281
+ "run_id": row["run_id"],
1282
+ "report_type": row["report_type"],
1283
+ "format": row["format"],
1284
+ "content": row["content"],
1285
+ "metadata": self._deserialize_json(row["metadata"]),
1286
+ "created_at": row["created_at"],
1287
+ }
1288
+ )
1289
+ return reports
1290
+
1214
1291
  def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
1215
1292
  """파이프라인 분석 결과 목록을 조회합니다."""
1216
1293
  query = """
@@ -3,7 +3,7 @@
3
3
  from pathlib import Path
4
4
  from typing import Any
5
5
 
6
- from pydantic import Field, PrivateAttr
6
+ from pydantic import AliasChoices, Field, PrivateAttr
7
7
  from pydantic_settings import BaseSettings, SettingsConfigDict
8
8
 
9
9
  from evalvault.config.secret_manager import (
@@ -179,6 +179,10 @@ class Settings(BaseSettings):
179
179
  default="data/db/evalvault.db",
180
180
  description="SQLite database path for API/CLI storage.",
181
181
  )
182
+ db_backend: str = Field(
183
+ default="postgres",
184
+ description="Storage backend: 'postgres' or 'sqlite'.",
185
+ )
182
186
  evalvault_memory_db_path: str = Field(
183
187
  default="data/db/evalvault_memory.db",
184
188
  description="SQLite database path for Domain Memory storage.",
@@ -242,7 +246,7 @@ class Settings(BaseSettings):
242
246
  description="Ollama server URL",
243
247
  )
244
248
  ollama_model: str = Field(
245
- default="gpt-oss-safeguard:20b",
249
+ default="qwen3:14b",
246
250
  description="Ollama model name for evaluation",
247
251
  )
248
252
  ollama_embedding_model: str = Field(
@@ -395,11 +399,31 @@ class Settings(BaseSettings):
395
399
  )
396
400
 
397
401
  # PostgreSQL Configuration (optional)
398
- postgres_host: str | None = Field(default=None, description="PostgreSQL server host")
399
- postgres_port: int = Field(default=5432, description="PostgreSQL server port")
400
- postgres_database: str = Field(default="evalvault", description="PostgreSQL database name")
401
- postgres_user: str | None = Field(default=None, description="PostgreSQL user")
402
- postgres_password: str | None = Field(default=None, description="PostgreSQL password")
402
+ postgres_host: str | None = Field(
403
+ default=None,
404
+ validation_alias=AliasChoices("POSTGRES_HOST", "EVALVAULT_DB_HOST"),
405
+ description="PostgreSQL server host",
406
+ )
407
+ postgres_port: int = Field(
408
+ default=5432,
409
+ validation_alias=AliasChoices("POSTGRES_PORT", "EVALVAULT_DB_PORT"),
410
+ description="PostgreSQL server port",
411
+ )
412
+ postgres_database: str = Field(
413
+ default="evalvault",
414
+ validation_alias=AliasChoices("POSTGRES_DATABASE", "EVALVAULT_DB_NAME"),
415
+ description="PostgreSQL database name",
416
+ )
417
+ postgres_user: str | None = Field(
418
+ default=None,
419
+ validation_alias=AliasChoices("POSTGRES_USER", "EVALVAULT_DB_USER"),
420
+ description="PostgreSQL user",
421
+ )
422
+ postgres_password: str | None = Field(
423
+ default=None,
424
+ validation_alias=AliasChoices("POSTGRES_PASSWORD", "EVALVAULT_DB_PASSWORD"),
425
+ description="PostgreSQL password",
426
+ )
403
427
  postgres_connection_string: str | None = Field(
404
428
  default=None, description="PostgreSQL connection string (overrides other postgres settings)"
405
429
  )
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+ from evalvault.domain.entities.stage import StageMetric, StageSummary
7
+
8
+
9
+ @dataclass
10
+ class OpsReport:
11
+ run_summary: dict[str, Any]
12
+ ops_kpis: dict[str, Any]
13
+ stage_summary: StageSummary | None
14
+ stage_metrics: list[StageMetric]
15
+ bottlenecks: list[dict[str, Any]]
16
+ recommendations: list[str]
17
+ metadata: dict[str, Any] = field(default_factory=dict)
18
+
19
+ def to_dict(self) -> dict[str, Any]:
20
+ return {
21
+ "run_summary": self.run_summary,
22
+ "ops_kpis": self.ops_kpis,
23
+ "stage_summary": _stage_summary_to_dict(self.stage_summary),
24
+ "stage_metrics": [metric.to_dict() for metric in self.stage_metrics],
25
+ "bottlenecks": self.bottlenecks,
26
+ "recommendations": self.recommendations,
27
+ "metadata": self.metadata,
28
+ }
29
+
30
+
31
+ def _stage_summary_to_dict(summary: StageSummary | None) -> dict[str, Any] | None:
32
+ if summary is None:
33
+ return None
34
+ return {
35
+ "run_id": summary.run_id,
36
+ "total_events": summary.total_events,
37
+ "stage_type_counts": summary.stage_type_counts,
38
+ "stage_type_avg_durations": summary.stage_type_avg_durations,
39
+ "missing_required_stage_types": summary.missing_required_stage_types,
40
+ }
@@ -20,7 +20,8 @@ class DomainLearningHook:
20
20
  Formation dynamics를 구현합니다.
21
21
 
22
22
  사용 예시:
23
- memory_adapter = SQLiteDomainMemoryAdapter()
23
+ from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
24
+ memory_adapter = build_domain_memory_adapter()
24
25
  hook = DomainLearningHook(memory_adapter)
25
26
 
26
27
  # 평가 후 메모리 형성
@@ -0,0 +1,192 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from evalvault.config.langfuse_support import get_langfuse_trace_url
6
+ from evalvault.config.phoenix_support import get_phoenix_trace_url
7
+ from evalvault.domain.entities.ops_report import OpsReport
8
+ from evalvault.domain.entities.stage import StageEvent, StageMetric, StageSummary
9
+ from evalvault.domain.services.stage_metric_guide_service import StageMetricGuideService
10
+ from evalvault.domain.services.stage_metric_service import StageMetricService
11
+ from evalvault.domain.services.stage_summary_service import StageSummaryService
12
+ from evalvault.ports.outbound.stage_storage_port import StageStoragePort
13
+ from evalvault.ports.outbound.storage_port import StoragePort
14
+
15
+
16
class OpsReportService:
    """Build an operational report for an evaluation run.

    Combines the persisted run summary, stage events/metrics, derived
    bottlenecks, and guide-based recommendations into a single OpsReport.
    """

    def __init__(
        self,
        *,
        metric_service: StageMetricService | None = None,
        summary_service: StageSummaryService | None = None,
        guide_service: StageMetricGuideService | None = None,
    ) -> None:
        # Collaborators are injectable for testing; defaults are the
        # standard domain services.
        self._metric_service = metric_service or StageMetricService()
        self._summary_service = summary_service or StageSummaryService()
        self._guide_service = guide_service or StageMetricGuideService()

    def build_report(
        self,
        run_id: str,
        *,
        storage: StoragePort,
        stage_storage: StageStoragePort,
    ) -> OpsReport:
        """Assemble an OpsReport for *run_id* from run and stage storage.

        Args:
            run_id: Identifier of the evaluation run to report on.
            storage: Run-level storage (must resolve ``run_id`` via get_run).
            stage_storage: Source of stage events and metrics for the run.
        """
        run = storage.get_run(run_id)
        run_summary = run.to_summary_dict()
        # Trace URLs are resolved from tracker metadata; either may be None.
        phoenix_trace_url = get_phoenix_trace_url(run.tracker_metadata)
        langfuse_trace_url = get_langfuse_trace_url(run.tracker_metadata)

        events = stage_storage.list_stage_events(run_id)
        stage_summary = self._summarize_events(events)

        stage_metrics = stage_storage.list_stage_metrics(run_id)
        # If no metrics were persisted, derive them from the raw events.
        if not stage_metrics and events:
            stage_metrics = self._metric_service.build_metrics(events)

        bottlenecks = self._build_bottlenecks(stage_summary)
        recommendations = self._build_recommendations(stage_metrics)

        ops_kpis = self._build_ops_kpis(run, events)

        metadata = {
            "phoenix_trace_url": phoenix_trace_url,
            "langfuse_trace_url": langfuse_trace_url,
        }

        return OpsReport(
            run_summary=run_summary,
            ops_kpis=ops_kpis,
            stage_summary=stage_summary,
            stage_metrics=stage_metrics,
            bottlenecks=bottlenecks,
            recommendations=recommendations,
            metadata=metadata,
        )

    def _summarize_events(self, events: list[StageEvent]) -> StageSummary | None:
        """Summarize stage events, or None when the run produced none."""
        if not events:
            return None
        return self._summary_service.summarize(events)

    def _build_bottlenecks(self, summary: StageSummary | None) -> list[dict[str, Any]]:
        """List missing required stages and the slowest stage types."""
        if summary is None:
            return []
        bottlenecks: list[dict[str, Any]] = []

        # Every required stage type that never appeared is a bottleneck.
        for stage_type in summary.missing_required_stage_types:
            bottlenecks.append(
                {
                    "type": "missing_stage",
                    "stage_type": stage_type,
                    "detail": "required stage missing",
                }
            )

        durations = summary.stage_type_avg_durations
        if durations:
            # Top 3 slowest stage types by average duration (ms).
            top = sorted(durations.items(), key=lambda item: item[1], reverse=True)[:3]
            for stage_type, duration in top:
                bottlenecks.append(
                    {
                        "type": "latency",
                        "stage_type": stage_type,
                        "avg_duration_ms": round(duration, 3),
                    }
                )
        return bottlenecks

    def _build_recommendations(self, metrics: list[StageMetric]) -> list[str]:
        """Render one '[priority] component: action' line per metric guide."""
        if not metrics:
            return []
        guides = self._guide_service.build_guides(metrics)
        recommendations: list[str] = []
        for guide in guides:
            top_action = guide.top_action
            if top_action is None:
                # Guide has nothing actionable; skip it.
                continue
            hint = top_action.implementation_hint or top_action.description
            label = f"[{guide.priority.value}] {guide.component.value}"
            if hint:
                recommendations.append(f"{label}: {top_action.title} - {hint}")
            else:
                recommendations.append(f"{label}: {top_action.title}")
        return recommendations

    def _build_ops_kpis(self, run, events: list[StageEvent]) -> dict[str, Any]:
        """Compute run-level KPIs (latency, tokens, cost, pass/error rates).

        ``run`` is the object returned by ``storage.get_run``; only the
        attributes read below are relied upon.
        """
        total_cases = run.total_test_cases
        # Truthiness filters drop None AND zero values from the averages —
        # presumably intentional (0 means "not recorded"); verify if 0 is valid.
        latencies = [r.latency_ms for r in run.results if r.latency_ms]
        tokens_used = [r.tokens_used for r in run.results if r.tokens_used]
        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]

        avg_latency = _average(latencies)
        p95_latency = _percentile(latencies, 0.95)
        avg_tokens = _average(tokens_used)
        avg_cost = _average(costs)
        pass_rate = run.pass_rate
        # Clamp guards against pass_rate values marginally above 1.0.
        failure_rate = None if pass_rate is None else max(0.0, 1.0 - pass_rate)

        error_rate = _stage_error_rate(events)
        error_severity = _stage_error_severity(error_rate)

        return {
            "total_test_cases": total_cases,
            "pass_rate": pass_rate,
            "failure_rate": failure_rate,
            "stage_error_rate": error_rate,
            "stage_error_severity": error_severity,
            "duration_seconds": run.duration_seconds,
            "total_tokens": run.total_tokens,
            "total_cost_usd": run.total_cost_usd,
            "avg_latency_ms": avg_latency,
            "p95_latency_ms": p95_latency,
            "avg_tokens_per_case": avg_tokens,
            "avg_cost_per_case_usd": avg_cost,
        }
148
+
149
+
150
+ def _average(values: list[float | int]) -> float | None:
151
+ if not values:
152
+ return None
153
+ return float(sum(values)) / len(values)
154
+
155
+
156
+ def _percentile(values: list[float | int], ratio: float) -> float | None:
157
+ if not values:
158
+ return None
159
+ if ratio <= 0:
160
+ return float(min(values))
161
+ if ratio >= 1:
162
+ return float(max(values))
163
+ sorted_values = sorted(values)
164
+ index = int(round((len(sorted_values) - 1) * ratio))
165
+ return float(sorted_values[index])
166
+
167
+
168
+ def _stage_error_rate(events: list[StageEvent]) -> float | None:
169
+ if not events:
170
+ return None
171
+ total = len(events)
172
+ failure_statuses = {"failed", "error", "timeout", "aborted"}
173
+ success_statuses = {"success", "ok", "completed", "pass"}
174
+ failures = 0
175
+ for event in events:
176
+ status = str(event.status or "").strip().lower()
177
+ if status in failure_statuses:
178
+ failures += 1
179
+ continue
180
+ if status and status not in success_statuses:
181
+ failures += 1
182
+ return failures / total
183
+
184
+
185
+ def _stage_error_severity(rate: float | None) -> str | None:
186
+ if rate is None:
187
+ return None
188
+ if rate >= 0.05:
189
+ return "critical"
190
+ if rate >= 0.02:
191
+ return "warning"
192
+ return "ok"
@@ -18,7 +18,7 @@ class EvalRequest:
18
18
 
19
19
  dataset_path: str
20
20
  metrics: list[str]
21
- model_name: str = "ollama/gpt-oss-safeguard:20b"
21
+ model_name: str = "ollama/qwen3:14b"
22
22
  evaluation_task: str = "qa"
23
23
  langfuse_enabled: bool = False
24
24
  thresholds: dict[str, float] = field(default_factory=dict)
@@ -121,12 +121,14 @@ class WebUIPort(Protocol):
121
121
  def list_runs(
122
122
  self,
123
123
  limit: int = 50,
124
+ offset: int = 0,
124
125
  filters: RunFilters | None = None,
125
126
  ) -> list[RunSummary]:
126
127
  """평가 목록 조회.
127
128
 
128
129
  Args:
129
130
  limit: 최대 조회 개수
131
+ offset: 조회 시작 위치
130
132
  filters: 필터 조건
131
133
 
132
134
  Returns:
@@ -83,6 +83,7 @@ class StoragePort(Protocol):
83
83
  def list_runs(
84
84
  self,
85
85
  limit: int = 100,
86
+ offset: int = 0,
86
87
  dataset_name: str | None = None,
87
88
  model_name: str | None = None,
88
89
  ) -> list[EvaluationRun]:
@@ -90,6 +91,7 @@ class StoragePort(Protocol):
90
91
 
91
92
  Args:
92
93
  limit: 최대 조회 개수
94
+ offset: 조회 시작 위치 (선택)
93
95
  dataset_name: 필터링할 데이터셋 이름 (선택)
94
96
  model_name: 필터링할 모델 이름 (선택)
95
97