evalvault 1.73.2__py3-none-any.whl → 1.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +66 -17
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +604 -37
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +1 -2
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +42 -31
- evalvault/adapters/inbound/cli/commands/run_helpers.py +24 -15
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/ops/__init__.py +5 -0
- evalvault/adapters/outbound/ops/report_renderer.py +159 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +90 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
- evalvault/adapters/outbound/storage/schema.sql +14 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +77 -0
- evalvault/config/settings.py +31 -7
- evalvault/domain/entities/ops_report.py +40 -0
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/ops_report_service.py +192 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.75.0.dist-info/METADATA +221 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/RECORD +57 -48
- evalvault-1.73.2.dist-info/METADATA +0 -585
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/WHEEL +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -21,6 +21,7 @@ from evalvault.adapters.outbound.analysis import (
|
|
|
21
21
|
)
|
|
22
22
|
from evalvault.adapters.outbound.cache import MemoryCacheAdapter
|
|
23
23
|
from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
|
|
24
|
+
from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
|
|
24
25
|
from evalvault.adapters.outbound.report import MarkdownReportAdapter
|
|
25
26
|
from evalvault.config.phoenix_support import PhoenixExperimentResolver
|
|
26
27
|
from evalvault.config.settings import Settings
|
|
@@ -43,6 +44,7 @@ from evalvault.domain.services.analysis_service import AnalysisService
|
|
|
43
44
|
from evalvault.domain.services.cluster_map_builder import build_cluster_map
|
|
44
45
|
from evalvault.domain.services.debug_report_service import DebugReportService
|
|
45
46
|
from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
|
|
47
|
+
from evalvault.domain.services.ops_report_service import OpsReportService
|
|
46
48
|
from evalvault.domain.services.prompt_registry import (
|
|
47
49
|
PromptInput,
|
|
48
50
|
build_prompt_bundle,
|
|
@@ -125,14 +127,15 @@ class WebUIAdapter:
|
|
|
125
127
|
llm_adapter: LLM 어댑터 (선택적)
|
|
126
128
|
data_loader: 데이터 로더 (선택적)
|
|
127
129
|
"""
|
|
128
|
-
resolved_settings = settings
|
|
130
|
+
resolved_settings = settings or Settings()
|
|
129
131
|
if storage is None:
|
|
130
|
-
|
|
131
|
-
db_path = getattr(resolved_settings, "evalvault_db_path", None)
|
|
132
|
-
if db_path:
|
|
133
|
-
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
|
|
132
|
+
from evalvault.adapters.outbound.storage.factory import build_storage_adapter
|
|
134
133
|
|
|
135
|
-
|
|
134
|
+
try:
|
|
135
|
+
storage = build_storage_adapter(settings=resolved_settings)
|
|
136
|
+
except Exception as exc:
|
|
137
|
+
logger.warning("Storage initialization failed: %s", exc)
|
|
138
|
+
storage = None
|
|
136
139
|
|
|
137
140
|
self._storage = storage
|
|
138
141
|
self._evaluator = evaluator
|
|
@@ -448,7 +451,12 @@ class WebUIAdapter:
|
|
|
448
451
|
memory_domain = memory_config.get("domain") or dataset.metadata.get("domain") or "default"
|
|
449
452
|
memory_language = memory_config.get("language") or "ko"
|
|
450
453
|
memory_augment = bool(memory_config.get("augment_context"))
|
|
451
|
-
|
|
454
|
+
if memory_config.get("db_path"):
|
|
455
|
+
memory_db_path = memory_config.get("db_path")
|
|
456
|
+
elif settings.db_backend == "sqlite":
|
|
457
|
+
memory_db_path = settings.evalvault_memory_db_path
|
|
458
|
+
else:
|
|
459
|
+
memory_db_path = None
|
|
452
460
|
memory_evaluator = None
|
|
453
461
|
requested_thresholds = request.thresholds or {}
|
|
454
462
|
if request.threshold_profile or requested_thresholds:
|
|
@@ -470,16 +478,17 @@ class WebUIAdapter:
|
|
|
470
478
|
memory_active = False
|
|
471
479
|
if memory_enabled:
|
|
472
480
|
try:
|
|
473
|
-
from evalvault.adapters.outbound.domain_memory import (
|
|
474
|
-
SQLiteDomainMemoryAdapter,
|
|
475
|
-
)
|
|
481
|
+
from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
|
|
476
482
|
from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import (
|
|
477
483
|
PhoenixTracerAdapter,
|
|
478
484
|
)
|
|
479
485
|
from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
|
|
480
486
|
|
|
481
487
|
tracer = PhoenixTracerAdapter() if tracker_provider == "phoenix" else None
|
|
482
|
-
memory_adapter =
|
|
488
|
+
memory_adapter = build_domain_memory_adapter(
|
|
489
|
+
settings=self._settings,
|
|
490
|
+
db_path=Path(memory_db_path) if memory_db_path else None,
|
|
491
|
+
)
|
|
483
492
|
memory_evaluator = MemoryAwareEvaluator(
|
|
484
493
|
evaluator=self._evaluator,
|
|
485
494
|
memory_port=memory_adapter,
|
|
@@ -812,6 +821,7 @@ class WebUIAdapter:
|
|
|
812
821
|
def list_runs(
|
|
813
822
|
self,
|
|
814
823
|
limit: int = 50,
|
|
824
|
+
offset: int = 0,
|
|
815
825
|
filters: RunFilters | None = None,
|
|
816
826
|
) -> list[RunSummary]:
|
|
817
827
|
"""평가 목록 조회.
|
|
@@ -831,7 +841,7 @@ class WebUIAdapter:
|
|
|
831
841
|
|
|
832
842
|
try:
|
|
833
843
|
# 저장소에서 평가 목록 조회
|
|
834
|
-
runs = self._storage.list_runs(limit=limit)
|
|
844
|
+
runs = self._storage.list_runs(limit=limit, offset=offset)
|
|
835
845
|
|
|
836
846
|
# RunSummary로 변환
|
|
837
847
|
summaries = []
|
|
@@ -1027,7 +1037,11 @@ class WebUIAdapter:
|
|
|
1027
1037
|
run = self.get_run_details(run_id)
|
|
1028
1038
|
feedbacks = storage.list_feedback(run_id)
|
|
1029
1039
|
if labels_source in {"feedback", "hybrid"} and not feedbacks:
|
|
1030
|
-
raise ValueError(
|
|
1040
|
+
raise ValueError(
|
|
1041
|
+
f"No feedback labels found for run '{run_id}'. "
|
|
1042
|
+
f"Calibration with labels_source='{labels_source}' requires at least one feedback label. "
|
|
1043
|
+
"Please add feedback labels via the UI or API, or use labels_source='gold' if gold labels are available."
|
|
1044
|
+
)
|
|
1031
1045
|
resolved_metrics = metrics or list(run.metrics_evaluated)
|
|
1032
1046
|
if not resolved_metrics:
|
|
1033
1047
|
raise ValueError("No metrics available for calibration")
|
|
@@ -1329,6 +1343,42 @@ class WebUIAdapter:
|
|
|
1329
1343
|
stage_storage=stage_storage,
|
|
1330
1344
|
)
|
|
1331
1345
|
|
|
1346
|
+
def generate_ops_report(
|
|
1347
|
+
self,
|
|
1348
|
+
run_id: str,
|
|
1349
|
+
*,
|
|
1350
|
+
output_format: str,
|
|
1351
|
+
save: bool,
|
|
1352
|
+
) -> dict[str, Any] | str:
|
|
1353
|
+
if self._storage is None:
|
|
1354
|
+
raise RuntimeError("Storage not configured")
|
|
1355
|
+
if not hasattr(self._storage, "list_stage_events"):
|
|
1356
|
+
raise RuntimeError("Stage storage not configured")
|
|
1357
|
+
|
|
1358
|
+
service = OpsReportService()
|
|
1359
|
+
stage_storage = cast(StageStoragePort, self._storage)
|
|
1360
|
+
report = service.build_report(
|
|
1361
|
+
run_id,
|
|
1362
|
+
storage=self._storage,
|
|
1363
|
+
stage_storage=stage_storage,
|
|
1364
|
+
)
|
|
1365
|
+
|
|
1366
|
+
content = render_markdown(report) if output_format == "markdown" else render_json(report)
|
|
1367
|
+
|
|
1368
|
+
if save:
|
|
1369
|
+
self._storage.save_ops_report(
|
|
1370
|
+
report_id=None,
|
|
1371
|
+
run_id=run_id,
|
|
1372
|
+
report_type="ops_report",
|
|
1373
|
+
format=output_format,
|
|
1374
|
+
content=content,
|
|
1375
|
+
metadata={"source": "api"},
|
|
1376
|
+
)
|
|
1377
|
+
|
|
1378
|
+
if output_format == "markdown":
|
|
1379
|
+
return content
|
|
1380
|
+
return report.to_dict()
|
|
1381
|
+
|
|
1332
1382
|
def delete_run(self, run_id: str) -> bool:
|
|
1333
1383
|
"""평가 삭제.
|
|
1334
1384
|
|
|
@@ -2160,16 +2210,15 @@ def create_adapter() -> WebUIAdapter:
|
|
|
2160
2210
|
"""
|
|
2161
2211
|
from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
|
|
2162
2212
|
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
|
|
2163
|
-
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
|
|
2213
|
+
from evalvault.adapters.outbound.storage.factory import build_storage_adapter
|
|
2164
2214
|
from evalvault.config.settings import get_settings
|
|
2165
2215
|
from evalvault.domain.services.evaluator import RagasEvaluator
|
|
2166
2216
|
|
|
2167
2217
|
# 설정 로드
|
|
2168
2218
|
settings = get_settings()
|
|
2169
2219
|
|
|
2170
|
-
# Storage 생성
|
|
2171
|
-
|
|
2172
|
-
storage = SQLiteStorageAdapter(db_path=db_path)
|
|
2220
|
+
# Storage 생성
|
|
2221
|
+
storage = build_storage_adapter(settings=settings)
|
|
2173
2222
|
|
|
2174
2223
|
# LLM adapter 생성 (API 키 없으면 None)
|
|
2175
2224
|
llm_adapter = None
|
|
@@ -113,6 +113,15 @@ def run_judge_calibration(
|
|
|
113
113
|
return JudgeCalibrationResponse.model_validate(payload)
|
|
114
114
|
|
|
115
115
|
|
|
116
|
+
@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
|
|
117
|
+
def list_calibrations(
|
|
118
|
+
adapter: AdapterDep,
|
|
119
|
+
limit: int = Query(20, ge=1, le=200),
|
|
120
|
+
) -> list[JudgeCalibrationHistoryItem]:
|
|
121
|
+
entries = adapter.list_judge_calibrations(limit=limit)
|
|
122
|
+
return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
|
|
123
|
+
|
|
124
|
+
|
|
116
125
|
@router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
|
|
117
126
|
def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
|
|
118
127
|
try:
|
|
@@ -122,12 +131,3 @@ def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCal
|
|
|
122
131
|
except RuntimeError as exc:
|
|
123
132
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
124
133
|
return JudgeCalibrationResponse.model_validate(payload)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
|
|
128
|
-
def list_calibrations(
|
|
129
|
-
adapter: AdapterDep,
|
|
130
|
-
limit: int = Query(20, ge=1, le=200),
|
|
131
|
-
) -> list[JudgeCalibrationHistoryItem]:
|
|
132
|
-
entries = adapter.list_judge_calibrations(limit=limit)
|
|
133
|
-
return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
|