evalvault 1.74.0__py3-none-any.whl → 1.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +28 -17
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +303 -17
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +1 -2
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +42 -31
- evalvault/adapters/inbound/cli/commands/run_helpers.py +24 -15
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
- evalvault/config/settings.py +31 -7
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.75.0.dist-info/METADATA +221 -0
- {evalvault-1.74.0.dist-info → evalvault-1.75.0.dist-info}/RECORD +50 -45
- evalvault-1.74.0.dist-info/METADATA +0 -585
- {evalvault-1.74.0.dist-info → evalvault-1.75.0.dist-info}/WHEEL +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.75.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.75.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py

@@ -127,14 +127,15 @@ class WebUIAdapter:
             llm_adapter: LLM adapter (optional)
             data_loader: Data loader (optional)
         """
-        resolved_settings = settings
+        resolved_settings = settings or Settings()
        if storage is None:
-
-            db_path = getattr(resolved_settings, "evalvault_db_path", None)
-            if db_path:
-                from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+            from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 
-
+            try:
+                storage = build_storage_adapter(settings=resolved_settings)
+            except Exception as exc:
+                logger.warning("Storage initialization failed: %s", exc)
+                storage = None
 
        self._storage = storage
        self._evaluator = evaluator
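Both this constructor and `create_adapter()` further down now delegate backend selection to the new `storage/factory.py` (+53 lines), whose body is not shown in this diff. A minimal sketch of what a settings-driven factory like this plausibly looks like, assuming the `db_backend`, `evalvault_db_path`, and `postgres_*` settings fields that appear elsewhere in this diff; the Postgres adapter module path is hypothetical, and only the `build_storage_adapter(settings=...)` entry point is confirmed:

```python
# Illustrative sketch only: build_storage_adapter(settings=...) is the entry
# point confirmed by this diff; the body and the Postgres module path are assumptions.
from pathlib import Path


def build_storage_adapter(*, settings):
    if getattr(settings, "db_backend", "sqlite") == "postgres":
        # settings.postgres_connection_string appears in the chat.py hunks below
        from evalvault.adapters.outbound.storage.postgres_adapter import (  # hypothetical path
            PostgresStorageAdapter,
        )

        return PostgresStorageAdapter(settings.postgres_connection_string)

    from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

    return SQLiteStorageAdapter(db_path=Path(settings.evalvault_db_path))
```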
@@ -450,7 +451,12 @@ class WebUIAdapter:
         memory_domain = memory_config.get("domain") or dataset.metadata.get("domain") or "default"
         memory_language = memory_config.get("language") or "ko"
         memory_augment = bool(memory_config.get("augment_context"))
-
+        if memory_config.get("db_path"):
+            memory_db_path = memory_config.get("db_path")
+        elif settings.db_backend == "sqlite":
+            memory_db_path = settings.evalvault_memory_db_path
+        else:
+            memory_db_path = None
         memory_evaluator = None
         requested_thresholds = request.thresholds or {}
         if request.threshold_profile or requested_thresholds:
@@ -472,16 +478,17 @@ class WebUIAdapter:
         memory_active = False
         if memory_enabled:
             try:
-                from evalvault.adapters.outbound.domain_memory import (
-                    SQLiteDomainMemoryAdapter,
-                )
+                from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
                 from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import (
                     PhoenixTracerAdapter,
                 )
                 from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
 
                 tracer = PhoenixTracerAdapter() if tracker_provider == "phoenix" else None
-                memory_adapter =
+                memory_adapter = build_domain_memory_adapter(
+                    settings=self._settings,
+                    db_path=Path(memory_db_path) if memory_db_path else None,
+                )
                 memory_evaluator = MemoryAwareEvaluator(
                     evaluator=self._evaluator,
                     memory_port=memory_adapter,
@@ -814,6 +821,7 @@ class WebUIAdapter:
     def list_runs(
         self,
         limit: int = 50,
+        offset: int = 0,
         filters: RunFilters | None = None,
     ) -> list[RunSummary]:
         """List evaluation runs.
@@ -833,7 +841,7 @@ class WebUIAdapter:
 
         try:
             # Fetch the run list from storage
-            runs = self._storage.list_runs(limit=limit)
+            runs = self._storage.list_runs(limit=limit, offset=offset)
 
             # Convert to RunSummary
             summaries = []
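The matching `offset` plumbing lands in `ports/outbound/storage_port.py` (+2) and `storage/base_sql.py` (+3 -2), whose hunks are not shown here. In a SQL-backed store the change is typically just threading the offset into the query; a hedged sketch (table and column names are assumptions, not EvalVault's actual schema):

```python
# Hedged sketch of what the base_sql.py change plausibly does; the real
# table/column names and row mapping are not shown in this diff.
def list_runs(self, limit: int = 50, offset: int = 0):
    cursor = self._conn.execute(
        "SELECT * FROM runs ORDER BY started_at DESC LIMIT ? OFFSET ?",
        (limit, offset),
    )
    return [self._row_to_run(row) for row in cursor.fetchall()]  # _row_to_run is hypothetical
```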
@@ -1029,7 +1037,11 @@ class WebUIAdapter:
         run = self.get_run_details(run_id)
         feedbacks = storage.list_feedback(run_id)
         if labels_source in {"feedback", "hybrid"} and not feedbacks:
-            raise ValueError(
+            raise ValueError(
+                f"No feedback labels found for run '{run_id}'. "
+                f"Calibration with labels_source='{labels_source}' requires at least one feedback label. "
+                "Please add feedback labels via the UI or API, or use labels_source='gold' if gold labels are available."
+            )
         resolved_metrics = metrics or list(run.metrics_evaluated)
         if not resolved_metrics:
             raise ValueError("No metrics available for calibration")
@@ -2198,16 +2210,15 @@ def create_adapter() -> WebUIAdapter:
     """
     from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
     from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
-    from evalvault.adapters.outbound.storage.
+    from evalvault.adapters.outbound.storage.factory import build_storage_adapter
     from evalvault.config.settings import get_settings
     from evalvault.domain.services.evaluator import RagasEvaluator
 
     # Load settings
     settings = get_settings()
 
-    # Create storage
-
-    storage = SQLiteStorageAdapter(db_path=db_path)
+    # Create storage
+    storage = build_storage_adapter(settings=settings)
 
     # Create LLM adapter (None if no API key)
     llm_adapter = None
evalvault/adapters/inbound/api/routers/calibration.py

@@ -113,6 +113,15 @@ def run_judge_calibration(
     return JudgeCalibrationResponse.model_validate(payload)
 
 
+@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
+def list_calibrations(
+    adapter: AdapterDep,
+    limit: int = Query(20, ge=1, le=200),
+) -> list[JudgeCalibrationHistoryItem]:
+    entries = adapter.list_judge_calibrations(limit=limit)
+    return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
+
+
 @router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
 def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
     try:
@@ -122,12 +131,3 @@ def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCal
     except RuntimeError as exc:
         raise HTTPException(status_code=500, detail=str(exc)) from exc
     return JudgeCalibrationResponse.model_validate(payload)
-
-
-@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
-def list_calibrations(
-    adapter: AdapterDep,
-    limit: int = Query(20, ge=1, le=200),
-) -> list[JudgeCalibrationHistoryItem]:
-    entries = adapter.list_judge_calibrations(limit=limit)
-    return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
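These two calibration hunks are a pure move: `/judge/history` is re-registered *before* `/judge/{calibration_id}`. This matters because FastAPI (via Starlette) matches routes in declaration order, so with the old ordering `GET /judge/history` was swallowed by the path-parameter route with `calibration_id="history"`. A minimal standalone reproduction of the pitfall:

```python
# Minimal repro of the route-ordering pitfall the calibration.py move fixes.
from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()

@app.get("/judge/{calibration_id}")  # registered first: wins the match
def get_one(calibration_id: str) -> dict:
    return {"matched": "param", "calibration_id": calibration_id}

@app.get("/judge/history")  # registered too late to ever match
def history() -> dict:
    return {"matched": "history"}

client = TestClient(app)
print(client.get("/judge/history").json())
# {'matched': 'param', 'calibration_id': 'history'}
```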
evalvault/adapters/inbound/api/routers/chat.py

@@ -8,6 +8,7 @@ import os
 import re
 import time
 from collections.abc import AsyncGenerator
+from dataclasses import dataclass
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -34,6 +35,13 @@ _RAG_TEXTS: list[str] = []
 _RAG_INITIALIZED = False
 
 
+@dataclass(frozen=True)
+class _RagHit:
+    document: str
+    score: float
+    doc_id: int
+
+
 class ChatMessage(BaseModel):
     role: str
     content: str
@@ -315,14 +323,121 @@ async def _get_rag_retriever() -> tuple[Any | None, int]:
     if not _RAG_TEXTS:
         return None, 0
 
-    from evalvault.adapters.outbound.nlp.korean.
-
+    from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+
+    use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+    embedding_profile = os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev")
+    vector_store = os.getenv("EVALVAULT_RAG_VECTOR_STORE", "pgvector").lower()
+    pgvector_index = os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX", "hnsw").lower()
+    pgvector_index_lists = int(os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX_LISTS", "100"))
+    pgvector_hnsw_m = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_M", "16"))
+    pgvector_hnsw_ef = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_EF_CONSTRUCTION", "64"))
 
-
-
-
-
-
+    def _build_conn_string() -> str | None:
+        try:
+            from evalvault.config.settings import Settings
+
+            settings = Settings()
+            if settings.postgres_connection_string:
+                return settings.postgres_connection_string
+            if settings.postgres_host:
+                return "host={host} port={port} dbname={dbname} user={user} password={password}".format(
+                    host=settings.postgres_host,
+                    port=settings.postgres_port,
+                    dbname=settings.postgres_database,
+                    user=settings.postgres_user or "postgres",
+                    password=settings.postgres_password or "",
+                )
+        except Exception as exc:
+            logger.warning("Failed to build Postgres connection string: %s", exc)
+        return None
+
+    ollama_adapter = None
+    dense_retriever = None
+    embedding_func = None
+    if embedding_profile:
+        try:
+            from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+            from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
+            from evalvault.config.settings import Settings
+
+            settings = Settings()
+            ollama_adapter = OllamaAdapter(settings)
+            dense_retriever = KoreanDenseRetriever(
+                profile=embedding_profile,
+                ollama_adapter=ollama_adapter,
+            )
+            embedding_func = dense_retriever.get_embedding_func()
+        except Exception as exc:  # pragma: no cover - runtime dependency
+            logger.warning("Failed to initialize dense retriever: %s", exc)
+
+    if vector_store == "pgvector" and embedding_func is not None:
+        conn_string = _build_conn_string()
+        if conn_string:
+            try:
+                from evalvault.adapters.outbound.nlp.korean.bm25_retriever import (
+                    KoreanBM25Retriever,
+                )
+                from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer
+                from evalvault.adapters.outbound.retriever.pgvector_store import PgvectorStore
+
+                store = PgvectorStore(
+                    conn_string,
+                    index_type=pgvector_index,
+                    index_lists=pgvector_index_lists,
+                    hnsw_m=pgvector_hnsw_m,
+                    hnsw_ef_construction=pgvector_hnsw_ef,
+                )
+                embedding_dim = (
+                    dense_retriever.dimension if dense_retriever else len(embedding_func(["x"])[0])
+                )
+                store.ensure_schema(dimension=embedding_dim)
+                source_hash = _hash_text(content)
+                existing_hash, existing_count = store.get_source_state(source="user_guide")
+                if existing_hash != source_hash or existing_count != len(_RAG_TEXTS):
+                    embeddings = embedding_func(list(_RAG_TEXTS))
+                    store.replace_documents(
+                        source="user_guide",
+                        source_hash=source_hash,
+                        documents=list(_RAG_TEXTS),
+                        embeddings=embeddings,
+                    )
+
+                tokenizer = KiwiTokenizer()
+                bm25_retriever = KoreanBM25Retriever(tokenizer=tokenizer)
+                bm25_retriever.index(list(_RAG_TEXTS))
+                if tokens and len(tokens) == len(_RAG_TEXTS):
+                    bm25_retriever._tokenized_docs = tokens
+
+                if use_hybrid:
+                    retriever = _PgvectorHybridRetriever(
+                        bm25_retriever=bm25_retriever,
+                        store=store,
+                        embedding_func=embedding_func,
+                        documents=list(_RAG_TEXTS),
+                    )
+                else:
+                    retriever = _PgvectorDenseRetriever(
+                        store=store,
+                        embedding_func=embedding_func,
+                        documents=list(_RAG_TEXTS),
+                    )
+
+                _RAG_RETRIEVER = retriever
+                return retriever, _RAG_DOCS_COUNT
+            except Exception as exc:
+                logger.warning("pgvector retriever setup failed: %s", exc)
+
+    toolkit = KoreanNLPToolkit()
+    retriever = toolkit.build_retriever(
+        list(_RAG_TEXTS),
+        use_hybrid=use_hybrid,
+        ollama_adapter=ollama_adapter,
+        embedding_profile=embedding_profile,
+        verbose=True,
+    )
+    if retriever is None:
+        return None, 0
 
     _RAG_RETRIEVER = retriever
     return retriever, _RAG_DOCS_COUNT
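The retriever is now configured entirely through environment variables, and the pgvector path exposes the standard pgvector index knobs: `lists` (suggesting IVFFlat as the alternative to the default `hnsw`) and `m`/`ef_construction` for HNSW. `PgvectorStore.ensure_schema` itself is not shown in this diff (`pgvector_store.py` is +165 lines); under the usual pgvector DDL, the indexes it creates would look roughly like this (table and column names are assumptions):

```python
# Hedged sketch of the DDL a pgvector-backed store typically issues; the
# actual table/column names in pgvector_store.py are not shown in this diff.
DDL_HNSW = """
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS rag_documents (
    id        bigserial PRIMARY KEY,
    source    text NOT NULL,
    doc_id    integer NOT NULL,
    body      text NOT NULL,
    embedding vector(%(dimension)s)
);
-- EVALVAULT_RAG_PGVECTOR_HNSW_M / _EF_CONSTRUCTION map onto these options:
CREATE INDEX IF NOT EXISTS rag_documents_embedding_hnsw
    ON rag_documents USING hnsw (embedding vector_cosine_ops)
    WITH (m = 16, ef_construction = 64);
"""

# Presumed IVFFlat alternative; EVALVAULT_RAG_PGVECTOR_INDEX_LISTS maps onto lists:
DDL_IVFFLAT = """
CREATE INDEX IF NOT EXISTS rag_documents_embedding_ivf
    ON rag_documents USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 100);
"""
```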
@@ -384,11 +499,153 @@ def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
     return [text for _, text in scored[:top_k]]
 
 
+def _rrf_fuse(
+    *,
+    bm25_results: list[Any],
+    dense_results: list[Any],
+    documents: list[str],
+    top_k: int,
+    bm25_weight: float = 0.4,
+    dense_weight: float = 0.6,
+    rrf_k: int = 60,
+) -> list[_RagHit]:
+    scores: dict[int, float] = {}
+
+    for rank, result in enumerate(bm25_results, 1):
+        doc_id = int(result.doc_id)
+        scores[doc_id] = scores.get(doc_id, 0.0) + (bm25_weight / (rrf_k + rank))
+
+    for rank, result in enumerate(dense_results, 1):
+        doc_id = int(result.doc_id)
+        scores[doc_id] = scores.get(doc_id, 0.0) + (dense_weight / (rrf_k + rank))
+
+    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+    hits: list[_RagHit] = []
+    for doc_id, score in ranked[:top_k]:
+        if 0 <= doc_id < len(documents):
+            hits.append(_RagHit(document=documents[doc_id], score=score, doc_id=doc_id))
+    return hits
+
+
+class _PgvectorDenseRetriever:
+    def __init__(self, store: Any, embedding_func: Any, documents: list[str]) -> None:
+        self._store = store
+        self._embedding_func = embedding_func
+        self._documents = documents
+
+    def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+        query_embedding = self._embedding_func([query])[0]
+        results = self._store.search(
+            source="user_guide", query_embedding=query_embedding, top_k=top_k
+        )
+        hits: list[_RagHit] = []
+        for result in results:
+            if 0 <= result.doc_id < len(self._documents):
+                hits.append(
+                    _RagHit(
+                        document=self._documents[result.doc_id],
+                        score=float(result.score),
+                        doc_id=result.doc_id,
+                    )
+                )
+        return hits
+
+
+class _PgvectorHybridRetriever:
+    def __init__(
+        self,
+        *,
+        bm25_retriever: Any,
+        store: Any,
+        embedding_func: Any,
+        documents: list[str],
+    ) -> None:
+        self._bm25 = bm25_retriever
+        self._store = store
+        self._embedding_func = embedding_func
+        self._documents = documents
+
+    def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+        bm25_results = self._bm25.search(query, top_k=len(self._documents))
+        query_embedding = self._embedding_func([query])[0]
+        dense_results = self._store.search(
+            source="user_guide", query_embedding=query_embedding, top_k=len(self._documents)
+        )
+        dense_results = sorted(dense_results, key=lambda item: item.score)
+        return _rrf_fuse(
+            bm25_results=bm25_results,
+            dense_results=dense_results,
+            documents=self._documents,
+            top_k=top_k,
+        )
+
+
+def _read_text_limited(path: Path, limit: int = 4000) -> str | None:
+    try:
+        if not path.exists():
+            return None
+        content = path.read_text(encoding="utf-8", errors="ignore")
+    except Exception as exc:
+        logger.warning("Failed to read %s: %s", path, exc)
+        return None
+    content = content.strip()
+    if not content:
+        return None
+    if len(content) > limit:
+        return content[:limit] + "..."
+    return content
+
+
+async def _build_run_context(run_id: str) -> list[str]:
+    contexts: list[str] = []
+    try:
+        summary_result = await _call_mcp_tool("get_run_summary", {"run_id": run_id})
+        payload = _extract_json_content(summary_result)
+        if isinstance(payload, dict):
+            contexts.append("[RUN 요약]\n" + _summarize_run_summary(payload))
+    except Exception as exc:
+        logger.warning("Failed to fetch run summary: %s", exc)
+
+    try:
+        artifacts_result = await _call_mcp_tool(
+            "get_artifacts", {"run_id": run_id, "kind": "analysis"}
+        )
+        payload = _extract_json_content(artifacts_result)
+        if isinstance(payload, dict):
+            contexts.append("[RUN 아티팩트]\n" + _summarize_artifacts(payload))
+            artifacts = payload.get("artifacts") or {}
+            report_path = artifacts.get("report_path")
+            if isinstance(report_path, str) and report_path:
+                report_text = _read_text_limited(Path(report_path))
+                if report_text:
+                    contexts.append("[REPORT 발췌]\n" + report_text)
+    except Exception as exc:
+        logger.warning("Failed to fetch run artifacts: %s", exc)
+
+    return contexts
+
+
 async def _rag_answer(
     user_text: str, run_id: str | None = None, category: str | None = None
 ) -> str | None:
-    retriever, _ = await _get_rag_retriever()
     contexts: list[str] = []
+    rag_llm_enabled = os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() == "true"
+    run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+
+    if run_id and rag_llm_enabled and run_context_enabled:
+        contexts.extend(await _build_run_context(run_id))
+
+    if not rag_llm_enabled and contexts:
+        return "\n\n".join(contexts[:3])
+
+    if not rag_llm_enabled:
+        content = _load_user_guide_text()
+        if content:
+            chunks = [chunk.strip() for chunk in content.split("\n\n") if chunk.strip()]
+            contexts.extend(_simple_retrieve(chunks, user_text, top_k=5))
+        return "\n\n".join(contexts[:3]) if contexts else None
+
+    retriever, _ = await _get_rag_retriever()
 
     if retriever is not None:
         results = retriever.search(user_text, top_k=5)
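`_rrf_fuse` is weighted Reciprocal Rank Fusion: each document scores the sum of `weight_source / (rrf_k + rank_source)` over the rankings it appears in, so only ranks matter and the BM25 and cosine-distance score scales never have to be reconciled. A tiny self-contained check of the arithmetic, assuming `_rrf_fuse` and `_RagHit` from the hunk above are in scope (the stub result type stands in for the real retriever results, which only need a `doc_id` attribute here):

```python
from dataclasses import dataclass

@dataclass
class _StubResult:  # stands in for BM25/pgvector result objects
    doc_id: int
    score: float = 0.0

docs = ["doc A", "doc B", "doc C"]
bm25 = [_StubResult(0), _StubResult(2)]   # A ranked 1st, C 2nd by BM25
dense = [_StubResult(2), _StubResult(1)]  # C ranked 1st, B 2nd by dense

# With bm25_weight=0.4, dense_weight=0.6, rrf_k=60:
#   doc 0: 0.4/61          ~= 0.00656
#   doc 2: 0.4/62 + 0.6/61 ~= 0.01629  <- wins by appearing in both rankings
#   doc 1: 0.6/62          ~= 0.00968
hits = _rrf_fuse(bm25_results=bm25, dense_results=dense, documents=docs, top_k=3)
print([(h.doc_id, round(h.score, 5)) for h in hits])  # doc ids in order: 2, 1, 0
```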
@@ -403,7 +660,7 @@ async def _rag_answer(
     if not contexts:
         return None
 
-    if
+    if not rag_llm_enabled:
         return "\n\n".join(contexts[:3])
 
     prompt = (
@@ -431,15 +688,24 @@ async def _rag_answer(
     if options:
         payload["options"] = options
 
-
-
-
-
-
-
-
+    fallback = "\n\n".join(contexts[:3])
+    chat_timeout = int(os.getenv("OLLAMA_CHAT_TIMEOUT_SECONDS", "180"))
+    try:
+        async with httpx.AsyncClient(timeout=chat_timeout) as client:
+            response = await client.post(
+                f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+                json=payload,
+            )
+            response.raise_for_status()
+            data = response.json()
+    except httpx.ReadTimeout:
+        logger.warning("Ollama chat timed out; returning retrieved contexts")
+        return fallback or None
+    except httpx.HTTPError as exc:
+        logger.warning("Ollama chat failed: %s", exc)
+        return fallback or None
 
-    return data.get("message", {}).get("content", "").strip() or None
+    return data.get("message", {}).get("content", "").strip() or fallback or None
@@ -665,6 +931,17 @@ async def _chat_stream(
     user_text: str, run_id: str | None = None, category: str | None = None
 ) -> AsyncGenerator[str, None]:
     started_at = time.perf_counter()
+    simple_mode = os.getenv("EVALVAULT_CHAT_SIMPLE_MODE", "false").lower() == "true"
+    run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+    if simple_mode:
+        yield _event({"type": "status", "message": "간단 채팅 처리 중..."})
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
     if category in {"result_interpretation", "improvement_direction"} and not run_id:
         yield _event(
             {
@@ -700,6 +977,7 @@ async def _chat_stream(
         _is_verb_only(user_text)
         and category in {"result_interpretation", "improvement_direction"}
         and run_id
+        and run_context_enabled
     ):
         yield _event({"type": "status", "message": "선택한 run 요약 중..."})
         try:
@@ -807,6 +1085,14 @@ async def _chat_stream(
     if tool_name == "get_artifacts" and not (tool_args.get("run_id") or run_id):
         yield _event({"type": "final", "content": "아티팩트 조회를 위해 run_id가 필요합니다."})
         return
+    if not run_context_enabled and tool_name in {"get_run_summary", "get_artifacts"}:
+        yield _event(
+            {
+                "type": "final",
+                "content": "run 요약/아티팩트 조회가 비활성화되어 있습니다.",
+            }
+        )
+        return
     if tool_name == "analyze_compare" and (
         not tool_args.get("run_id_a") or not tool_args.get("run_id_b")
     ):
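Taken together, the chat.py changes introduce a family of environment toggles, with defaults as they appear in the hunks above: `EVALVAULT_CHAT_SIMPLE_MODE` (false), `EVALVAULT_CHAT_RUN_CONTEXT_ENABLED` (true), `EVALVAULT_RAG_LLM_ENABLED` (true), `EVALVAULT_RAG_USE_HYBRID` (true), `EVALVAULT_RAG_EMBEDDING_PROFILE` (dev), `EVALVAULT_RAG_VECTOR_STORE` (pgvector), `EVALVAULT_RAG_PGVECTOR_INDEX` (hnsw), `EVALVAULT_RAG_PGVECTOR_INDEX_LISTS` (100), `EVALVAULT_RAG_PGVECTOR_HNSW_M` (16), `EVALVAULT_RAG_PGVECTOR_HNSW_EF_CONSTRUCTION` (64), plus `OLLAMA_BASE_URL` (http://localhost:11434) and `OLLAMA_CHAT_TIMEOUT_SECONDS` (180). For example, to run the chat RAG path without Postgres and without an LLM call:

```python
# Example using only toggles visible in this diff.
import os

os.environ["EVALVAULT_RAG_VECTOR_STORE"] = "memory"  # any value other than "pgvector" skips the pgvector branch
os.environ["EVALVAULT_RAG_LLM_ENABLED"] = "false"    # _rag_answer returns retrieved contexts verbatim
```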
evalvault/adapters/inbound/api/routers/domain.py

@@ -5,17 +5,22 @@ from __future__ import annotations
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
-from evalvault.adapters.outbound.domain_memory
+from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
 from evalvault.config.settings import get_settings
+from evalvault.ports.outbound.domain_memory_port import DomainMemoryPort
 
 router = APIRouter()
-
+_settings = get_settings()
+DEFAULT_MEMORY_DB_PATH = (
+    _settings.evalvault_memory_db_path if _settings.db_backend == "sqlite" else None
+)
 
 
-
-def get_memory_adapter(db_path: str = DEFAULT_MEMORY_DB_PATH) -> SQLiteDomainMemoryAdapter:
+def get_memory_adapter(db_path: str | None = DEFAULT_MEMORY_DB_PATH) -> DomainMemoryPort:
     """Get memory adapter instance."""
-
+    from pathlib import Path
+
+    return build_domain_memory_adapter(db_path=Path(db_path) if db_path else None)
 
 
 # --- Pydantic Models ---
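Note that `get_memory_adapter` now returns the `DomainMemoryPort` abstraction rather than the concrete `SQLiteDomainMemoryAdapter`, which is what lets the new Postgres adapter (`postgres_adapter.py`, +1062 lines) slot in without touching this router. The port itself is not shown in this diff; structurally it is the usual hexagonal-architecture pattern, along these lines (method names purely illustrative):

```python
# Purely illustrative port shape; DomainMemoryPort's real methods live in
# evalvault/ports/outbound/domain_memory_port.py, which is not shown here.
from typing import Protocol

class DomainMemoryPort(Protocol):
    def save_fact(self, domain: str, fact: str) -> None: ...
    def recall(self, domain: str, query: str, limit: int = 10) -> list[str]: ...

def handler(memory: DomainMemoryPort) -> list[str]:
    # The router depends only on the protocol, so the SQLite and Postgres
    # adapters are interchangeable behind build_domain_memory_adapter().
    return memory.recall("insurance", "policy terms")
```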
evalvault/adapters/inbound/api/routers/pipeline.py

@@ -8,7 +8,7 @@ from fastapi.encoders import jsonable_encoder
 from pydantic import BaseModel
 
 from evalvault.adapters.outbound.llm import get_llm_adapter
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
 from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
@@ -264,9 +264,9 @@ def _intent_label(intent_value: str) -> str:
     return meta["label"] if meta else intent.value
 
 
-def _build_pipeline_service() -> tuple[AnalysisPipelineService,
+def _build_pipeline_service() -> tuple[AnalysisPipelineService, Any]:
     settings = get_settings()
-    storage =
+    storage = build_storage_adapter(settings=settings)
     llm_adapter = None
     try:
         llm_adapter = get_llm_adapter(settings)
evalvault/adapters/inbound/api/routers/runs.py

@@ -21,7 +21,7 @@ from evalvault.adapters.outbound.dataset.templates import (
     render_dataset_template_xlsx,
 )
 from evalvault.adapters.outbound.debug.report_renderer import render_markdown
-from evalvault.adapters.outbound.domain_memory
+from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
 from evalvault.adapters.outbound.report import DashboardGenerator
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (
@@ -64,6 +64,7 @@ class RunSummaryResponse(BaseModel):
     phoenix_precision: float | None = None
     phoenix_drift: float | None = None
     phoenix_experiment_url: str | None = None
+    feedback_count: int | None = None
 
     model_config = {"from_attributes": True}
 
@@ -908,11 +909,20 @@ async def start_evaluation_endpoint(
     )
 
     try:
+        from pathlib import Path
+
         settings = get_settings()
-
+        if memory_config.get("db_path"):
+            memory_db = memory_config.get("db_path")
+        elif settings.db_backend == "sqlite":
+            memory_db = settings.evalvault_memory_db_path
+        else:
+            memory_db = None
         domain = memory_config.get("domain") or "default"
         language = memory_config.get("language") or "ko"
-        memory_adapter =
+        memory_adapter = build_domain_memory_adapter(
+            settings=settings, db_path=Path(memory_db) if memory_db else None
+        )
         hook = DomainLearningHook(memory_adapter)
         await hook.on_evaluation_complete(
             evaluation_run=result,
@@ -944,14 +954,22 @@
 def list_runs(
     adapter: AdapterDep,
     limit: int = 50,
+    offset: int = Query(0, ge=0, description="Pagination offset"),
     dataset_name: str | None = Query(None, description="Filter by dataset name"),
     model_name: str | None = Query(None, description="Filter by model name"),
+    include_feedback: bool = Query(False, description="Include feedback count"),
 ) -> list[Any]:
     """List evaluation runs."""
     from evalvault.ports.inbound.web_port import RunFilters
 
     filters = RunFilters(dataset_name=dataset_name, model_name=model_name)
-    summaries = adapter.list_runs(limit=limit, filters=filters)
+    summaries = adapter.list_runs(limit=limit, offset=offset, filters=filters)
+    feedback_counts: dict[str, int] = {}
+    if include_feedback:
+        feedback_counts = {
+            summary.run_id: adapter.get_feedback_summary(summary.run_id).total_feedback
+            for summary in summaries
+        }
 
     # Convert RunSummary dataclass to dict/Pydantic compatible format
     # The adapter returns RunSummary objects which matches our response model mostly
@@ -975,6 +993,7 @@ def list_runs(
             "phoenix_precision": s.phoenix_precision,
             "phoenix_drift": s.phoenix_drift,
             "phoenix_experiment_url": s.phoenix_experiment_url,
+            "feedback_count": feedback_counts.get(s.run_id) if include_feedback else None,
         }
         for s in summaries
     ]
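With these runs.py hunks, the listing endpoint becomes pageable and can carry per-run feedback counts in one round trip. A quick client-side sketch; the `/api/runs` mount point and host are assumptions, while the `limit`/`offset`/`include_feedback` query parameters and the `feedback_count` response field come from the hunks above:

```python
# Hypothetical client call; the route prefix is assumed, the parameters are not.
import httpx

resp = httpx.get(
    "http://localhost:8000/api/runs",
    params={"limit": 20, "offset": 40, "include_feedback": True},  # third page of 20
)
for run in resp.json():
    print(run["run_id"], run.get("feedback_count"))
```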