evalvault 1.74.0-py3-none-any.whl → 1.76.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +127 -80
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +303 -17
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +3 -4
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +188 -59
- evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/settings.py +71 -11
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.76.0.dist-info/METADATA +221 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
- evalvault-1.74.0.dist-info/METADATA +0 -585
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/routers/chat.py

@@ -8,6 +8,7 @@ import os
 import re
 import time
 from collections.abc import AsyncGenerator
+from dataclasses import dataclass
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -34,6 +35,13 @@ _RAG_TEXTS: list[str] = []
 _RAG_INITIALIZED = False


+@dataclass(frozen=True)
+class _RagHit:
+    document: str
+    score: float
+    doc_id: int
+
+
 class ChatMessage(BaseModel):
     role: str
     content: str
@@ -315,14 +323,121 @@ async def _get_rag_retriever() -> tuple[Any | None, int]:
     if not _RAG_TEXTS:
         return None, 0

-    from evalvault.adapters.outbound.nlp.korean.
-
+    from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+
+    use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+    embedding_profile = os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev")
+    vector_store = os.getenv("EVALVAULT_RAG_VECTOR_STORE", "pgvector").lower()
+    pgvector_index = os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX", "hnsw").lower()
+    pgvector_index_lists = int(os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX_LISTS", "100"))
+    pgvector_hnsw_m = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_M", "16"))
+    pgvector_hnsw_ef = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_EF_CONSTRUCTION", "64"))

-
-
-
-
-
+    def _build_conn_string() -> str | None:
+        try:
+            from evalvault.config.settings import Settings
+
+            settings = Settings()
+            if settings.postgres_connection_string:
+                return settings.postgres_connection_string
+            if settings.postgres_host:
+                return "host={host} port={port} dbname={dbname} user={user} password={password}".format(
+                    host=settings.postgres_host,
+                    port=settings.postgres_port,
+                    dbname=settings.postgres_database,
+                    user=settings.postgres_user or "postgres",
+                    password=settings.postgres_password or "",
+                )
+        except Exception as exc:
+            logger.warning("Failed to build Postgres connection string: %s", exc)
+        return None
+
+    ollama_adapter = None
+    dense_retriever = None
+    embedding_func = None
+    if embedding_profile:
+        try:
+            from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+            from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
+            from evalvault.config.settings import Settings
+
+            settings = Settings()
+            ollama_adapter = OllamaAdapter(settings)
+            dense_retriever = KoreanDenseRetriever(
+                profile=embedding_profile,
+                ollama_adapter=ollama_adapter,
+            )
+            embedding_func = dense_retriever.get_embedding_func()
+        except Exception as exc:  # pragma: no cover - runtime dependency
+            logger.warning("Failed to initialize dense retriever: %s", exc)
+
+    if vector_store == "pgvector" and embedding_func is not None:
+        conn_string = _build_conn_string()
+        if conn_string:
+            try:
+                from evalvault.adapters.outbound.nlp.korean.bm25_retriever import (
+                    KoreanBM25Retriever,
+                )
+                from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer
+                from evalvault.adapters.outbound.retriever.pgvector_store import PgvectorStore
+
+                store = PgvectorStore(
+                    conn_string,
+                    index_type=pgvector_index,
+                    index_lists=pgvector_index_lists,
+                    hnsw_m=pgvector_hnsw_m,
+                    hnsw_ef_construction=pgvector_hnsw_ef,
+                )
+                embedding_dim = (
+                    dense_retriever.dimension if dense_retriever else len(embedding_func(["x"])[0])
+                )
+                store.ensure_schema(dimension=embedding_dim)
+                source_hash = _hash_text(content)
+                existing_hash, existing_count = store.get_source_state(source="user_guide")
+                if existing_hash != source_hash or existing_count != len(_RAG_TEXTS):
+                    embeddings = embedding_func(list(_RAG_TEXTS))
+                    store.replace_documents(
+                        source="user_guide",
+                        source_hash=source_hash,
+                        documents=list(_RAG_TEXTS),
+                        embeddings=embeddings,
+                    )
+
+                tokenizer = KiwiTokenizer()
+                bm25_retriever = KoreanBM25Retriever(tokenizer=tokenizer)
+                bm25_retriever.index(list(_RAG_TEXTS))
+                if tokens and len(tokens) == len(_RAG_TEXTS):
+                    bm25_retriever._tokenized_docs = tokens
+
+                if use_hybrid:
+                    retriever = _PgvectorHybridRetriever(
+                        bm25_retriever=bm25_retriever,
+                        store=store,
+                        embedding_func=embedding_func,
+                        documents=list(_RAG_TEXTS),
+                    )
+                else:
+                    retriever = _PgvectorDenseRetriever(
+                        store=store,
+                        embedding_func=embedding_func,
+                        documents=list(_RAG_TEXTS),
+                    )
+
+                _RAG_RETRIEVER = retriever
+                return retriever, _RAG_DOCS_COUNT
+            except Exception as exc:
+                logger.warning("pgvector retriever setup failed: %s", exc)
+
+    toolkit = KoreanNLPToolkit()
+    retriever = toolkit.build_retriever(
+        list(_RAG_TEXTS),
+        use_hybrid=use_hybrid,
+        ollama_adapter=ollama_adapter,
+        embedding_profile=embedding_profile,
+        verbose=True,
+    )
+    if retriever is None:
+        return None, 0

     _RAG_RETRIEVER = retriever
     return retriever, _RAG_DOCS_COUNT
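The whole retriever selection above is driven by environment variables, with pgvector as the default vector store and hybrid retrieval on by default; the index knobs map onto pgvector's HNSW parameters (m, ef_construction) and IVFFlat's lists. A minimal sketch of the decision flow those variables produce (illustrative only; the stub embedding function and the returned labels are not part of the package):

import os

# Defaults mirror the os.getenv() calls in the hunk above.
use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
vector_store = os.getenv("EVALVAULT_RAG_VECTOR_STORE", "pgvector").lower()


def choose_retriever(embedding_func) -> str:
    # The pgvector path needs a working embedding function; otherwise the
    # code above falls back to KoreanNLPToolkit.build_retriever().
    if vector_store == "pgvector" and embedding_func is not None:
        return "pgvector-hybrid" if use_hybrid else "pgvector-dense"
    return "toolkit"


print(choose_retriever(embedding_func=lambda texts: [[0.0] * 8 for _ in texts]))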
@@ -384,11 +499,153 @@ def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
     return [text for _, text in scored[:top_k]]


+def _rrf_fuse(
+    *,
+    bm25_results: list[Any],
+    dense_results: list[Any],
+    documents: list[str],
+    top_k: int,
+    bm25_weight: float = 0.4,
+    dense_weight: float = 0.6,
+    rrf_k: int = 60,
+) -> list[_RagHit]:
+    scores: dict[int, float] = {}
+
+    for rank, result in enumerate(bm25_results, 1):
+        doc_id = int(result.doc_id)
+        scores[doc_id] = scores.get(doc_id, 0.0) + (bm25_weight / (rrf_k + rank))
+
+    for rank, result in enumerate(dense_results, 1):
+        doc_id = int(result.doc_id)
+        scores[doc_id] = scores.get(doc_id, 0.0) + (dense_weight / (rrf_k + rank))
+
+    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+    hits: list[_RagHit] = []
+    for doc_id, score in ranked[:top_k]:
+        if 0 <= doc_id < len(documents):
+            hits.append(_RagHit(document=documents[doc_id], score=score, doc_id=doc_id))
+    return hits
+
+
+class _PgvectorDenseRetriever:
+    def __init__(self, store: Any, embedding_func: Any, documents: list[str]) -> None:
+        self._store = store
+        self._embedding_func = embedding_func
+        self._documents = documents
+
+    def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+        query_embedding = self._embedding_func([query])[0]
+        results = self._store.search(
+            source="user_guide", query_embedding=query_embedding, top_k=top_k
+        )
+        hits: list[_RagHit] = []
+        for result in results:
+            if 0 <= result.doc_id < len(self._documents):
+                hits.append(
+                    _RagHit(
+                        document=self._documents[result.doc_id],
+                        score=float(result.score),
+                        doc_id=result.doc_id,
+                    )
+                )
+        return hits
+
+
+class _PgvectorHybridRetriever:
+    def __init__(
+        self,
+        *,
+        bm25_retriever: Any,
+        store: Any,
+        embedding_func: Any,
+        documents: list[str],
+    ) -> None:
+        self._bm25 = bm25_retriever
+        self._store = store
+        self._embedding_func = embedding_func
+        self._documents = documents
+
+    def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+        bm25_results = self._bm25.search(query, top_k=len(self._documents))
+        query_embedding = self._embedding_func([query])[0]
+        dense_results = self._store.search(
+            source="user_guide", query_embedding=query_embedding, top_k=len(self._documents)
+        )
+        dense_results = sorted(dense_results, key=lambda item: item.score)
+        return _rrf_fuse(
+            bm25_results=bm25_results,
+            dense_results=dense_results,
+            documents=self._documents,
+            top_k=top_k,
+        )
+
+
+def _read_text_limited(path: Path, limit: int = 4000) -> str | None:
+    try:
+        if not path.exists():
+            return None
+        content = path.read_text(encoding="utf-8", errors="ignore")
+    except Exception as exc:
+        logger.warning("Failed to read %s: %s", path, exc)
+        return None
+    content = content.strip()
+    if not content:
+        return None
+    if len(content) > limit:
+        return content[:limit] + "..."
+    return content
+
+
+async def _build_run_context(run_id: str) -> list[str]:
+    contexts: list[str] = []
+    try:
+        summary_result = await _call_mcp_tool("get_run_summary", {"run_id": run_id})
+        payload = _extract_json_content(summary_result)
+        if isinstance(payload, dict):
+            contexts.append("[RUN 요약]\n" + _summarize_run_summary(payload))
+    except Exception as exc:
+        logger.warning("Failed to fetch run summary: %s", exc)
+
+    try:
+        artifacts_result = await _call_mcp_tool(
+            "get_artifacts", {"run_id": run_id, "kind": "analysis"}
+        )
+        payload = _extract_json_content(artifacts_result)
+        if isinstance(payload, dict):
+            contexts.append("[RUN 아티팩트]\n" + _summarize_artifacts(payload))
+            artifacts = payload.get("artifacts") or {}
+            report_path = artifacts.get("report_path")
+            if isinstance(report_path, str) and report_path:
+                report_text = _read_text_limited(Path(report_path))
+                if report_text:
+                    contexts.append("[REPORT 발췌]\n" + report_text)
+    except Exception as exc:
+        logger.warning("Failed to fetch run artifacts: %s", exc)
+
+    return contexts
+
+
 async def _rag_answer(
     user_text: str, run_id: str | None = None, category: str | None = None
 ) -> str | None:
-    retriever, _ = await _get_rag_retriever()
     contexts: list[str] = []
+    rag_llm_enabled = os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() == "true"
+    run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+
+    if run_id and rag_llm_enabled and run_context_enabled:
+        contexts.extend(await _build_run_context(run_id))
+
+    if not rag_llm_enabled and contexts:
+        return "\n\n".join(contexts[:3])
+
+    if not rag_llm_enabled:
+        content = _load_user_guide_text()
+        if content:
+            chunks = [chunk.strip() for chunk in content.split("\n\n") if chunk.strip()]
+            contexts.extend(_simple_retrieve(chunks, user_text, top_k=5))
+        return "\n\n".join(contexts[:3]) if contexts else None
+
+    retriever, _ = await _get_rag_retriever()

     if retriever is not None:
         results = retriever.search(user_text, top_k=5)
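_rrf_fuse implements weighted reciprocal rank fusion: each document earns weight / (rrf_k + rank) from every ranking it appears in, so agreement between the BM25 and dense lists beats a high position in just one of them. A self-contained check with the default weights (the rankings are made up for illustration):

# Two made-up rankings over doc ids 0..2: BM25 ranks [2, 0], dense ranks [0, 2, 1].
bm25_ranking = [2, 0]
dense_ranking = [0, 2, 1]
bm25_weight, dense_weight, rrf_k = 0.4, 0.6, 60  # defaults from _rrf_fuse

scores: dict[int, float] = {}
for rank, doc_id in enumerate(bm25_ranking, 1):
    scores[doc_id] = scores.get(doc_id, 0.0) + bm25_weight / (rrf_k + rank)
for rank, doc_id in enumerate(dense_ranking, 1):
    scores[doc_id] = scores.get(doc_id, 0.0) + dense_weight / (rrf_k + rank)

# doc 0: 0.4/62 + 0.6/61 ≈ 0.01629 beats doc 2: 0.4/61 + 0.6/62 ≈ 0.01623
for doc_id, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(doc_id, round(score, 5))

Note that the hybrid search sorts the pgvector results ascending by score before fusing, which suggests the store's scores are distances (smaller is better); only the resulting rank order feeds the fusion.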
@@ -403,7 +660,7 @@ async def _rag_answer(
     if not contexts:
         return None

-    if
+    if not rag_llm_enabled:
        return "\n\n".join(contexts[:3])

     prompt = (
@@ -431,15 +688,24 @@ async def _rag_answer(
     if options:
         payload["options"] = options

-
-
-
-
-
-
-
+    fallback = "\n\n".join(contexts[:3])
+    chat_timeout = int(os.getenv("OLLAMA_CHAT_TIMEOUT_SECONDS", "180"))
+    try:
+        async with httpx.AsyncClient(timeout=chat_timeout) as client:
+            response = await client.post(
+                f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+                json=payload,
+            )
+            response.raise_for_status()
+            data = response.json()
+    except httpx.ReadTimeout:
+        logger.warning("Ollama chat timed out; returning retrieved contexts")
+        return fallback or None
+    except httpx.HTTPError as exc:
+        logger.warning("Ollama chat failed: %s", exc)
+        return fallback or None

-    return data.get("message", {}).get("content", "").strip() or None
+    return data.get("message", {}).get("content", "").strip() or fallback or None


 async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
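The LLM call now degrades gracefully: fallback (the top retrieved contexts) is returned on timeout, on any HTTP error, and even on an empty completion. A minimal standalone sketch of the same pattern against Ollama's /api/chat endpoint; the model name is an assumption, everything else mirrors the hunk:

import asyncio
import os

import httpx


async def ask_ollama(prompt: str, fallback: str | None) -> str | None:
    payload = {
        "model": "llama3",  # assumption; the real payload is built earlier in _rag_answer
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }
    chat_timeout = int(os.getenv("OLLAMA_CHAT_TIMEOUT_SECONDS", "180"))
    try:
        async with httpx.AsyncClient(timeout=chat_timeout) as client:
            response = await client.post(
                f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
                json=payload,
            )
            response.raise_for_status()
            data = response.json()
    except httpx.HTTPError:  # httpx.ReadTimeout is a subclass of HTTPError
        return fallback
    return data.get("message", {}).get("content", "").strip() or fallback


# asyncio.run(ask_ollama("What changed in 1.76.0?", fallback="(retrieved contexts)"))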
@@ -665,6 +931,17 @@ async def _chat_stream(
     user_text: str, run_id: str | None = None, category: str | None = None
 ) -> AsyncGenerator[str, None]:
     started_at = time.perf_counter()
+    simple_mode = os.getenv("EVALVAULT_CHAT_SIMPLE_MODE", "false").lower() == "true"
+    run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+    if simple_mode:
+        yield _event({"type": "status", "message": "간단 채팅 처리 중..."})
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
     if category in {"result_interpretation", "improvement_direction"} and not run_id:
         yield _event(
             {
@@ -700,6 +977,7 @@ async def _chat_stream(
         _is_verb_only(user_text)
         and category in {"result_interpretation", "improvement_direction"}
         and run_id
+        and run_context_enabled
     ):
         yield _event({"type": "status", "message": "선택한 run 요약 중..."})
         try:
@@ -807,6 +1085,14 @@ async def _chat_stream(
     if tool_name == "get_artifacts" and not (tool_args.get("run_id") or run_id):
         yield _event({"type": "final", "content": "아티팩트 조회를 위해 run_id가 필요합니다."})
         return
+    if not run_context_enabled and tool_name in {"get_run_summary", "get_artifacts"}:
+        yield _event(
+            {
+                "type": "final",
+                "content": "run 요약/아티팩트 조회가 비활성화되어 있습니다.",
+            }
+        )
+        return
     if tool_name == "analyze_compare" and (
         not tool_args.get("run_id_a") or not tool_args.get("run_id_b")
     ):
evalvault/adapters/inbound/api/routers/config.py

@@ -71,7 +71,9 @@ class ConfigUpdateRequest(BaseModel):
     phoenix_endpoint: str | None = None
     phoenix_enabled: bool | None = None
     phoenix_sample_rate: float | None = None
-
+    phoenix_project_name: str | None = None
+    phoenix_annotations_enabled: bool | None = None
+    tracker_provider: str | None = None
     postgres_host: str | None = None
     postgres_port: int | None = None
     postgres_database: str | None = None
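Because every field on ConfigUpdateRequest is optional, a client can patch just the new Phoenix/tracker settings and leave the rest untouched. A small sketch of such a partial update (field values are hypothetical; the model is trimmed to the fields shown in this hunk):

from pydantic import BaseModel


class ConfigUpdateRequest(BaseModel):
    # Trimmed copy: the real model carries many more optional fields.
    phoenix_project_name: str | None = None
    phoenix_annotations_enabled: bool | None = None
    tracker_provider: str | None = None


req = ConfigUpdateRequest(
    phoenix_project_name="evalvault-dev",  # hypothetical value
    phoenix_annotations_enabled=True,
    tracker_provider="phoenix",  # hypothetical value
)
# Only the explicitly set fields survive, which is what makes partial updates safe.
print(req.model_dump(exclude_none=True))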
evalvault/adapters/inbound/api/routers/domain.py

@@ -5,17 +5,22 @@ from __future__ import annotations
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel

-from evalvault.adapters.outbound.domain_memory
+from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
 from evalvault.config.settings import get_settings
+from evalvault.ports.outbound.domain_memory_port import DomainMemoryPort

 router = APIRouter()
-
+_settings = get_settings()
+DEFAULT_MEMORY_DB_PATH = (
+    _settings.evalvault_memory_db_path if _settings.db_backend == "sqlite" else None
+)


-
-def get_memory_adapter(db_path: str = DEFAULT_MEMORY_DB_PATH) -> SQLiteDomainMemoryAdapter:
+def get_memory_adapter(db_path: str | None = DEFAULT_MEMORY_DB_PATH) -> DomainMemoryPort:
     """Get memory adapter instance."""
-
+    from pathlib import Path
+
+    return build_domain_memory_adapter(db_path=Path(db_path) if db_path else None)


 # --- Pydantic Models ---
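get_memory_adapter now hides the backend behind DomainMemoryPort: when db_backend is "sqlite" the default path points at the memory database, otherwise db_path is None and the factory presumably selects the new Postgres adapter. A usage sketch built only from the calls visible in this diff:

from pathlib import Path

from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
from evalvault.config.settings import get_settings

settings = get_settings()

# Mirrors the router logic: a concrete Path for SQLite, None otherwise so the
# factory can pick the Postgres-backed adapter (assumed behavior; factory.py
# is new in this release but its body is not shown here).
db_path = (
    Path(settings.evalvault_memory_db_path) if settings.db_backend == "sqlite" else None
)
adapter = build_domain_memory_adapter(settings=settings, db_path=db_path)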
evalvault/adapters/inbound/api/routers/pipeline.py

@@ -8,7 +8,7 @@ from fastapi.encoders import jsonable_encoder
 from pydantic import BaseModel

 from evalvault.adapters.outbound.llm import get_llm_adapter
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
 from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs

@@ -264,9 +264,9 @@ def _intent_label(intent_value: str) -> str:
     return meta["label"] if meta else intent.value


-def _build_pipeline_service() -> tuple[AnalysisPipelineService,
+def _build_pipeline_service() -> tuple[AnalysisPipelineService, Any]:
     settings = get_settings()
-    storage =
+    storage = build_storage_adapter(settings=settings)
     llm_adapter = None
     try:
         llm_adapter = get_llm_adapter(settings)
evalvault/adapters/inbound/api/routers/runs.py

@@ -21,7 +21,7 @@ from evalvault.adapters.outbound.dataset.templates import (
     render_dataset_template_xlsx,
 )
 from evalvault.adapters.outbound.debug.report_renderer import render_markdown
-from evalvault.adapters.outbound.domain_memory
+from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
 from evalvault.adapters.outbound.report import DashboardGenerator
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (

@@ -64,6 +64,7 @@ class RunSummaryResponse(BaseModel):
     phoenix_precision: float | None = None
     phoenix_drift: float | None = None
     phoenix_experiment_url: str | None = None
+    feedback_count: int | None = None

     model_config = {"from_attributes": True}

@@ -908,11 +909,20 @@ async def start_evaluation_endpoint(
     )

     try:
+        from pathlib import Path
+
         settings = get_settings()
-
+        if memory_config.get("db_path"):
+            memory_db = memory_config.get("db_path")
+        elif settings.db_backend == "sqlite":
+            memory_db = settings.evalvault_memory_db_path
+        else:
+            memory_db = None
         domain = memory_config.get("domain") or "default"
         language = memory_config.get("language") or "ko"
-        memory_adapter =
+        memory_adapter = build_domain_memory_adapter(
+            settings=settings, db_path=Path(memory_db) if memory_db else None
+        )
         hook = DomainLearningHook(memory_adapter)
         await hook.on_evaluation_complete(
             evaluation_run=result,
@@ -944,14 +954,22 @@
 def list_runs(
     adapter: AdapterDep,
     limit: int = 50,
+    offset: int = Query(0, ge=0, description="Pagination offset"),
     dataset_name: str | None = Query(None, description="Filter by dataset name"),
     model_name: str | None = Query(None, description="Filter by model name"),
+    include_feedback: bool = Query(False, description="Include feedback count"),
 ) -> list[Any]:
     """List evaluation runs."""
     from evalvault.ports.inbound.web_port import RunFilters

     filters = RunFilters(dataset_name=dataset_name, model_name=model_name)
-    summaries = adapter.list_runs(limit=limit, filters=filters)
+    summaries = adapter.list_runs(limit=limit, offset=offset, filters=filters)
+    feedback_counts: dict[str, int] = {}
+    if include_feedback:
+        feedback_counts = {
+            summary.run_id: adapter.get_feedback_summary(summary.run_id).total_feedback
+            for summary in summaries
+        }

     # Convert RunSummary dataclass to dict/Pydantic compatible format
     # The adapter returns RunSummary objects which matches our response model mostly

@@ -975,6 +993,7 @@ def list_runs(
             "phoenix_precision": s.phoenix_precision,
             "phoenix_drift": s.phoenix_drift,
             "phoenix_experiment_url": s.phoenix_experiment_url,
+            "feedback_count": feedback_counts.get(s.run_id) if include_feedback else None,
         }
         for s in summaries
     ]
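Together, offset and include_feedback let clients page through runs and fetch feedback counts in one request. A hypothetical client call (the route prefix and host are assumptions; the response fields follow RunSummaryResponse):

import httpx

# "/api/runs" is an assumed prefix for illustration.
response = httpx.get(
    "http://localhost:8000/api/runs",
    params={"limit": 20, "offset": 40, "include_feedback": True},
)
for run in response.json():
    print(run["run_id"], run.get("feedback_count"))

Keeping include_feedback opt-in looks deliberate: each returned run costs an extra get_feedback_summary call, so the default listing stays cheap.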
evalvault/adapters/inbound/cli/commands/analyze.py

@@ -26,7 +26,8 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import to_serializabl
 from evalvault.adapters.outbound.cache import MemoryCacheAdapter
 from evalvault.adapters.outbound.llm import get_llm_adapter
 from evalvault.adapters.outbound.report import DashboardGenerator, MarkdownReportAdapter
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
+from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter
 from evalvault.config.phoenix_support import get_phoenix_trace_url
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities import EvaluationRun

@@ -115,11 +116,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
     ) -> None:
         """평가 실행 결과를 분석하고 통계 인사이트를 표시합니다."""

-
-        if resolved_db_path is None:
-            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
-            raise typer.Exit(1)
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)

         try:
             run = storage.get_run(run_id)

@@ -217,7 +214,12 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
             _save_analysis_payload(bundle.causal, "causal")
         if improvement_report is not None:
             _save_analysis_payload(improvement_report, "playbook")
-
+        storage_label = (
+            "PostgreSQL"
+            if isinstance(storage, PostgreSQLStorageAdapter)
+            else f"SQLite ({db_path})"
+        )
+        _console.print(f"\n[green]분석 결과 DB 저장: {storage_label}[/green]")

         if dashboard:
             dashboard_gen = DashboardGenerator()

@@ -359,11 +361,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
     ) -> None:
         """두 실행을 통계적으로 비교합니다."""

-
-        if resolved_db_path is None:
-            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
-            raise typer.Exit(1)
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)

         try:
             run_a = storage.get_run(run_id1)
evalvault/adapters/inbound/cli/commands/benchmark.py

@@ -385,7 +385,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
         """
         try:
             from evalvault.adapters.outbound.benchmark import LMEvalAdapter
-            from evalvault.adapters.outbound.storage import
+            from evalvault.adapters.outbound.storage.factory import build_storage_adapter
             from evalvault.config.settings import get_settings
             from evalvault.domain.services.benchmark_service import BenchmarkService
             from evalvault.ports.outbound.benchmark_port import BenchmarkBackend

@@ -426,7 +426,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
             ensure_phoenix_instrumentation(settings, console=console, force=True)

             benchmark_adapter = LMEvalAdapter(settings=settings)
-            storage_adapter =
+            storage_adapter = build_storage_adapter(settings=settings, db_path=db)
             tracer_adapter = _create_tracer_adapter(phoenix)
             service = BenchmarkService(
                 benchmark_adapter=benchmark_adapter,

@@ -556,9 +556,11 @@ def create_benchmark_app(console: Console) -> typer.Typer:
         ),
     ) -> None:
         """View past benchmark runs."""
-        from evalvault.adapters.outbound.storage import
+        from evalvault.adapters.outbound.storage.factory import build_storage_adapter
+        from evalvault.config.settings import get_settings

-
+        settings = get_settings()
+        storage = build_storage_adapter(settings=settings, db_path=db)
         runs = storage.list_benchmark_runs(
             benchmark_type=benchmark_type,
             model_name=model_name,

@@ -629,7 +631,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
             evalvault benchmark report abc123
             evalvault benchmark report abc123 -o report.md -p dev
         """
-        from evalvault.adapters.outbound.storage import
+        from evalvault.adapters.outbound.storage.factory import build_storage_adapter
         from evalvault.config.settings import get_settings
         from evalvault.domain.services.benchmark_report_service import (
             BenchmarkReportService,

@@ -639,7 +641,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
         if profile:
             settings.profile = profile

-        storage =
+        storage = build_storage_adapter(settings=settings, db_path=db)
         benchmark_run = storage.get_benchmark_run(run_id)

         if not benchmark_run:

@@ -717,7 +719,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
             evalvault benchmark compare abc123 def456
             evalvault benchmark compare abc123 def456 -o comparison.md
         """
-        from evalvault.adapters.outbound.storage import
+        from evalvault.adapters.outbound.storage.factory import build_storage_adapter
         from evalvault.config.settings import get_settings
         from evalvault.domain.services.benchmark_report_service import (
             BenchmarkReportService,

@@ -727,7 +729,7 @@ def create_benchmark_app(console: Console) -> typer.Typer:
         if profile:
             settings.profile = profile

-        storage =
+        storage = build_storage_adapter(settings=settings, db_path=db)
         baseline = storage.get_benchmark_run(baseline_id)
         target = storage.get_benchmark_run(target_id)

evalvault/adapters/inbound/cli/commands/calibrate.py

@@ -7,7 +7,7 @@ import typer
 from rich.console import Console
 from rich.table import Table

-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 from evalvault.config.settings import Settings
 from evalvault.domain.services.satisfaction_calibration_service import (
     SatisfactionCalibrationService,

@@ -36,12 +36,7 @@ def register_calibrate_commands(app: typer.Typer, console: Console) -> None:
         ),
         db_path: Path | None = db_option(help_text="DB 경로"),
     ) -> None:
-
-        if resolved_db_path is None:
-            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
-            raise typer.Exit(1)
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
         try:
             run = storage.get_run(run_id)
        except KeyError:
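A recurring theme across the CLI hunks: every command that used to require a resolved SQLite path (and exited with an error without one) now delegates to build_storage_adapter, so a missing --db no longer kills the command when Postgres is configured. The factory body is not part of this diff; a plausible sketch of the dispatch implied by the call sites, with every name outside the visible imports marked as an assumption:

from pathlib import Path

from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter


def build_storage_adapter(settings, db_path: Path | None = None):
    """Assumed dispatch only; not the actual factory.py implementation."""
    if db_path is None and settings.db_backend == "postgres":  # backend value assumed
        return PostgreSQLStorageAdapter(settings=settings)  # constructor assumed
    # SQLite adapter module path and default-path attribute are assumptions.
    from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

    return SQLiteStorageAdapter(db_path=db_path or settings.evalvault_db_path)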