evalvault 1.62.1__py3-none-any.whl → 1.63.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +88 -5
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
  10. evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
  11. evalvault/adapters/outbound/llm/__init__.py +5 -43
  12. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  13. evalvault/adapters/outbound/llm/factory.py +103 -0
  14. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  15. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  16. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  17. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  18. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  19. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  20. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  21. evalvault/adapters/outbound/storage/base_sql.py +528 -21
  22. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  23. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  24. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  25. evalvault/debug_ragas.py +7 -1
  26. evalvault/debug_ragas_real.py +5 -1
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/prompt_suggestion.py +50 -0
  29. evalvault/domain/services/__init__.py +6 -0
  30. evalvault/domain/services/evaluator.py +191 -103
  31. evalvault/domain/services/holdout_splitter.py +67 -0
  32. evalvault/domain/services/intent_classifier.py +73 -0
  33. evalvault/domain/services/pipeline_template_registry.py +3 -0
  34. evalvault/domain/services/prompt_candidate_service.py +117 -0
  35. evalvault/domain/services/prompt_registry.py +40 -2
  36. evalvault/domain/services/prompt_scoring_service.py +286 -0
  37. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  38. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  39. evalvault/ports/inbound/learning_hook_port.py +4 -1
  40. evalvault/ports/outbound/__init__.py +2 -0
  41. evalvault/ports/outbound/llm_factory_port.py +13 -0
  42. evalvault/ports/outbound/llm_port.py +34 -2
  43. evalvault/ports/outbound/storage_port.py +38 -0
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
  47. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
  48. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/llm/openai_adapter.py
@@ -10,6 +10,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions

 _DEFAULT_MAX_COMPLETION_TOKENS = 8192
 _GPT5_MAX_COMPLETION_TOKENS = 16384
@@ -73,7 +74,12 @@ class OpenAIAdapter(BaseLLMAdapter):
         """
         return self._embedding_model_name

-    async def agenerate_text(self, prompt: str) -> str:
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).

         Uses the OpenAI chat completions API directly for simple text generation.
@@ -89,18 +95,35 @@ class OpenAIAdapter(BaseLLMAdapter):
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=_max_completion_tokens_for_model(self._model_name),
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content

-    def generate_text(self, prompt: str, *, json_mode: bool = False) -> str:
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).

         Uses sync OpenAI client directly.
@@ -124,11 +147,21 @@ class OpenAIAdapter(BaseLLMAdapter):
         sync_client = OpenAI(**client_kwargs)

         # API 호출 파라미터
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
         api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens": _max_completion_tokens_for_model(self._model_name),
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed

         # JSON 모드 설정
         if json_mode:
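
The GenerationOptions object threaded through both signatures is defined in evalvault/ports/outbound/llm_port.py (+34 -2 in this release), which is not shown in these hunks. Judging only from the attributes the adapters read, a minimal sketch consistent with that usage would look like the following; the dataclass form and the defaults are assumptions, not taken from the package.

from dataclasses import dataclass


@dataclass(frozen=True)
class GenerationOptions:
    """Optional sampling controls forwarded to chat.completions.create."""

    max_tokens: int | None = None    # overrides the per-model default when set
    temperature: float | None = None
    top_p: float | None = None
    n: int | None = None             # number of completions to request
    seed: int | None = None          # best-effort reproducibility where supported


# Hypothetical call site for the updated adapter signature:
# text = await adapter.agenerate_text("Summarize the claim.", options=GenerationOptions(temperature=0.2, seed=7))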
evalvault/adapters/outbound/llm/token_aware_chat.py
@@ -149,7 +149,16 @@ class TokenTrackingAsyncOpenAI(AsyncOpenAI):

            span_attrs = _build_llm_span_attrs(provider_name, kwargs)
            with instrumentation_span("llm.chat_completion", span_attrs) as span:
-               response = await inner_self._completions.create(**kwargs)
+               try:
+                   response = await inner_self._completions.create(**kwargs)
+               except TypeError as exc:
+                   if "max_completion_tokens" in str(exc):
+                       fallback_kwargs = dict(kwargs)
+                       fallback_kwargs.pop("max_completion_tokens", None)
+                       fallback_kwargs.pop("max_tokens", None)
+                       response = await inner_self._completions.create(**fallback_kwargs)
+                   else:
+                       raise
                if provider_name == "ollama":
                    _normalize_tool_calls(response, kwargs.get("tools"))
                # Extract usage from response
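
TokenTrackingAsyncOpenAI (and ThinkingTokenTrackingAsyncOpenAI below) now catches the TypeError raised by OpenAI-compatible backends that do not accept max_completion_tokens and retries once with both token-limit keys removed. The same pattern in isolation, as a standalone sketch with a hypothetical create_fn rather than the package's wrapped client:

async def create_with_token_limit_fallback(create_fn, **kwargs):
    """Call create_fn; if the backend rejects max_completion_tokens, retry without token limits."""
    try:
        return await create_fn(**kwargs)
    except TypeError as exc:
        if "max_completion_tokens" not in str(exc):
            raise
        fallback_kwargs = dict(kwargs)
        fallback_kwargs.pop("max_completion_tokens", None)
        fallback_kwargs.pop("max_tokens", None)  # strip both variants before retrying
        return await create_fn(**fallback_kwargs)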
@@ -198,6 +207,7 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):

        async def create(inner_self, **kwargs: Any) -> Any:  # noqa: N805
            # Ensure 충분한 출력 토큰 확보 (Ollama는 max_tokens를 사용)
+           min_tokens = None
            if provider_name == "ollama":
                if "max_tokens" not in kwargs or kwargs["max_tokens"] < 4096:
                    kwargs["max_tokens"] = 16384
@@ -222,7 +232,16 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):

            span_attrs = _build_llm_span_attrs(provider_name, kwargs)
            with instrumentation_span("llm.chat_completion", span_attrs) as span:
-               response = await inner_self._completions.create(**kwargs)
+               try:
+                   response = await inner_self._completions.create(**kwargs)
+               except TypeError as exc:
+                   if "max_completion_tokens" in str(exc):
+                       fallback_kwargs = dict(kwargs)
+                       fallback_kwargs.pop("max_completion_tokens", None)
+                       fallback_kwargs.pop("max_tokens", None)
+                       response = await inner_self._completions.create(**fallback_kwargs)
+                   else:
+                       raise
                if provider_name == "ollama":
                    _normalize_tool_calls(response, kwargs.get("tools"))

evalvault/adapters/outbound/llm/vllm_adapter.py
@@ -12,6 +12,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions


 class VLLMAdapter(BaseLLMAdapter):
@@ -63,25 +64,46 @@ class VLLMAdapter(BaseLLMAdapter):
         """Get the embedding model name being used."""
         return self._embedding_model_name

-    async def agenerate_text(self, prompt: str) -> str:
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async)."""
         attrs = {
             "llm.provider": "vllm",
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens,
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=8192,
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content

-    def generate_text(self, prompt: str, *, json_mode: bool = False) -> str:
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync)."""
         sync_client = OpenAI(
             base_url=self._settings.vllm_base_url,
@@ -89,11 +111,20 @@ class VLLMAdapter(BaseLLMAdapter):
             timeout=self._settings.vllm_timeout,
         )

+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
         api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens": 8192,
+            "max_completion_tokens": max_tokens,
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         if json_mode:
             api_kwargs["response_format"] = {"type": "json_object"}

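The options-to-kwargs block above now appears in four places (the sync and async paths of both OpenAIAdapter and VLLMAdapter). A hypothetical helper, not present in the package, that would express the same mapping once:

def apply_generation_options(api_kwargs: dict, options) -> dict:
    """Copy explicitly-set sampling options into the chat.completions kwargs."""
    if options is None:
        return api_kwargs
    for key in ("temperature", "top_p", "n", "seed"):
        value = getattr(options, key, None)
        if value is not None:
            api_kwargs[key] = value
    return api_kwargs
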
evalvault/adapters/outbound/nlp/korean/toolkit_factory.py (new file)
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import logging
+
+from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort
+
+logger = logging.getLogger(__name__)
+
+
+def try_create_korean_toolkit() -> KoreanNLPToolkitPort | None:
+    try:
+        from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+    except Exception as exc:
+        logger.debug("Korean toolkit import failed: %s", exc)
+        return None
+    try:
+        return KoreanNLPToolkit()
+    except Exception as exc:
+        logger.debug("Korean toolkit init failed: %s", exc)
+        return None
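
try_create_korean_toolkit degrades gracefully: if the optional Korean NLP dependencies are missing, either the import or the constructor fails, the failure is logged at debug level, and None is returned instead of raising. A caller would typically branch on the result; this snippet is illustrative and not taken from the package:

from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit

toolkit = try_create_korean_toolkit()
if toolkit is None:
    # Continue on a language-agnostic path when the Korean toolkit is unavailable.
    print("Korean NLP toolkit unavailable; continuing without it.")
else:
    print("Korean NLP toolkit ready.")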
evalvault/adapters/outbound/report/llm_report_generator.py
@@ -366,6 +366,24 @@ DEFAULT_METRIC_PROMPT = """당신은 RAG 시스템 평가 전문가입니다.

 마크다운 형식으로 작성해주세요."""

+DEFAULT_METRIC_PROMPT_EN = """You are a RAG evaluation expert.
+
+## Target
+- Metric: {metric_name}
+- Score: {score:.3f} / 1.0
+- Threshold: {threshold:.2f}
+- Status: {status}
+
+## Request
+Provide a Markdown analysis covering:
+
+1. **Current assessment**: what this score implies
+2. **Likely causes**: plausible root causes
+3. **Actionable improvements**: practical steps the team can take
+4. **Expected impact**: anticipated gains per action
+
+Respond in Markdown."""
+

 EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입니다. 평가 결과를 분석하고 구체적인 개선 방안을 제시해주세요.

@@ -423,6 +441,60 @@ EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입

 마크다운 형식으로 작성해주세요."""

+EXECUTIVE_SUMMARY_PROMPT_EN = """You are a RAG performance improvement expert. Analyze the evaluation results and propose concrete actions.
+
+Evaluation Results:
+- Dataset: {dataset_name}
+- Model: {model_name}
+- Pass rate: {pass_rate:.1%}
+- Test cases: {total_test_cases}
+
+Metric Scores:
+{metrics_summary}
+
+Analysis Request:
+
+Provide a RAG performance improvement-focused analysis using the structure below:
+
+1) Current Summary (3 sentences)
+- Overall quality level and the most urgent issue
+- Clearly distinguish strong areas vs weak areas
+
+2) Problem Definition
+
+| Problem | Metric | Current | Target | Severity |
+|------|--------|--------|--------|--------|
+| (Specific issue) | (Related metric) | (Score) | (Target) | Critical/High/Medium |
+
+3) Root Cause Analysis
+
+For each problem:
+- Direct cause: immediate cause (e.g., "relevant context is not retrieved")
+- Root cause: structural cause (e.g., "chunk size is too large and dilutes relevance")
+- Verification: how to validate (e.g., "re-run with top_k=10")
+
+4) Solutions
+
+P0 - Immediate (1-3 days)
+For each action:
+- Action: one-line description
+- Implementation: concrete steps (including code/config changes)
+- Expected impact: quantified estimate (e.g., "faithfulness +0.15")
+
+P1 - Short term (1-2 weeks)
+Provide 2-3 actions in the same format
+
+P2 - Mid term (1 month)
+Provide 1-2 actions in the same format
+
+5) Verification Plan
+- How to measure improvement for each action
+- Monitoring indicators to prevent regressions
+
+Note: Do not give abstract advice. All suggestions must be concrete and actionable.
+
+Respond in Markdown."""
+
 SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
@@ -643,11 +715,15 @@ class LLMReportGenerator:
         threshold: float,
     ) -> LLMReportSection:
         """개별 메트릭 분석."""
-        # 프롬프트 선택
-        prompt_template = METRIC_ANALYSIS_PROMPTS.get(metric_name, DEFAULT_METRIC_PROMPT)
+        prompt_template = (
+            DEFAULT_METRIC_PROMPT_EN
+            if self._language == "en"
+            else METRIC_ANALYSIS_PROMPTS.get(metric_name, DEFAULT_METRIC_PROMPT)
+        )

-        # 상태 계산
-        status = "통과" if score >= threshold else "미달"
+        status = "pass" if score >= threshold else "fail"
+        if self._language != "en":
+            status = "통과" if score >= threshold else "미달"

         prompt = prompt_template.format(
             metric_name=metric_name,
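
DEFAULT_METRIC_PROMPT_EN uses the same placeholders as the Korean templates ({metric_name}, {score:.3f}, {threshold:.2f}, {status}), so it drops straight into the prompt_template.format(...) call above. With illustrative values only:

prompt = DEFAULT_METRIC_PROMPT_EN.format(
    metric_name="faithfulness",
    score=0.712,      # rendered as 0.712 by {score:.3f}
    threshold=0.80,   # rendered as 0.80 by {threshold:.2f}
    status="fail",
)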
@@ -683,11 +759,19 @@
         for metric, score in metrics_scores.items():
             threshold = thresholds.get(metric, 0.7)
             status = "✅" if score >= threshold else "❌"
-            metrics_lines.append(f"- {metric}: {score:.3f} (임계값: {threshold:.2f}) {status}")
+            if self._language == "en":
+                metrics_lines.append(
+                    f"- {metric}: {score:.3f} (threshold: {threshold:.2f}) {status}"
+                )
+            else:
+                metrics_lines.append(f"- {metric}: {score:.3f} (임계값: {threshold:.2f}) {status}")

         metrics_summary = "\n".join(metrics_lines)

-        prompt = EXECUTIVE_SUMMARY_PROMPT.format(
+        prompt_template = (
+            EXECUTIVE_SUMMARY_PROMPT_EN if self._language == "en" else EXECUTIVE_SUMMARY_PROMPT
+        )
+        prompt = prompt_template.format(
             dataset_name=run.dataset_name,
             model_name=run.model_name,
             pass_rate=run.pass_rate,
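
With the language switch in place, an English run formats EXECUTIVE_SUMMARY_PROMPT_EN with the same keyword set the Korean template expects (dataset_name, model_name, pass_rate, total_test_cases, metrics_summary). A standalone illustration with made-up values; the real call pulls these from the run object and the computed metrics_summary, as in the hunk above:

prompt = EXECUTIVE_SUMMARY_PROMPT_EN.format(
    dataset_name="insurance-qa-v2",   # illustrative values only
    model_name="gpt-4o-mini",
    pass_rate=0.82,                   # rendered as 82.0% by {pass_rate:.1%}
    total_test_cases=50,
    metrics_summary="- faithfulness: 0.712 (threshold: 0.80) ❌",
)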