evalvault 1.62.1__py3-none-any.whl → 1.63.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/llm/token_aware_chat.py

@@ -149,7 +149,16 @@ class TokenTrackingAsyncOpenAI(AsyncOpenAI):
 
             span_attrs = _build_llm_span_attrs(provider_name, kwargs)
             with instrumentation_span("llm.chat_completion", span_attrs) as span:
-                response = await inner_self._completions.create(**kwargs)
+                try:
+                    response = await inner_self._completions.create(**kwargs)
+                except TypeError as exc:
+                    if "max_completion_tokens" in str(exc):
+                        fallback_kwargs = dict(kwargs)
+                        fallback_kwargs.pop("max_completion_tokens", None)
+                        fallback_kwargs.pop("max_tokens", None)
+                        response = await inner_self._completions.create(**fallback_kwargs)
+                    else:
+                        raise
                 if provider_name == "ollama":
                     _normalize_tool_calls(response, kwargs.get("tools"))
                 # Extract usage from response
@@ -198,6 +207,7 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):
 
         async def create(inner_self, **kwargs: Any) -> Any:  # noqa: N805
             # Ensure 충분한 출력 토큰 확보 (Ollama는 max_tokens를 사용)
+            min_tokens = None
            if provider_name == "ollama":
                 if "max_tokens" not in kwargs or kwargs["max_tokens"] < 4096:
                     kwargs["max_tokens"] = 16384
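
The only functional change in this hunk is the `min_tokens = None` initializer; the surrounding unchanged lines (whose Korean comment reads roughly "ensure enough output tokens; Ollama uses max_tokens") already force a generous `max_tokens` for Ollama requests. A minimal standalone sketch of that floor logic, with illustrative names that are not part of evalvault:

```python
from typing import Any


def ensure_ollama_token_floor(kwargs: dict[str, Any], provider_name: str) -> dict[str, Any]:
    """Mirror of the unchanged context lines: bump small max_tokens values for Ollama."""
    if provider_name == "ollama":
        # Anything below 4096 is treated as too small and raised to 16384,
        # matching the constants visible in the diff.
        if "max_tokens" not in kwargs or kwargs["max_tokens"] < 4096:
            kwargs["max_tokens"] = 16384
    return kwargs


print(ensure_ollama_token_floor({"max_tokens": 1024}, "ollama"))  # {'max_tokens': 16384}
```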
@@ -222,7 +232,16 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):
 
             span_attrs = _build_llm_span_attrs(provider_name, kwargs)
             with instrumentation_span("llm.chat_completion", span_attrs) as span:
-                response = await inner_self._completions.create(**kwargs)
+                try:
+                    response = await inner_self._completions.create(**kwargs)
+                except TypeError as exc:
+                    if "max_completion_tokens" in str(exc):
+                        fallback_kwargs = dict(kwargs)
+                        fallback_kwargs.pop("max_completion_tokens", None)
+                        fallback_kwargs.pop("max_tokens", None)
+                        response = await inner_self._completions.create(**fallback_kwargs)
+                    else:
+                        raise
                 if provider_name == "ollama":
                     _normalize_tool_calls(response, kwargs.get("tools"))
 
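
Taken together, the two hunks above wrap every `completions.create` call in the same guard: if the backend raises a `TypeError` mentioning `max_completion_tokens`, the call is retried once with both token-limit kwargs stripped. A standalone sketch of that pattern, assuming a generic awaitable `call_backend` stand-in rather than the real OpenAI client:

```python
from typing import Any, Awaitable, Callable


async def create_with_token_kwarg_fallback(
    call_backend: Callable[..., Awaitable[Any]],  # stand-in for completions.create
    **kwargs: Any,
) -> Any:
    """Retry once without token-limit kwargs when max_completion_tokens is rejected."""
    try:
        return await call_backend(**kwargs)
    except TypeError as exc:
        if "max_completion_tokens" not in str(exc):
            raise
        # Older OpenAI-compatible servers/clients may not accept max_completion_tokens,
        # so drop both token-limit kwargs and retry with the server defaults.
        fallback_kwargs = dict(kwargs)
        fallback_kwargs.pop("max_completion_tokens", None)
        fallback_kwargs.pop("max_tokens", None)
        return await call_backend(**fallback_kwargs)
```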
evalvault/adapters/outbound/llm/vllm_adapter.py

@@ -12,6 +12,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions
 
 
 class VLLMAdapter(BaseLLMAdapter):
@@ -63,25 +64,46 @@ class VLLMAdapter(BaseLLMAdapter):
         """Get the embedding model name being used."""
         return self._embedding_model_name
 
-    async def agenerate_text(
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async)."""
         attrs = {
             "llm.provider": "vllm",
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens,
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=8192,
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content
 
-    def generate_text(
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync)."""
         sync_client = OpenAI(
             base_url=self._settings.vllm_base_url,
@@ -89,11 +111,20 @@ class VLLMAdapter(BaseLLMAdapter):
             timeout=self._settings.vllm_timeout,
         )
 
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
         api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens":
+            "max_completion_tokens": max_tokens,
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         if json_mode:
             api_kwargs["response_format"] = {"type": "json_object"}
 
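
With these hunks, both `VLLMAdapter.agenerate_text` and `generate_text` accept an optional `GenerationOptions` (imported from `llm_port`, which this release extends) and forward `max_tokens`, `temperature`, `top_p`, `n`, and `seed` to the OpenAI-compatible endpoint, defaulting `max_completion_tokens` to 8192. A hedged usage sketch; the keyword constructor of `GenerationOptions` and the adapter variable are assumptions based only on the attributes read in the diff:

```python
import asyncio

from evalvault.ports.outbound.llm_port import GenerationOptions


async def summarize_with_options(adapter) -> str:
    # Keyword construction is assumed; the diff only shows these attributes being read.
    options = GenerationOptions(max_tokens=2048, temperature=0.2, top_p=0.9, seed=42)
    return await adapter.agenerate_text("Summarize the retrieval failures.", options=options)


# asyncio.run(summarize_with_options(VLLMAdapter(settings)))  # adapter construction depends on Settings
```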
evalvault/adapters/outbound/nlp/korean/toolkit_factory.py (new file)

@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import logging
+
+from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort
+
+logger = logging.getLogger(__name__)
+
+
+def try_create_korean_toolkit() -> KoreanNLPToolkitPort | None:
+    try:
+        from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+    except Exception as exc:
+        logger.debug("Korean toolkit import failed: %s", exc)
+        return None
+    try:
+        return KoreanNLPToolkit()
+    except Exception as exc:
+        logger.debug("Korean toolkit init failed: %s", exc)
+        return None
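
`try_create_korean_toolkit` converts both import and initialization failures into a `None` return (logged at debug level), so callers can treat the Korean NLP toolkit as optional. A small usage sketch; the fallback message is illustrative:

```python
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit

toolkit = try_create_korean_toolkit()
if toolkit is None:
    # The optional Korean NLP dependencies are missing or failed to initialize;
    # continue with language-agnostic processing instead of raising.
    print("Korean NLP toolkit unavailable; falling back")
else:
    print(f"Korean NLP toolkit ready: {type(toolkit).__name__}")
```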
evalvault/adapters/outbound/report/llm_report_generator.py

@@ -366,6 +366,24 @@ DEFAULT_METRIC_PROMPT = """당신은 RAG 시스템 평가 전문가입니다.
 
 마크다운 형식으로 작성해주세요."""
 
+DEFAULT_METRIC_PROMPT_EN = """You are a RAG evaluation expert.
+
+## Target
+- Metric: {metric_name}
+- Score: {score:.3f} / 1.0
+- Threshold: {threshold:.2f}
+- Status: {status}
+
+## Request
+Provide a Markdown analysis covering:
+
+1. **Current assessment**: what this score implies
+2. **Likely causes**: plausible root causes
+3. **Actionable improvements**: practical steps the team can take
+4. **Expected impact**: anticipated gains per action
+
+Respond in Markdown."""
+
 
 EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입니다. 평가 결과를 분석하고 구체적인 개선 방안을 제시해주세요.
 
@@ -423,6 +441,60 @@ EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입
 
 마크다운 형식으로 작성해주세요."""
 
+EXECUTIVE_SUMMARY_PROMPT_EN = """You are a RAG performance improvement expert. Analyze the evaluation results and propose concrete actions.
+
+Evaluation Results:
+- Dataset: {dataset_name}
+- Model: {model_name}
+- Pass rate: {pass_rate:.1%}
+- Test cases: {total_test_cases}
+
+Metric Scores:
+{metrics_summary}
+
+Analysis Request:
+
+Provide a RAG performance improvement-focused analysis using the structure below:
+
+1) Current Summary (3 sentences)
+- Overall quality level and the most urgent issue
+- Clearly distinguish strong areas vs weak areas
+
+2) Problem Definition
+
+| Problem | Metric | Current | Target | Severity |
+|------|--------|--------|--------|--------|
+| (Specific issue) | (Related metric) | (Score) | (Target) | Critical/High/Medium |
+
+3) Root Cause Analysis
+
+For each problem:
+- Direct cause: immediate cause (e.g., "relevant context is not retrieved")
+- Root cause: structural cause (e.g., "chunk size is too large and dilutes relevance")
+- Verification: how to validate (e.g., "re-run with top_k=10")
+
+4) Solutions
+
+P0 - Immediate (1-3 days)
+For each action:
+- Action: one-line description
+- Implementation: concrete steps (including code/config changes)
+- Expected impact: quantified estimate (e.g., "faithfulness +0.15")
+
+P1 - Short term (1-2 weeks)
+Provide 2-3 actions in the same format
+
+P2 - Mid term (1 month)
+Provide 1-2 actions in the same format
+
+5) Verification Plan
+- How to measure improvement for each action
+- Monitoring indicators to prevent regressions
+
+Note: Do not give abstract advice. All suggestions must be concrete and actionable.
+
+Respond in Markdown."""
+
 SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
@@ -643,11 +715,15 @@ class LLMReportGenerator:
         threshold: float,
     ) -> LLMReportSection:
         """개별 메트릭 분석."""
-
-
+        prompt_template = (
+            DEFAULT_METRIC_PROMPT_EN
+            if self._language == "en"
+            else METRIC_ANALYSIS_PROMPTS.get(metric_name, DEFAULT_METRIC_PROMPT)
+        )
 
-
-
+        status = "pass" if score >= threshold else "fail"
+        if self._language != "en":
+            status = "통과" if score >= threshold else "미달"
 
         prompt = prompt_template.format(
             metric_name=metric_name,
@@ -683,11 +759,19 @@ class LLMReportGenerator:
         for metric, score in metrics_scores.items():
             threshold = thresholds.get(metric, 0.7)
             status = "✅" if score >= threshold else "❌"
-
+            if self._language == "en":
+                metrics_lines.append(
+                    f"- {metric}: {score:.3f} (threshold: {threshold:.2f}) {status}"
+                )
+            else:
+                metrics_lines.append(f"- {metric}: {score:.3f} (임계값: {threshold:.2f}) {status}")
 
         metrics_summary = "\n".join(metrics_lines)
 
-
+        prompt_template = (
+            EXECUTIVE_SUMMARY_PROMPT_EN if self._language == "en" else EXECUTIVE_SUMMARY_PROMPT
+        )
+        prompt = prompt_template.format(
             dataset_name=run.dataset_name,
             model_name=run.model_name,
             pass_rate=run.pass_rate,
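
Across the four report-generator hunks, an internal `self._language` flag decides whether the new English templates (`DEFAULT_METRIC_PROMPT_EN`, `EXECUTIVE_SUMMARY_PROMPT_EN`) or the original Korean ones are used, and the pass/fail status string switches between "pass"/"fail" and "통과"/"미달" accordingly. A standalone sketch of that selection and formatting step; the function name, parameters, and the shortened template below are illustrative, not evalvault API:

```python
def render_metric_analysis(
    metric_name: str,
    score: float,
    threshold: float,
    language: str,
    metric_prompts: dict[str, str],
    default_prompt_ko: str,
    default_prompt_en: str,
) -> str:
    """Sketch of the selection logic added to LLMReportGenerator._analyze_metric."""
    prompt_template = (
        default_prompt_en
        if language == "en"
        else metric_prompts.get(metric_name, default_prompt_ko)
    )
    status = "pass" if score >= threshold else "fail"
    if language != "en":
        status = "통과" if score >= threshold else "미달"
    return prompt_template.format(
        metric_name=metric_name, score=score, threshold=threshold, status=status
    )


# Shortened stand-in for DEFAULT_METRIC_PROMPT_EN using the same format fields:
template_en = "Metric: {metric_name}, Score: {score:.3f}, Threshold: {threshold:.2f}, Status: {status}"
print(render_metric_analysis("faithfulness", 0.82, 0.90, "en", {}, "", template_en))
```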