evalvault 1.62.1__py3-none-any.whl → 1.63.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +88 -5
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
  10. evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
  11. evalvault/adapters/outbound/llm/__init__.py +5 -43
  12. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  13. evalvault/adapters/outbound/llm/factory.py +103 -0
  14. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  15. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  16. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  17. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  18. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  19. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  20. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  21. evalvault/adapters/outbound/storage/base_sql.py +528 -21
  22. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  23. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  24. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  25. evalvault/debug_ragas.py +7 -1
  26. evalvault/debug_ragas_real.py +5 -1
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/prompt_suggestion.py +50 -0
  29. evalvault/domain/services/__init__.py +6 -0
  30. evalvault/domain/services/evaluator.py +191 -103
  31. evalvault/domain/services/holdout_splitter.py +67 -0
  32. evalvault/domain/services/intent_classifier.py +73 -0
  33. evalvault/domain/services/pipeline_template_registry.py +3 -0
  34. evalvault/domain/services/prompt_candidate_service.py +117 -0
  35. evalvault/domain/services/prompt_registry.py +40 -2
  36. evalvault/domain/services/prompt_scoring_service.py +286 -0
  37. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  38. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  39. evalvault/ports/inbound/learning_hook_port.py +4 -1
  40. evalvault/ports/outbound/__init__.py +2 -0
  41. evalvault/ports/outbound/llm_factory_port.py +13 -0
  42. evalvault/ports/outbound/llm_port.py +34 -2
  43. evalvault/ports/outbound/storage_port.py +38 -0
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
  47. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
  48. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/llm/openai_adapter.py
@@ -10,6 +10,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions

 _DEFAULT_MAX_COMPLETION_TOKENS = 8192
 _GPT5_MAX_COMPLETION_TOKENS = 16384
@@ -73,7 +74,12 @@ class OpenAIAdapter(BaseLLMAdapter):
         """
         return self._embedding_model_name

-    async def agenerate_text(self, prompt: str) -> str:
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).

         Uses the OpenAI chat completions API directly for simple text generation.
@@ -89,18 +95,35 @@ class OpenAIAdapter(BaseLLMAdapter):
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=_max_completion_tokens_for_model(self._model_name),
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content

-    def generate_text(self, prompt: str, *, json_mode: bool = False) -> str:
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).

         Uses sync OpenAI client directly.
@@ -124,11 +147,21 @@ class OpenAIAdapter(BaseLLMAdapter):
         sync_client = OpenAI(**client_kwargs)

         # API 호출 파라미터
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
         api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens": _max_completion_tokens_for_model(self._model_name),
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed

         # JSON 모드 설정
         if json_mode:
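
The GenerationOptions object threaded through both signatures is defined in evalvault/ports/outbound/llm_port.py (+34 -2 in this release), which is not shown in these hunks. Judging only from the attributes the adapters read, a minimal sketch consistent with that usage would look like the following; the dataclass form and the defaults are assumptions, not taken from the package.

from dataclasses import dataclass


@dataclass(frozen=True)
class GenerationOptions:
    """Optional sampling controls forwarded to chat.completions.create."""

    max_tokens: int | None = None    # overrides the per-model default when set
    temperature: float | None = None
    top_p: float | None = None
    n: int | None = None             # number of completions to request
    seed: int | None = None          # best-effort reproducibility where supported


# Hypothetical call site for the updated adapter signature:
# text = await adapter.agenerate_text("Summarize the claim.", options=GenerationOptions(temperature=0.2, seed=7))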
evalvault/adapters/outbound/llm/token_aware_chat.py
@@ -149,7 +149,16 @@ class TokenTrackingAsyncOpenAI(AsyncOpenAI):

            span_attrs = _build_llm_span_attrs(provider_name, kwargs)
            with instrumentation_span("llm.chat_completion", span_attrs) as span:
-               response = await inner_self._completions.create(**kwargs)
+               try:
+                   response = await inner_self._completions.create(**kwargs)
+               except TypeError as exc:
+                   if "max_completion_tokens" in str(exc):
+                       fallback_kwargs = dict(kwargs)
+                       fallback_kwargs.pop("max_completion_tokens", None)
+                       fallback_kwargs.pop("max_tokens", None)
+                       response = await inner_self._completions.create(**fallback_kwargs)
+                   else:
+                       raise
                if provider_name == "ollama":
                    _normalize_tool_calls(response, kwargs.get("tools"))
                # Extract usage from response
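
TokenTrackingAsyncOpenAI (and ThinkingTokenTrackingAsyncOpenAI below) now catches the TypeError raised by OpenAI-compatible backends that do not accept max_completion_tokens and retries once with both token-limit keys removed. The same pattern in isolation, as a standalone sketch with a hypothetical create_fn rather than the package's wrapped client:

async def create_with_token_limit_fallback(create_fn, **kwargs):
    """Call create_fn; if the backend rejects max_completion_tokens, retry without token limits."""
    try:
        return await create_fn(**kwargs)
    except TypeError as exc:
        if "max_completion_tokens" not in str(exc):
            raise
        fallback_kwargs = dict(kwargs)
        fallback_kwargs.pop("max_completion_tokens", None)
        fallback_kwargs.pop("max_tokens", None)  # strip both variants before retrying
        return await create_fn(**fallback_kwargs)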
@@ -198,6 +207,7 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):

        async def create(inner_self, **kwargs: Any) -> Any:  # noqa: N805
            # Ensure 충분한 출력 토큰 확보 (Ollama는 max_tokens를 사용)
+           min_tokens = None
            if provider_name == "ollama":
                if "max_tokens" not in kwargs or kwargs["max_tokens"] < 4096:
                    kwargs["max_tokens"] = 16384
@@ -222,7 +232,16 @@ class ThinkingTokenTrackingAsyncOpenAI(TokenTrackingAsyncOpenAI):

            span_attrs = _build_llm_span_attrs(provider_name, kwargs)
            with instrumentation_span("llm.chat_completion", span_attrs) as span:
-               response = await inner_self._completions.create(**kwargs)
+               try:
+                   response = await inner_self._completions.create(**kwargs)
+               except TypeError as exc:
+                   if "max_completion_tokens" in str(exc):
+                       fallback_kwargs = dict(kwargs)
+                       fallback_kwargs.pop("max_completion_tokens", None)
+                       fallback_kwargs.pop("max_tokens", None)
+                       response = await inner_self._completions.create(**fallback_kwargs)
+                   else:
+                       raise
                if provider_name == "ollama":
                    _normalize_tool_calls(response, kwargs.get("tools"))

evalvault/adapters/outbound/llm/vllm_adapter.py
@@ -12,6 +12,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions


 class VLLMAdapter(BaseLLMAdapter):
@@ -63,25 +64,46 @@ class VLLMAdapter(BaseLLMAdapter):
         """Get the embedding model name being used."""
         return self._embedding_model_name

-    async def agenerate_text(self, prompt: str) -> str:
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async)."""
         attrs = {
             "llm.provider": "vllm",
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens,
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=8192,
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content

-    def generate_text(self, prompt: str, *, json_mode: bool = False) -> str:
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync)."""
         sync_client = OpenAI(
             base_url=self._settings.vllm_base_url,
@@ -89,11 +111,20 @@ class VLLMAdapter(BaseLLMAdapter):
             timeout=self._settings.vllm_timeout,
         )

+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
         api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens": 8192,
+            "max_completion_tokens": max_tokens,
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         if json_mode:
             api_kwargs["response_format"] = {"type": "json_object"}

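The options-to-kwargs block above now appears in four places (the sync and async paths of both OpenAIAdapter and VLLMAdapter). A hypothetical helper, not present in the package, that would express the same mapping once:

def apply_generation_options(api_kwargs: dict, options) -> dict:
    """Copy explicitly-set sampling options into the chat.completions kwargs."""
    if options is None:
        return api_kwargs
    for key in ("temperature", "top_p", "n", "seed"):
        value = getattr(options, key, None)
        if value is not None:
            api_kwargs[key] = value
    return api_kwargs
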
evalvault/adapters/outbound/nlp/korean/toolkit_factory.py (new file)
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import logging
+
+from evalvault.ports.outbound.korean_nlp_port import KoreanNLPToolkitPort
+
+logger = logging.getLogger(__name__)
+
+
+def try_create_korean_toolkit() -> KoreanNLPToolkitPort | None:
+    try:
+        from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+    except Exception as exc:
+        logger.debug("Korean toolkit import failed: %s", exc)
+        return None
+    try:
+        return KoreanNLPToolkit()
+    except Exception as exc:
+        logger.debug("Korean toolkit init failed: %s", exc)
+        return None
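
try_create_korean_toolkit degrades gracefully: if the optional Korean NLP dependencies are missing, either the import or the constructor fails, the failure is logged at debug level, and None is returned instead of raising. A caller would typically branch on the result; this snippet is illustrative and not taken from the package:

from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit

toolkit = try_create_korean_toolkit()
if toolkit is None:
    # Continue on a language-agnostic path when the Korean toolkit is unavailable.
    print("Korean NLP toolkit unavailable; continuing without it.")
else:
    print("Korean NLP toolkit ready.")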
evalvault/adapters/outbound/report/llm_report_generator.py
@@ -366,6 +366,24 @@ DEFAULT_METRIC_PROMPT = """당신은 RAG 시스템 평가 전문가입니다.

 마크다운 형식으로 작성해주세요."""

+DEFAULT_METRIC_PROMPT_EN = """You are a RAG evaluation expert.
+
+## Target
+- Metric: {metric_name}
+- Score: {score:.3f} / 1.0
+- Threshold: {threshold:.2f}
+- Status: {status}
+
+## Request
+Provide a Markdown analysis covering:
+
+1. **Current assessment**: what this score implies
+2. **Likely causes**: plausible root causes
+3. **Actionable improvements**: practical steps the team can take
+4. **Expected impact**: anticipated gains per action
+
+Respond in Markdown."""
+

 EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입니다. 평가 결과를 분석하고 구체적인 개선 방안을 제시해주세요.

@@ -423,6 +441,60 @@ EXECUTIVE_SUMMARY_PROMPT = """당신은 RAG 시스템 성능 개선 전문가입

 마크다운 형식으로 작성해주세요."""

+EXECUTIVE_SUMMARY_PROMPT_EN = """You are a RAG performance improvement expert. Analyze the evaluation results and propose concrete actions.
+
+Evaluation Results:
+- Dataset: {dataset_name}
+- Model: {model_name}
+- Pass rate: {pass_rate:.1%}
+- Test cases: {total_test_cases}
+
+Metric Scores:
+{metrics_summary}
+
+Analysis Request:
+
+Provide a RAG performance improvement-focused analysis using the structure below:
+
+1) Current Summary (3 sentences)
+- Overall quality level and the most urgent issue
+- Clearly distinguish strong areas vs weak areas
+
+2) Problem Definition
+
+| Problem | Metric | Current | Target | Severity |
+|------|--------|--------|--------|--------|
+| (Specific issue) | (Related metric) | (Score) | (Target) | Critical/High/Medium |
+
+3) Root Cause Analysis
+
+For each problem:
+- Direct cause: immediate cause (e.g., "relevant context is not retrieved")
+- Root cause: structural cause (e.g., "chunk size is too large and dilutes relevance")
+- Verification: how to validate (e.g., "re-run with top_k=10")
+
+4) Solutions
+
+P0 - Immediate (1-3 days)
+For each action:
+- Action: one-line description
+- Implementation: concrete steps (including code/config changes)
+- Expected impact: quantified estimate (e.g., "faithfulness +0.15")
+
+P1 - Short term (1-2 weeks)
+Provide 2-3 actions in the same format
+
+P2 - Mid term (1 month)
+Provide 1-2 actions in the same format
+
+5) Verification Plan
+- How to measure improvement for each action
+- Monitoring indicators to prevent regressions
+
+Note: Do not give abstract advice. All suggestions must be concrete and actionable.
+
+Respond in Markdown."""
+
 SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
@@ -643,11 +715,15 @@ class LLMReportGenerator:
         threshold: float,
     ) -> LLMReportSection:
         """개별 메트릭 분석."""
-        # 프롬프트 선택
-        prompt_template = METRIC_ANALYSIS_PROMPTS.get(metric_name, DEFAULT_METRIC_PROMPT)
+        prompt_template = (
+            DEFAULT_METRIC_PROMPT_EN
+            if self._language == "en"
+            else METRIC_ANALYSIS_PROMPTS.get(metric_name, DEFAULT_METRIC_PROMPT)
+        )

-        # 상태 계산
-        status = "통과" if score >= threshold else "미달"
+        status = "pass" if score >= threshold else "fail"
+        if self._language != "en":
+            status = "통과" if score >= threshold else "미달"

         prompt = prompt_template.format(
             metric_name=metric_name,
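
DEFAULT_METRIC_PROMPT_EN uses the same placeholders as the Korean templates ({metric_name}, {score:.3f}, {threshold:.2f}, {status}), so it drops straight into the prompt_template.format(...) call above. With illustrative values only:

prompt = DEFAULT_METRIC_PROMPT_EN.format(
    metric_name="faithfulness",
    score=0.712,      # rendered as 0.712 by {score:.3f}
    threshold=0.80,   # rendered as 0.80 by {threshold:.2f}
    status="fail",
)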
@@ -683,11 +759,19 @@
         for metric, score in metrics_scores.items():
             threshold = thresholds.get(metric, 0.7)
             status = "✅" if score >= threshold else "❌"
-            metrics_lines.append(f"- {metric}: {score:.3f} (임계값: {threshold:.2f}) {status}")
+            if self._language == "en":
+                metrics_lines.append(
+                    f"- {metric}: {score:.3f} (threshold: {threshold:.2f}) {status}"
+                )
+            else:
+                metrics_lines.append(f"- {metric}: {score:.3f} (임계값: {threshold:.2f}) {status}")

         metrics_summary = "\n".join(metrics_lines)

-        prompt = EXECUTIVE_SUMMARY_PROMPT.format(
+        prompt_template = (
+            EXECUTIVE_SUMMARY_PROMPT_EN if self._language == "en" else EXECUTIVE_SUMMARY_PROMPT
+        )
+        prompt = prompt_template.format(
             dataset_name=run.dataset_name,
             model_name=run.model_name,
             pass_rate=run.pass_rate,
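
With the language switch in place, an English run formats EXECUTIVE_SUMMARY_PROMPT_EN with the same keyword set the Korean template expects (dataset_name, model_name, pass_rate, total_test_cases, metrics_summary). A standalone illustration with made-up values; the real call pulls these from the run object and the computed metrics_summary, as in the hunk above:

prompt = EXECUTIVE_SUMMARY_PROMPT_EN.format(
    dataset_name="insurance-qa-v2",   # illustrative values only
    model_name="gpt-4o-mini",
    pass_rate=0.82,                   # rendered as 82.0% by {pass_rate:.1%}
    total_test_cases=50,
    metrics_summary="- faithfulness: 0.712 (threshold: 0.80) ❌",
)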