evalvault 1.62.0__py3-none-any.whl → 1.63.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +43 -2
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/llm/__init__.py +5 -43
  10. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  11. evalvault/adapters/outbound/llm/factory.py +103 -0
  12. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  13. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  14. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  15. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  16. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  17. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  18. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  19. evalvault/adapters/outbound/storage/base_sql.py +527 -21
  20. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  21. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  22. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  23. evalvault/debug_ragas.py +7 -1
  24. evalvault/debug_ragas_real.py +5 -1
  25. evalvault/domain/entities/__init__.py +10 -0
  26. evalvault/domain/entities/prompt_suggestion.py +50 -0
  27. evalvault/domain/services/__init__.py +6 -0
  28. evalvault/domain/services/evaluator.py +191 -103
  29. evalvault/domain/services/holdout_splitter.py +67 -0
  30. evalvault/domain/services/intent_classifier.py +73 -0
  31. evalvault/domain/services/pipeline_template_registry.py +3 -0
  32. evalvault/domain/services/prompt_candidate_service.py +117 -0
  33. evalvault/domain/services/prompt_registry.py +40 -2
  34. evalvault/domain/services/prompt_scoring_service.py +286 -0
  35. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  36. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  37. evalvault/ports/inbound/learning_hook_port.py +4 -1
  38. evalvault/ports/outbound/__init__.py +2 -0
  39. evalvault/ports/outbound/llm_factory_port.py +13 -0
  40. evalvault/ports/outbound/llm_port.py +34 -2
  41. evalvault/ports/outbound/storage_port.py +38 -0
  42. {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
  43. {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
  44. {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
  45. {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
  46. {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/domain/services/prompt_suggestion_reporter.py
@@ -0,0 +1,277 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any
+
+ from evalvault.domain.entities.prompt_suggestion import (
+     PromptCandidate,
+     PromptCandidateSampleScore,
+     PromptCandidateScore,
+     PromptSuggestionResult,
+ )
+ from evalvault.ports.outbound.storage_port import StoragePort
+
+
+ def _serialize_sample_score(sample: PromptCandidateSampleScore) -> dict[str, Any]:
+     return {
+         "sample_index": sample.sample_index,
+         "scores": dict(sample.scores),
+         "weighted_score": sample.weighted_score,
+         "responses": list(sample.responses),
+     }
+
+
+ class PromptSuggestionReporter:
+     def render_json(self, result: PromptSuggestionResult) -> dict[str, Any]:
+         score_map = {score.candidate_id: score for score in result.scores}
+         candidates_payload = [
+             self._serialize_candidate(candidate, score_map) for candidate in result.candidates
+         ]
+         return {
+             "run_id": result.run_id,
+             "role": result.role,
+             "metrics": list(result.metrics),
+             "weights": dict(result.weights),
+             "candidates": candidates_payload,
+             "ranking": list(result.ranking),
+             "holdout_ratio": result.holdout_ratio,
+             "metadata": dict(result.metadata),
+         }
+
+     def render_markdown(self, result: PromptSuggestionResult) -> str:
+         score_map = {score.candidate_id: score for score in result.scores}
+         lines = [
+             "# 프롬프트 추천 결과",
+             "",
+             "## 개요",
+             f"- run_id: {result.run_id}",
+             f"- role: {result.role}",
+             f"- metrics: {', '.join(result.metrics)}",
+             f"- holdout_ratio: {result.holdout_ratio:.2f}",
+         ]
+         if result.weights:
+             weights = ", ".join(
+                 f"{metric}={weight:.2f}" for metric, weight in result.weights.items()
+             )
+             lines.append(f"- weights: {weights}")
+         if result.metadata:
+             lines.append(f"- metadata: {json.dumps(result.metadata, ensure_ascii=False)}")
+
+         lines.extend(
+             [
+                 "",
+                 "## 후보 순위",
+                 "",
+                 "| Rank | Candidate | Source | Score |",
+                 "| --- | --- | --- | --- |",
+             ]
+         )
+
+         for rank, candidate_id in enumerate(result.ranking, start=1):
+             candidate = next(
+                 (item for item in result.candidates if item.candidate_id == candidate_id), None
+             )
+             score = score_map.get(candidate_id)
+             if candidate is None or score is None:
+                 continue
+             preview = candidate.content.replace("\n", " ")
+             if len(preview) > 80:
+                 preview = preview[:77] + "..."
+             lines.append(
+                 f"| {rank} | {preview} | {candidate.source} | {score.weighted_score:.4f} |"
+             )
+
+         lines.append("")
+         lines.append("## 후보 상세")
+         for candidate in result.candidates:
+             score = score_map.get(candidate.candidate_id)
+             lines.extend(
+                 [
+                     "",
+                     f"### {candidate.candidate_id}",
+                     f"- source: {candidate.source}",
+                     f"- weighted_score: {score.weighted_score:.4f}" if score else "- score: -",
+                 ]
+             )
+             if score:
+                 lines.append(f"- selected_sample_index: {score.selected_sample_index}")
+             if score and score.scores:
+                 lines.append("- metric_scores:")
+                 for metric, value in score.scores.items():
+                     lines.append(f" - {metric}: {value:.4f}")
+             if score and score.sample_scores:
+                 lines.append("- sample_scores:")
+                 for sample in score.sample_scores:
+                     metrics = ", ".join(
+                         f"{metric}={value:.4f}" for metric, value in sample.scores.items()
+                     )
+                     lines.append(
+                         f" - {sample.sample_index}: {sample.weighted_score:.4f} ({metrics})"
+                     )
+                 selected_sample = next(
+                     (
+                         entry
+                         for entry in score.sample_scores
+                         if entry.sample_index == score.selected_sample_index
+                     ),
+                     None,
+                 )
+                 if selected_sample:
+                     lines.append(f"- selected_sample_responses: {len(selected_sample.responses)}")
+                     for response in selected_sample.responses:
+                         question = response.get("question") or ""
+                         answer = response.get("answer") or ""
+                         ground_truth = response.get("ground_truth") or ""
+                         contexts = list(response.get("contexts") or [])
+                         lines.extend(
+                             [
+                                 " - response:",
+                                 f" - test_case_id: {response.get('test_case_id')}",
+                                 f" - question: {question}",
+                                 " - contexts:",
+                             ]
+                         )
+                         for ctx in contexts:
+                             lines.append(f" - {ctx}")
+                         lines.extend(
+                             [
+                                 " - answer:",
+                                 " ```",
+                                 f" {answer}",
+                                 " ```",
+                             ]
+                         )
+                         if ground_truth:
+                             lines.extend(
+                                 [
+                                     " - ground_truth:",
+                                     " ```",
+                                     f" {ground_truth}",
+                                     " ```",
+                                 ]
+                             )
+             if candidate.metadata:
+                 lines.append(f"- metadata: {json.dumps(candidate.metadata, ensure_ascii=False)}")
+             lines.extend(["", "```", candidate.content.strip(), "```"])
+
+         return "\n".join(lines).strip() + "\n"
+
+     def write_outputs(
+         self,
+         *,
+         result: PromptSuggestionResult,
+         output_path: Path,
+         report_path: Path,
+         artifacts_dir: Path,
+         storage: StoragePort | None = None,
+     ) -> None:
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         report_path.parent.mkdir(parents=True, exist_ok=True)
+         artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+         json_payload = self.render_json(result)
+         output_path.write_text(
+             json.dumps(json_payload, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+
+         markdown_text = self.render_markdown(result)
+         report_path.write_text(markdown_text, encoding="utf-8")
+
+         artifacts_index = self._write_artifacts(result, artifacts_dir)
+         index_path = artifacts_dir / "index.json"
+         index_path.write_text(
+             json.dumps(artifacts_index, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+
+         if storage:
+             storage.save_analysis_report(
+                 report_id=None,
+                 run_id=result.run_id,
+                 experiment_id=None,
+                 report_type="prompt_suggestions",
+                 format="markdown",
+                 content=markdown_text,
+                 metadata={
+                     "output_path": str(output_path),
+                     "report_path": str(report_path),
+                     "artifacts_dir": str(artifacts_dir),
+                 },
+             )
+
+     def _serialize_candidate(
+         self,
+         candidate: PromptCandidate,
+         score_map: dict[str, PromptCandidateScore],
+     ) -> dict[str, Any]:
+         payload: dict[str, Any] = {
+             "candidate_id": candidate.candidate_id,
+             "source": candidate.source,
+             "content": candidate.content,
+         }
+         score = score_map.get(candidate.candidate_id)
+         if score:
+             payload["scores"] = dict(score.scores)
+             payload["weighted_score"] = score.weighted_score
+             payload["selected_sample_index"] = score.selected_sample_index
+             if score.sample_scores:
+                 payload["sample_scores"] = [
+                     _serialize_sample_score(entry) for entry in score.sample_scores
+                 ]
+         if candidate.metadata:
+             payload["metadata"] = dict(candidate.metadata)
+         return payload
+
+     def _write_artifacts(
+         self, result: PromptSuggestionResult, artifacts_dir: Path
+     ) -> dict[str, Any]:
+         candidates_payload = [
+             {
+                 "candidate_id": candidate.candidate_id,
+                 "source": candidate.source,
+                 "content": candidate.content,
+                 "metadata": dict(candidate.metadata),
+             }
+             for candidate in result.candidates
+         ]
+         scores_payload = [
+             {
+                 "candidate_id": score.candidate_id,
+                 "scores": dict(score.scores),
+                 "weighted_score": score.weighted_score,
+                 "selected_sample_index": score.selected_sample_index,
+                 "sample_scores": [
+                     _serialize_sample_score(sample) for sample in score.sample_scores
+                 ],
+             }
+             for score in result.scores
+         ]
+         ranking_payload = list(result.ranking)
+
+         candidates_path = artifacts_dir / "candidates.json"
+         scores_path = artifacts_dir / "scores.json"
+         ranking_path = artifacts_dir / "ranking.json"
+
+         candidates_path.write_text(
+             json.dumps(candidates_payload, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+         scores_path.write_text(
+             json.dumps(scores_payload, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+         ranking_path.write_text(
+             json.dumps(ranking_payload, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+
+         return {
+             "dir": str(artifacts_dir),
+             "files": {
+                 "candidates": str(candidates_path),
+                 "scores": str(scores_path),
+                 "ranking": str(ranking_path),
+             },
+         }
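
Note: the new PromptSuggestionReporter above is driven entirely through the public methods shown in this hunk. A minimal usage sketch, assuming a PromptSuggestionResult named result is already available from the prompt-suggestion pipeline; the output paths are placeholders:

from pathlib import Path

from evalvault.domain.services.prompt_suggestion_reporter import PromptSuggestionReporter

reporter = PromptSuggestionReporter()
reporter.write_outputs(
    result=result,  # PromptSuggestionResult produced elsewhere (assumed in scope)
    output_path=Path("reports/prompt_suggestions.json"),  # placeholder path
    report_path=Path("reports/prompt_suggestions.md"),    # placeholder path
    artifacts_dir=Path("reports/artifacts"),              # placeholder path
    storage=None,  # pass a StoragePort implementation to also persist the markdown report
)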

evalvault/domain/services/synthetic_qa_generator.py
@@ -11,6 +11,7 @@ import json
  import logging
  import random
  import re
+ from collections.abc import Callable
  from dataclasses import dataclass, field
  from datetime import datetime
  from typing import TYPE_CHECKING
@@ -130,8 +131,8 @@ class SyntheticQAGenerator:
      - 한국어/영어 지원

      Example:
-         >>> from evalvault.adapters.outbound.llm import OpenAIAdapter
-         >>> llm = OpenAIAdapter()
+         >>> from evalvault.ports.outbound.llm_port import LLMPort
+         >>> llm: LLMPort = ...
          >>> generator = SyntheticQAGenerator(llm)
          >>> dataset = generator.generate(documents, config)
      """
@@ -298,7 +299,7 @@ class SyntheticQAGenerator:
          self,
          documents: list[str],
          config: SyntheticQAConfig,
-         progress_callback: callable = None,
+         progress_callback: Callable[[int, int], None] | None = None,
      ) -> Dataset:
          """Generate synthetic Q&A dataset from documents.

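
Note: the last hunk above narrows the progress hook from a bare callable to Callable[[int, int], None] | None. A minimal sketch of a compatible callback, assuming the hunk belongs to the generate() method shown in the class docstring and that the two integers are progress counters (both assumptions; generator, documents and config are placeholders from the docstring example):

def on_progress(done: int, total: int) -> None:
    # Assumed semantics: `done` items processed out of `total`.
    print(f"synthetic QA progress: {done}/{total}")

dataset = generator.generate(documents, config, progress_callback=on_progress)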

evalvault/ports/inbound/learning_hook_port.py
@@ -20,7 +20,10 @@ class DomainLearningHookPort(Protocol):
      평가 완료 후 호출되어 도메인 메모리를 형성합니다.

      사용 예시:
-         evaluator = RagasEvaluator()
+         settings = Settings()
+         llm_factory = SettingsLLMFactory(settings)
+         korean_toolkit = try_create_korean_toolkit()
+         evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
          hook = InsuranceDomainLearningHook(memory_adapter)

          # 평가 실행

evalvault/ports/outbound/__init__.py
@@ -44,6 +44,7 @@ from evalvault.ports.outbound.korean_nlp_port import (
      RetrieverPort,
      RetrieverResultProtocol,
  )
+ from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
  from evalvault.ports.outbound.llm_port import LLMPort
  from evalvault.ports.outbound.method_port import MethodRuntime, RagMethodPort
  from evalvault.ports.outbound.nlp_analysis_port import NLPAnalysisPort
@@ -82,6 +83,7 @@ __all__ = [
      "PatternDefinitionProtocol",
      "MetricPlaybookProtocol",
      "ClaimImprovementProtocol",
+     "LLMFactoryPort",
      "LLMPort",
      "MethodRuntime",
      "RagMethodPort",

evalvault/ports/outbound/llm_factory_port.py
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from typing import Protocol
+
+ from evalvault.ports.outbound.llm_port import LLMPort
+
+
+ class LLMFactoryPort(Protocol):
+     def create_faithfulness_fallback(
+         self,
+         active_provider: str | None,
+         active_model: str | None,
+     ) -> LLMPort | None: ...
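
Note: LLMFactoryPort is a structural Protocol, so any object exposing a matching create_faithfulness_fallback method satisfies it; the SettingsLLMFactory referenced in the updated DomainLearningHookPort docstring is presumably the shipped implementation (likely in the new adapters/outbound/llm/factory.py). A minimal illustrative stand-in, e.g. for tests, under those assumptions:

from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
from evalvault.ports.outbound.llm_port import LLMPort


class NoFallbackLLMFactory:
    """Hypothetical factory that never supplies a faithfulness fallback model."""

    def create_faithfulness_fallback(
        self,
        active_provider: str | None,
        active_model: str | None,
    ) -> LLMPort | None:
        # Returning None signals that no fallback LLM is available.
        return None


factory: LLMFactoryPort = NoFallbackLLMFactory()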

evalvault/ports/outbound/llm_port.py
@@ -34,6 +34,15 @@ class ThinkingConfig:
          return {"think_level": self.think_level}


+ @dataclass
+ class GenerationOptions:
+     temperature: float | None = None
+     top_p: float | None = None
+     max_tokens: int | None = None
+     n: int | None = None
+     seed: int | None = None
+
+
  class LLMPort(ABC):
      """LLM adapter interface for Ragas metrics evaluation.

@@ -62,6 +71,18 @@ class LLMPort(ABC):
          """
          pass

+     def as_ragas_embeddings(self) -> Any:
+         raise NotImplementedError("as_ragas_embeddings not implemented")
+
+     def get_token_usage(self) -> tuple[int, int, int]:
+         raise NotImplementedError("get_token_usage not implemented")
+
+     def get_and_reset_token_usage(self) -> tuple[int, int, int]:
+         raise NotImplementedError("get_and_reset_token_usage not implemented")
+
+     def reset_token_usage(self) -> None:
+         raise NotImplementedError("reset_token_usage not implemented")
+
      def get_thinking_config(self) -> ThinkingConfig:
          """Get thinking/reasoning configuration for this adapter.

@@ -81,7 +102,12 @@ class LLMPort(ABC):
          """
          return self.get_thinking_config().enabled

-     async def agenerate_text(self, prompt: str) -> str:
+     async def agenerate_text(
+         self,
+         prompt: str,
+         *,
+         options: GenerationOptions | None = None,
+     ) -> str:
          """Generate text from a prompt (async).

          Simple text generation for use cases like report generation,
@@ -98,7 +124,13 @@ class LLMPort(ABC):
          """
          raise NotImplementedError("agenerate_text not implemented")

-     def generate_text(self, prompt: str, *, json_mode: bool = False) -> str:
+     def generate_text(
+         self,
+         prompt: str,
+         *,
+         json_mode: bool = False,
+         options: GenerationOptions | None = None,
+     ) -> str:
          """Generate text from a prompt (sync).

          Simple text generation for use cases like report generation,
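
Note: GenerationOptions gives callers a provider-agnostic way to pass sampling parameters through the new keyword-only options argument on generate_text/agenerate_text. A hedged sketch; the parameter values are illustrative, and llm is assumed to be a concrete LLMPort adapter (OpenAI, Ollama, vLLM, ...), since the base-class methods only raise NotImplementedError:

from evalvault.ports.outbound.llm_port import GenerationOptions

options = GenerationOptions(temperature=0.2, top_p=0.9, max_tokens=512, seed=42)
summary = llm.generate_text("Summarize the latest evaluation run.", options=options)
raw_json = llm.generate_text("Return the metric scores as JSON.", json_mode=True, options=options)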

evalvault/ports/outbound/storage_port.py
@@ -1,5 +1,6 @@
  """결과 저장 인터페이스."""

+ from pathlib import Path
  from typing import Any, Protocol

  from evalvault.domain.entities import (
@@ -11,6 +12,7 @@ from evalvault.domain.entities import (
      SatisfactionFeedback,
  )
  from evalvault.domain.entities.experiment import Experiment
+ from evalvault.domain.entities.stage import StageEvent, StageMetric


  class StoragePort(Protocol):
@@ -34,6 +36,8 @@ class StoragePort(Protocol):
          """Persist prompt set and prompt items."""
          ...

+     def export_run_to_excel(self, run_id: str, output_path: str | Path) -> Path: ...
+
      def link_prompt_set_to_run(self, run_id: str, prompt_set_id: str) -> None:
          """Attach a prompt set to an evaluation run."""
          ...
@@ -78,6 +82,27 @@ class StoragePort(Protocol):
          """
          ...

+     def delete_run(self, run_id: str) -> bool: ...
+
+     def save_stage_events(self, events: list[StageEvent]) -> int: ...
+
+     def save_stage_metrics(self, metrics: list[StageMetric]) -> int: ...
+
+     def list_stage_events(
+         self,
+         run_id: str,
+         *,
+         stage_type: str | None = None,
+     ) -> list[StageEvent]: ...
+
+     def list_stage_metrics(
+         self,
+         run_id: str,
+         *,
+         stage_id: str | None = None,
+         metric_name: str | None = None,
+     ) -> list[StageMetric]: ...
+
      def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None: ...

      def save_run_cluster_map(
@@ -166,6 +191,19 @@ class StoragePort(Protocol):
          """파이프라인 분석 결과 히스토리를 저장합니다."""
          ...

+     def save_analysis_report(
+         self,
+         *,
+         report_id: str | None,
+         run_id: str | None,
+         experiment_id: str | None,
+         report_type: str,
+         format: str,
+         content: str | None,
+         metadata: dict[str, Any] | None = None,
+         created_at: str | None = None,
+     ) -> str: ...
+
      def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
          """파이프라인 분석 결과 목록을 조회합니다."""
          ...
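
Note: the storage adapters touched in this release (base_sql.py, sqlite_adapter.py, postgres_adapter.py) are presumably where these new StoragePort methods are implemented. A hedged caller sketch; storage is assumed to be any StoragePort implementation, and the run id, stage type, paths and metadata values are illustrative:

report_id = storage.save_analysis_report(
    report_id=None,  # None presumably lets the adapter assign an id
    run_id="run-2024-001",
    experiment_id=None,
    report_type="prompt_suggestions",
    format="markdown",
    content=markdown_text,  # e.g. PromptSuggestionReporter.render_markdown(...) output (assumed in scope)
    metadata={"source": "example"},
)
stage_events = storage.list_stage_events("run-2024-001", stage_type="retrieval")
excel_path = storage.export_run_to_excel("run-2024-001", "exports/run-2024-001.xlsx")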