evalvault 1.62.0-py3-none-any.whl → 1.63.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/domain/services/prompt_suggestion_reporter.py (new file)

@@ -0,0 +1,277 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.prompt_suggestion import (
+    PromptCandidate,
+    PromptCandidateSampleScore,
+    PromptCandidateScore,
+    PromptSuggestionResult,
+)
+from evalvault.ports.outbound.storage_port import StoragePort
+
+
+def _serialize_sample_score(sample: PromptCandidateSampleScore) -> dict[str, Any]:
+    return {
+        "sample_index": sample.sample_index,
+        "scores": dict(sample.scores),
+        "weighted_score": sample.weighted_score,
+        "responses": list(sample.responses),
+    }
+
+
+class PromptSuggestionReporter:
+    def render_json(self, result: PromptSuggestionResult) -> dict[str, Any]:
+        score_map = {score.candidate_id: score for score in result.scores}
+        candidates_payload = [
+            self._serialize_candidate(candidate, score_map) for candidate in result.candidates
+        ]
+        return {
+            "run_id": result.run_id,
+            "role": result.role,
+            "metrics": list(result.metrics),
+            "weights": dict(result.weights),
+            "candidates": candidates_payload,
+            "ranking": list(result.ranking),
+            "holdout_ratio": result.holdout_ratio,
+            "metadata": dict(result.metadata),
+        }
+
+    def render_markdown(self, result: PromptSuggestionResult) -> str:
+        score_map = {score.candidate_id: score for score in result.scores}
+        lines = [
+            "# 프롬프트 추천 결과",
+            "",
+            "## 개요",
+            f"- run_id: {result.run_id}",
+            f"- role: {result.role}",
+            f"- metrics: {', '.join(result.metrics)}",
+            f"- holdout_ratio: {result.holdout_ratio:.2f}",
+        ]
+        if result.weights:
+            weights = ", ".join(
+                f"{metric}={weight:.2f}" for metric, weight in result.weights.items()
+            )
+            lines.append(f"- weights: {weights}")
+        if result.metadata:
+            lines.append(f"- metadata: {json.dumps(result.metadata, ensure_ascii=False)}")
+
+        lines.extend(
+            [
+                "",
+                "## 후보 순위",
+                "",
+                "| Rank | Candidate | Source | Score |",
+                "| --- | --- | --- | --- |",
+            ]
+        )
+
+        for rank, candidate_id in enumerate(result.ranking, start=1):
+            candidate = next(
+                (item for item in result.candidates if item.candidate_id == candidate_id), None
+            )
+            score = score_map.get(candidate_id)
+            if candidate is None or score is None:
+                continue
+            preview = candidate.content.replace("\n", " ")
+            if len(preview) > 80:
+                preview = preview[:77] + "..."
+            lines.append(
+                f"| {rank} | {preview} | {candidate.source} | {score.weighted_score:.4f} |"
+            )
+
+        lines.append("")
+        lines.append("## 후보 상세")
+        for candidate in result.candidates:
+            score = score_map.get(candidate.candidate_id)
+            lines.extend(
+                [
+                    "",
+                    f"### {candidate.candidate_id}",
+                    f"- source: {candidate.source}",
+                    f"- weighted_score: {score.weighted_score:.4f}" if score else "- score: -",
+                ]
+            )
+            if score:
+                lines.append(f"- selected_sample_index: {score.selected_sample_index}")
+            if score and score.scores:
+                lines.append("- metric_scores:")
+                for metric, value in score.scores.items():
+                    lines.append(f"  - {metric}: {value:.4f}")
+            if score and score.sample_scores:
+                lines.append("- sample_scores:")
+                for sample in score.sample_scores:
+                    metrics = ", ".join(
+                        f"{metric}={value:.4f}" for metric, value in sample.scores.items()
+                    )
+                    lines.append(
+                        f"  - {sample.sample_index}: {sample.weighted_score:.4f} ({metrics})"
+                    )
+                selected_sample = next(
+                    (
+                        entry
+                        for entry in score.sample_scores
+                        if entry.sample_index == score.selected_sample_index
+                    ),
+                    None,
+                )
+                if selected_sample:
+                    lines.append(f"- selected_sample_responses: {len(selected_sample.responses)}")
+                    for response in selected_sample.responses:
+                        question = response.get("question") or ""
+                        answer = response.get("answer") or ""
+                        ground_truth = response.get("ground_truth") or ""
+                        contexts = list(response.get("contexts") or [])
+                        lines.extend(
+                            [
+                                "  - response:",
+                                f"    - test_case_id: {response.get('test_case_id')}",
+                                f"    - question: {question}",
+                                "    - contexts:",
+                            ]
+                        )
+                        for ctx in contexts:
+                            lines.append(f"      - {ctx}")
+                        lines.extend(
+                            [
+                                "    - answer:",
+                                "      ```",
+                                f"      {answer}",
+                                "      ```",
+                            ]
+                        )
+                        if ground_truth:
+                            lines.extend(
+                                [
+                                    "    - ground_truth:",
+                                    "      ```",
+                                    f"      {ground_truth}",
+                                    "      ```",
+                                ]
+                            )
+            if candidate.metadata:
+                lines.append(f"- metadata: {json.dumps(candidate.metadata, ensure_ascii=False)}")
+            lines.extend(["", "```", candidate.content.strip(), "```"])
+
+        return "\n".join(lines).strip() + "\n"
+
+    def write_outputs(
+        self,
+        *,
+        result: PromptSuggestionResult,
+        output_path: Path,
+        report_path: Path,
+        artifacts_dir: Path,
+        storage: StoragePort | None = None,
+    ) -> None:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+        json_payload = self.render_json(result)
+        output_path.write_text(
+            json.dumps(json_payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        markdown_text = self.render_markdown(result)
+        report_path.write_text(markdown_text, encoding="utf-8")
+
+        artifacts_index = self._write_artifacts(result, artifacts_dir)
+        index_path = artifacts_dir / "index.json"
+        index_path.write_text(
+            json.dumps(artifacts_index, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        if storage:
+            storage.save_analysis_report(
+                report_id=None,
+                run_id=result.run_id,
+                experiment_id=None,
+                report_type="prompt_suggestions",
+                format="markdown",
+                content=markdown_text,
+                metadata={
+                    "output_path": str(output_path),
+                    "report_path": str(report_path),
+                    "artifacts_dir": str(artifacts_dir),
+                },
+            )
+
+    def _serialize_candidate(
+        self,
+        candidate: PromptCandidate,
+        score_map: dict[str, PromptCandidateScore],
+    ) -> dict[str, Any]:
+        payload: dict[str, Any] = {
+            "candidate_id": candidate.candidate_id,
+            "source": candidate.source,
+            "content": candidate.content,
+        }
+        score = score_map.get(candidate.candidate_id)
+        if score:
+            payload["scores"] = dict(score.scores)
+            payload["weighted_score"] = score.weighted_score
+            payload["selected_sample_index"] = score.selected_sample_index
+            if score.sample_scores:
+                payload["sample_scores"] = [
+                    _serialize_sample_score(entry) for entry in score.sample_scores
+                ]
+        if candidate.metadata:
+            payload["metadata"] = dict(candidate.metadata)
+        return payload
+
+    def _write_artifacts(
+        self, result: PromptSuggestionResult, artifacts_dir: Path
+    ) -> dict[str, Any]:
+        candidates_payload = [
+            {
+                "candidate_id": candidate.candidate_id,
+                "source": candidate.source,
+                "content": candidate.content,
+                "metadata": dict(candidate.metadata),
+            }
+            for candidate in result.candidates
+        ]
+        scores_payload = [
+            {
+                "candidate_id": score.candidate_id,
+                "scores": dict(score.scores),
+                "weighted_score": score.weighted_score,
+                "selected_sample_index": score.selected_sample_index,
+                "sample_scores": [
+                    _serialize_sample_score(sample) for sample in score.sample_scores
+                ],
+            }
+            for score in result.scores
+        ]
+        ranking_payload = list(result.ranking)
+
+        candidates_path = artifacts_dir / "candidates.json"
+        scores_path = artifacts_dir / "scores.json"
+        ranking_path = artifacts_dir / "ranking.json"
+
+        candidates_path.write_text(
+            json.dumps(candidates_payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        scores_path.write_text(
+            json.dumps(scores_payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        ranking_path.write_text(
+            json.dumps(ranking_payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        return {
+            "dir": str(artifacts_dir),
+            "files": {
+                "candidates": str(candidates_path),
+                "scores": str(scores_path),
+                "ranking": str(ranking_path),
+            },
+        }
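The new reporter is a pure formatter: it takes an already scored `PromptSuggestionResult` and writes the JSON payload, the Markdown report, and a small artifacts index, optionally persisting the report through `StoragePort.save_analysis_report`. A minimal usage sketch follows; it assumes the `prompt_suggestion` entities are plain dataclasses whose constructors accept the field names referenced above (the entity module itself is not shown in this diff).

```python
# Sketch only: entity constructors are an assumption; field names come from the
# reporter code above (candidate_id, source, content, scores, weighted_score,
# selected_sample_index, sample_scores, run_id, role, metrics, weights,
# candidates, ranking, holdout_ratio, metadata).
from pathlib import Path

from evalvault.domain.entities.prompt_suggestion import (
    PromptCandidate,
    PromptCandidateScore,
    PromptSuggestionResult,
)
from evalvault.domain.services.prompt_suggestion_reporter import PromptSuggestionReporter

candidate = PromptCandidate(
    candidate_id="cand-1",
    source="baseline",
    content="You are a helpful insurance QA assistant...",
    metadata={},
)
score = PromptCandidateScore(
    candidate_id="cand-1",
    scores={"faithfulness": 0.91, "answer_relevancy": 0.84},
    weighted_score=0.875,
    selected_sample_index=0,
    sample_scores=[],
)
result = PromptSuggestionResult(
    run_id="run-123",
    role="system",
    metrics=["faithfulness", "answer_relevancy"],
    weights={"faithfulness": 0.5, "answer_relevancy": 0.5},
    candidates=[candidate],
    scores=[score],
    ranking=["cand-1"],
    holdout_ratio=0.2,
    metadata={},
)

reporter = PromptSuggestionReporter()
out = Path("reports/prompt_suggestions")
# Writes suggestions.json, report.md, and artifacts/{candidates,scores,ranking,index}.json.
reporter.write_outputs(
    result=result,
    output_path=out / "suggestions.json",
    report_path=out / "report.md",
    artifacts_dir=out / "artifacts",
    storage=None,  # pass a StoragePort to also persist a "prompt_suggestions" analysis report
)
```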
evalvault/domain/services/synthetic_qa_generator.py

@@ -11,6 +11,7 @@ import json
 import logging
 import random
 import re
+from collections.abc import Callable
 from dataclasses import dataclass, field
 from datetime import datetime
 from typing import TYPE_CHECKING
@@ -130,8 +131,8 @@ class SyntheticQAGenerator:
     - 한국어/영어 지원
 
     Example:
-        >>> from evalvault.
-        >>> llm =
+        >>> from evalvault.ports.outbound.llm_port import LLMPort
+        >>> llm: LLMPort = ...
         >>> generator = SyntheticQAGenerator(llm)
         >>> dataset = generator.generate(documents, config)
     """
@@ -298,7 +299,7 @@ class SyntheticQAGenerator:
         self,
         documents: list[str],
         config: SyntheticQAConfig,
-        progress_callback:
+        progress_callback: Callable[[int, int], None] | None = None,
    ) -> Dataset:
        """Generate synthetic Q&A dataset from documents.

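`generate()` now takes an optional `progress_callback` typed `Callable[[int, int], None]`. A hedged sketch of passing one in, reusing the `generator`, `documents`, and `config` names from the docstring example above; treating the two integers as (completed, total) is an assumption, not something this hunk states.

```python
# Continues the docstring example above (generator, documents, config already built).
# Assumption: the callback receives (completed_count, total_count).
def on_progress(done: int, total: int) -> None:
    print(f"synthetic QA generation: {done}/{total}")


dataset = generator.generate(documents, config, progress_callback=on_progress)
```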
evalvault/ports/inbound/learning_hook_port.py

@@ -20,7 +20,10 @@ class DomainLearningHookPort(Protocol):
     평가 완료 후 호출되어 도메인 메모리를 형성합니다.
 
     사용 예시:
-
+        settings = Settings()
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
         hook = InsuranceDomainLearningHook(memory_adapter)
 
         # 평가 실행
evalvault/ports/outbound/__init__.py

@@ -44,6 +44,7 @@ from evalvault.ports.outbound.korean_nlp_port import (
     RetrieverPort,
     RetrieverResultProtocol,
 )
+from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
 from evalvault.ports.outbound.llm_port import LLMPort
 from evalvault.ports.outbound.method_port import MethodRuntime, RagMethodPort
 from evalvault.ports.outbound.nlp_analysis_port import NLPAnalysisPort
@@ -82,6 +83,7 @@ __all__ = [
     "PatternDefinitionProtocol",
     "MetricPlaybookProtocol",
     "ClaimImprovementProtocol",
+    "LLMFactoryPort",
     "LLMPort",
     "MethodRuntime",
     "RagMethodPort",
evalvault/ports/outbound/llm_factory_port.py (new file)

@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from typing import Protocol
+
+from evalvault.ports.outbound.llm_port import LLMPort
+
+
+class LLMFactoryPort(Protocol):
+    def create_faithfulness_fallback(
+        self,
+        active_provider: str | None,
+        active_model: str | None,
+    ) -> LLMPort | None: ...
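`LLMFactoryPort` is a structural `Protocol`, so any object exposing a matching `create_faithfulness_fallback` method satisfies it. A minimal conforming sketch is shown below; it stands in for the `SettingsLLMFactory` added in `factory.py` (referenced in the docstring example above but not shown in this diff).

```python
from __future__ import annotations

from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
from evalvault.ports.outbound.llm_port import LLMPort


class NoFallbackLLMFactory:
    """Trivial factory that never provides a faithfulness fallback model."""

    def create_faithfulness_fallback(
        self,
        active_provider: str | None,
        active_model: str | None,
    ) -> LLMPort | None:
        # Returning None means: keep using the currently active LLM.
        return None


factory: LLMFactoryPort = NoFallbackLLMFactory()  # type-checks because the Protocol is structural
```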
evalvault/ports/outbound/llm_port.py

@@ -34,6 +34,15 @@ class ThinkingConfig:
         return {"think_level": self.think_level}
 
 
+@dataclass
+class GenerationOptions:
+    temperature: float | None = None
+    top_p: float | None = None
+    max_tokens: int | None = None
+    n: int | None = None
+    seed: int | None = None
+
+
 class LLMPort(ABC):
     """LLM adapter interface for Ragas metrics evaluation.
 
evalvault/ports/outbound/llm_port.py (continued)

@@ -62,6 +71,18 @@ class LLMPort(ABC):
         """
         pass
 
+    def as_ragas_embeddings(self) -> Any:
+        raise NotImplementedError("as_ragas_embeddings not implemented")
+
+    def get_token_usage(self) -> tuple[int, int, int]:
+        raise NotImplementedError("get_token_usage not implemented")
+
+    def get_and_reset_token_usage(self) -> tuple[int, int, int]:
+        raise NotImplementedError("get_and_reset_token_usage not implemented")
+
+    def reset_token_usage(self) -> None:
+        raise NotImplementedError("reset_token_usage not implemented")
+
     def get_thinking_config(self) -> ThinkingConfig:
         """Get thinking/reasoning configuration for this adapter.
 
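The new token-usage hooks default to `NotImplementedError`, so concrete adapters opt in by overriding them. A hedged sketch of how a caller might aggregate usage per evaluation; the `(prompt, completion, total)` ordering of the tuple is an assumption not spelled out in this hunk.

```python
from evalvault.ports.outbound.llm_port import LLMPort


def consume_token_usage(llm: LLMPort) -> dict[str, int]:
    """Read and reset the adapter's token counters, tolerating adapters
    that do not implement the new hooks."""
    try:
        prompt_tokens, completion_tokens, total_tokens = llm.get_and_reset_token_usage()
    except NotImplementedError:
        return {"prompt": 0, "completion": 0, "total": 0}
    # Assumed ordering: (prompt, completion, total); check the concrete adapter.
    return {
        "prompt": prompt_tokens,
        "completion": completion_tokens,
        "total": total_tokens,
    }
```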
evalvault/ports/outbound/llm_port.py (continued)

@@ -81,7 +102,12 @@ class LLMPort(ABC):
         """
         return self.get_thinking_config().enabled
 
-    async def agenerate_text(
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).
 
         Simple text generation for use cases like report generation,
@@ -98,7 +124,13 @@ class LLMPort(ABC):
         """
         raise NotImplementedError("agenerate_text not implemented")
 
-    def generate_text(
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).
 
         Simple text generation for use cases like report generation,
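Both text-generation entry points now take a keyword-only `options: GenerationOptions | None`. A small sketch of passing sampling parameters through the new dataclass; fields left as `None` are presumably handled by each adapter's own defaults.

```python
from evalvault.ports.outbound.llm_port import GenerationOptions, LLMPort


def summarize(llm: LLMPort, prompt: str) -> str:
    # Only the fields we care about are set; the rest stay None.
    opts = GenerationOptions(temperature=0.2, max_tokens=512, seed=42)
    # json_mode stays False here; set it when the caller expects a JSON object back.
    return llm.generate_text(prompt, options=opts)
```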
evalvault/ports/outbound/storage_port.py

@@ -1,5 +1,6 @@
 """결과 저장 인터페이스."""
 
+from pathlib import Path
 from typing import Any, Protocol
 
 from evalvault.domain.entities import (
@@ -11,6 +12,7 @@ from evalvault.domain.entities import (
     SatisfactionFeedback,
 )
 from evalvault.domain.entities.experiment import Experiment
+from evalvault.domain.entities.stage import StageEvent, StageMetric
 
 
 class StoragePort(Protocol):
@@ -34,6 +36,8 @@ class StoragePort(Protocol):
         """Persist prompt set and prompt items."""
         ...
 
+    def export_run_to_excel(self, run_id: str, output_path: str | Path) -> Path: ...
+
     def link_prompt_set_to_run(self, run_id: str, prompt_set_id: str) -> None:
         """Attach a prompt set to an evaluation run."""
         ...
evalvault/ports/outbound/storage_port.py (continued)

@@ -78,6 +82,27 @@ class StoragePort(Protocol):
         """
         ...
 
+    def delete_run(self, run_id: str) -> bool: ...
+
+    def save_stage_events(self, events: list[StageEvent]) -> int: ...
+
+    def save_stage_metrics(self, metrics: list[StageMetric]) -> int: ...
+
+    def list_stage_events(
+        self,
+        run_id: str,
+        *,
+        stage_type: str | None = None,
+    ) -> list[StageEvent]: ...
+
+    def list_stage_metrics(
+        self,
+        run_id: str,
+        *,
+        stage_id: str | None = None,
+        metric_name: str | None = None,
+    ) -> list[StageMetric]: ...
+
     def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None: ...
 
     def save_run_cluster_map(
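A hedged sketch of reading the new stage telemetry through any `StoragePort` implementation. `StageEvent` and `StageMetric` come from `evalvault/domain/entities/stage.py`, which is not part of this diff, so only the protocol-level read calls are shown; the filter values are illustrative, not documented constants.

```python
from evalvault.ports.outbound.storage_port import StoragePort


def dump_stage_telemetry(storage: StoragePort, run_id: str) -> None:
    # "retrieval" and "latency_ms" are placeholder filter values for illustration.
    events = storage.list_stage_events(run_id, stage_type="retrieval")
    metrics = storage.list_stage_metrics(run_id, metric_name="latency_ms")
    print(f"{len(events)} retrieval events, {len(metrics)} latency samples for {run_id}")
```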
evalvault/ports/outbound/storage_port.py (continued)

@@ -166,6 +191,19 @@
         """파이프라인 분석 결과 히스토리를 저장합니다."""
         ...
 
+    def save_analysis_report(
+        self,
+        *,
+        report_id: str | None,
+        run_id: str | None,
+        experiment_id: str | None,
+        report_type: str,
+        format: str,
+        content: str | None,
+        metadata: dict[str, Any] | None = None,
+        created_at: str | None = None,
+    ) -> str: ...
+
     def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
         """파이프라인 분석 결과 목록을 조회합니다."""
         ...
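`save_analysis_report` is the same persistence hook `PromptSuggestionReporter.write_outputs()` calls when a storage adapter is supplied. A minimal sketch of calling it directly; interpreting the returned `str` as the stored report id is an assumption based on the signature.

```python
from evalvault.ports.outbound.storage_port import StoragePort


def persist_markdown_report(storage: StoragePort, run_id: str, markdown: str) -> str:
    report_id = storage.save_analysis_report(
        report_id=None,  # let the adapter assign an id
        run_id=run_id,
        experiment_id=None,
        report_type="prompt_suggestions",
        format="markdown",
        content=markdown,
        metadata={"source": "example"},
    )
    return report_id
```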