evalvault-1.62.1-py3-none-any.whl → evalvault-1.63.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -157,6 +157,79 @@ class IntentKeywordRegistry:
             "추이",
             "history",
         }
+        self._keywords[AnalysisIntent.ANALYZE_STATISTICAL] = {
+            "통계",
+            "statistical",
+            "statistics",
+            "평균",
+            "mean",
+            "median",
+            "분산",
+            "variance",
+            "표준편차",
+            "std",
+        }
+        self._keywords[AnalysisIntent.ANALYZE_NLP] = {
+            "nlp",
+            "언어",
+            "텍스트",
+            "text",
+            "문장",
+            "sentence",
+            "키워드",
+            "keyword",
+            "토픽",
+            "topic",
+        }
+        self._keywords[AnalysisIntent.ANALYZE_CAUSAL] = {
+            "인과",
+            "causal",
+            "cause",
+            "effect",
+            "원인",
+            "영향",
+            "intervention",
+        }
+        self._keywords[AnalysisIntent.ANALYZE_NETWORK] = {
+            "네트워크",
+            "network",
+            "graph",
+            "그래프",
+            "연결",
+            "centrality",
+        }
+        self._keywords[AnalysisIntent.ANALYZE_PLAYBOOK] = {
+            "playbook",
+            "플레이북",
+            "규칙",
+            "rule",
+            "추천",
+            "recommendation",
+        }
+        self._keywords[AnalysisIntent.DETECT_ANOMALIES] = {
+            "이상",
+            "anomaly",
+            "anomalies",
+            "outlier",
+            "이상치",
+            "detect",
+        }
+        self._keywords[AnalysisIntent.FORECAST_PERFORMANCE] = {
+            "예측",
+            "forecast",
+            "predict",
+            "projection",
+            "미래",
+            "future",
+        }
+        self._keywords[AnalysisIntent.GENERATE_HYPOTHESES] = {
+            "가설",
+            "hypothesis",
+            "hypotheses",
+            "실험",
+            "experiment",
+            "검증",
+        }
         self._keywords[AnalysisIntent.BENCHMARK_RETRIEVAL] = {
             "벤치마크",
             "benchmark",
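
This hunk adds keyword sets for eight new analysis intents (statistical, NLP, causal, network, playbook, anomaly detection, forecasting, hypothesis generation); the +73/-0 count in the file list points at evalvault/domain/services/intent_classifier.py. The diff does not show how IntentKeywordRegistry matches these keywords against a query, so the sketch below only illustrates the general keyword-overlap approach; the enum values mirror the diff, but the function and registry names here are hypothetical.

# Illustrative sketch only: the real matching API is not shown in this diff.
from enum import Enum, auto


class AnalysisIntent(Enum):  # stand-in for evalvault's enum
    ANALYZE_STATISTICAL = auto()
    DETECT_ANOMALIES = auto()


KEYWORDS = {
    AnalysisIntent.ANALYZE_STATISTICAL: {"통계", "statistical", "mean", "variance"},
    AnalysisIntent.DETECT_ANOMALIES: {"이상", "anomaly", "outlier", "detect"},
}


def match_intents(query: str) -> list[AnalysisIntent]:
    """Return every intent whose keyword set overlaps the lowercased query."""
    text = query.lower()
    return [intent for intent, words in KEYWORDS.items() if any(w in text for w in words)]


# match_intents("detect outliers in the latest run") -> [AnalysisIntent.DETECT_ANOMALIES]
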
@@ -81,6 +81,9 @@ class PipelineTemplateRegistry:
         """의도에 대응하는 파이프라인 템플릿 조회."""
         return self._templates.get(intent)
 
+    def list_all(self) -> list[tuple[AnalysisIntent, AnalysisPipeline]]:
+        return list(self._templates.items())
+
     # =========================================================================
     # Verification Templates
     # =========================================================================
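
The new list_all method exposes every registered (intent, pipeline) pair, which makes it easy to enumerate the available analysis templates. A minimal usage sketch; it assumes PipelineTemplateRegistry can be constructed without arguments, which this diff does not confirm.

# Minimal sketch; the no-argument constructor is an assumption.
from evalvault.domain.services.pipeline_template_registry import PipelineTemplateRegistry

registry = PipelineTemplateRegistry()
for intent, pipeline in registry.list_all():
    print(intent, pipeline)
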
@@ -0,0 +1,117 @@
+"""Candidate collection service for prompt suggestions."""
+
+from __future__ import annotations
+
+from dataclasses import replace
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.prompt_suggestion import PromptCandidate
+
+
+class PromptCandidateService:
+    """Build prompt candidates from manual and auto sources."""
+
+    def build_candidates(
+        self,
+        *,
+        base_prompt: str,
+        role: str,
+        metrics: list[str],
+        manual_prompts: list[str],
+        manual_prompt_files: list[Path],
+        auto: bool,
+        auto_count: int,
+        metadata: dict[str, Any] | None = None,
+    ) -> list[PromptCandidate]:
+        base_metadata = metadata or {}
+        candidates: list[PromptCandidate] = []
+        seen: set[str] = set()
+
+        def add_candidate(
+            content: str, *, source: str, extra: dict[str, Any] | None = None
+        ) -> None:
+            normalized = content.strip()
+            if not normalized:
+                return
+            if normalized in seen:
+                return
+            candidate_metadata = {**base_metadata, **(extra or {})}
+            candidates.append(
+                PromptCandidate(
+                    candidate_id="",
+                    source=source,
+                    content=normalized,
+                    metadata=candidate_metadata,
+                )
+            )
+            seen.add(normalized)
+
+        for index, prompt in enumerate(manual_prompts):
+            add_candidate(prompt, source="manual", extra={"manual_index": index})
+
+        for path in manual_prompt_files:
+            for line_number, line in enumerate(self._read_prompt_file(path), start=1):
+                add_candidate(
+                    line,
+                    source="manual",
+                    extra={"file_path": str(path), "file_line": line_number},
+                )
+
+        if auto and auto_count > 0:
+            for name, content in self._build_auto_variants(
+                base_prompt=base_prompt,
+                role=role,
+                metrics=metrics,
+                auto_count=auto_count,
+            ):
+                add_candidate(
+                    content,
+                    source="auto",
+                    extra={"variant": name, "generator": "template"},
+                )
+
+        return [
+            replace(candidate, candidate_id=f"cand-{index:03d}")
+            for index, candidate in enumerate(candidates, start=1)
+        ]
+
+    def _read_prompt_file(self, path: Path) -> list[str]:
+        lines = path.read_text(encoding="utf-8").splitlines()
+        return [line.strip() for line in lines if line.strip()]
+
+    def _build_auto_variants(
+        self,
+        *,
+        base_prompt: str,
+        role: str,
+        metrics: list[str],
+        auto_count: int,
+    ) -> list[tuple[str, str]]:
+        metrics_text = ", ".join(metrics) if metrics else "핵심 지표"
+        base_prompt = base_prompt.strip() or "사용자 요청에 충실히 답변하라."
+        variants = [
+            ("base", base_prompt),
+            (
+                "role_focus",
+                f"{base_prompt}\n\nRole: {role}. 이 역할에 맞는 톤을 유지하라.",
+            ),
+            (
+                "metric_focus",
+                f"{base_prompt}\n\n성과 지표({metrics_text})에 맞춰 응답 품질을 높여라.",
+            ),
+            (
+                "concise",
+                f"{base_prompt}\n\n핵심만 간결하게 답하고 필요한 경우 bullet로 정리하라.",
+            ),
+            (
+                "assumptions",
+                f"{base_prompt}\n\n불확실하면 가정과 전제를 명시하라.",
+            ),
+        ]
+        if auto_count <= len(variants):
+            return variants[:auto_count]
+        extra = []
+        for index in range(len(variants) + 1, auto_count + 1):
+            extra.append((f"variant_{index}", f"{base_prompt}\n\n추가 후보 {index}."))
+        return variants + extra
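
The +117/-0 count in the file list indicates this new-file hunk is evalvault/domain/services/prompt_candidate_service.py. A usage sketch based on the source above; the prompts and metric names are made-up example values, and the import path is inferred from the file list entry.

from evalvault.domain.services.prompt_candidate_service import PromptCandidateService

service = PromptCandidateService()
candidates = service.build_candidates(
    base_prompt="Answer the user's question from the provided contexts.",
    role="analyst",
    metrics=["faithfulness", "answer_relevancy"],
    manual_prompts=["Always cite the source passage."],
    manual_prompt_files=[],  # optionally [Path("prompts.txt")], one prompt per line
    auto=True,
    auto_count=3,
)
# Yields the manual candidate plus the "base", "role_focus", and "metric_focus"
# auto variants, deduplicated and assigned ids cand-001 ... cand-004.
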
@@ -7,7 +7,13 @@ from datetime import datetime
 from hashlib import sha256
 from typing import Any
 
-from evalvault.domain.entities.prompt import
+from evalvault.domain.entities.prompt import (
+    Prompt,
+    PromptKind,
+    PromptSet,
+    PromptSetBundle,
+    PromptSetItem,
+)
 
 
 @dataclass(frozen=True)
@@ -16,7 +22,7 @@ class PromptInput:
 
     content: str
     name: str
-    kind:
+    kind: PromptKind
     role: str
     source: str | None = None
    notes: str | None = None
@@ -99,3 +105,35 @@ def build_prompt_summary(bundle: PromptSetBundle) -> dict[str, Any]:
     if ragas_checksums:
         summary["ragas_prompt_checksums"] = ragas_checksums
     return summary
+
+
+def build_prompt_inputs_from_snapshots(
+    snapshots: dict[str, dict[str, Any]] | None,
+) -> list[PromptInput]:
+    if not snapshots:
+        return []
+    prompt_inputs: list[PromptInput] = []
+    for metric_name, entry in snapshots.items():
+        prompt_text = entry.get("prompt") if isinstance(entry, dict) else None
+        if not isinstance(prompt_text, str):
+            continue
+        prompt_text = prompt_text.strip()
+        if not prompt_text:
+            continue
+        source = entry.get("source") if isinstance(entry, dict) else None
+        metadata = {
+            key: value
+            for key, value in entry.items()
+            if key != "prompt" and isinstance(entry, dict)
+        }
+        prompt_inputs.append(
+            PromptInput(
+                content=prompt_text,
+                name=f"ragas.{metric_name}",
+                kind="ragas",
+                role=str(metric_name),
+                source=source if isinstance(source, str) and source else "ragas",
+                metadata=metadata or None,
+            )
+        )
+    return prompt_inputs
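
The three hunks above (+40/-2 in total) match the prompt_registry.py entry in the file list: the bare import is expanded, PromptInput.kind gains the PromptKind type, and a new build_prompt_inputs_from_snapshots helper turns per-metric Ragas prompt snapshots into PromptInput records. A usage sketch with a made-up snapshot payload; the module path is inferred from the file list.

# Module path inferred from the file list; the snapshot dict is example data.
from evalvault.domain.services.prompt_registry import build_prompt_inputs_from_snapshots

snapshots = {
    "faithfulness": {"prompt": "Judge whether the answer is grounded in the contexts.", "source": "ragas"},
    "answer_relevancy": {"prompt": "   "},  # blank prompts are skipped
}
inputs = build_prompt_inputs_from_snapshots(snapshots)
# -> one PromptInput: name="ragas.faithfulness", kind="ragas", role="faithfulness",
#    source="ragas", metadata={"source": "ragas"}
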
@@ -0,0 +1,286 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import replace
+from typing import Any
+
+from evalvault.domain.entities import (
+    Dataset,
+    EvaluationRun,
+    PromptCandidate,
+    PromptCandidateSampleScore,
+    PromptCandidateScore,
+    TestCase,
+)
+from evalvault.domain.services.evaluator import RagasEvaluator
+from evalvault.ports.outbound.llm_port import GenerationOptions, LLMPort
+
+logger = logging.getLogger(__name__)
+
+_PROMPT_LABELS_KO = {
+    "system": "시스템",
+    "context": "컨텍스트",
+    "question": "질문",
+    "answer": "답변",
+}
+_PROMPT_LABELS_EN = {
+    "system": "System",
+    "context": "Context",
+    "question": "Question",
+    "answer": "Answer",
+}
+
+
+class PromptScoringService:
+    def __init__(self, evaluator: RagasEvaluator, llm: LLMPort) -> None:
+        self._evaluator = evaluator
+        self._llm = llm
+
+    async def score_candidates(
+        self,
+        *,
+        base_run: EvaluationRun,
+        dev_dataset: Dataset,
+        holdout_dataset: Dataset,
+        candidates: list[PromptCandidate],
+        metrics: list[str],
+        weights: dict[str, float],
+        generation_options: GenerationOptions | None = None,
+        selection_policy: str = "best",
+        selection_index: int | None = None,
+        prompt_language: str | None = None,
+    ) -> list[PromptCandidateScore]:
+        if not metrics:
+            raise ValueError("metrics must not be empty")
+        resolved_weights = _resolve_weights(metrics, weights)
+        scoring_dataset = _resolve_scoring_dataset(dev_dataset, holdout_dataset)
+
+        sample_count = _resolve_sample_count(generation_options)
+        resolved_language = _resolve_prompt_language(scoring_dataset, prompt_language)
+
+        scored: list[PromptCandidateScore] = []
+        for candidate in candidates:
+            sample_scores: list[PromptCandidateSampleScore] = []
+            for sample_index in range(sample_count):
+                sample_options = _normalize_generation_options(
+                    generation_options,
+                    sample_index,
+                )
+                generated, responses = await self._generate_candidate_dataset(
+                    dataset=scoring_dataset,
+                    system_prompt=candidate.content,
+                    base_run_id=base_run.run_id,
+                    generation_options=sample_options,
+                    prompt_language=resolved_language,
+                )
+                run = await self._evaluator.evaluate(
+                    dataset=generated,
+                    metrics=metrics,
+                    llm=self._llm,
+                    thresholds=generated.thresholds,
+                    parallel=False,
+                    batch_size=5,
+                )
+                scores = _extract_scores(metrics, run)
+                weighted_score = _weighted_score(scores, resolved_weights)
+                sample_scores.append(
+                    PromptCandidateSampleScore(
+                        sample_index=sample_index,
+                        scores=scores,
+                        weighted_score=weighted_score,
+                        responses=responses,
+                    )
+                )
+
+            selected = _select_sample_score(
+                sample_scores,
+                selection_policy=selection_policy,
+                selection_index=selection_index,
+            )
+            scored.append(
+                PromptCandidateScore(
+                    candidate_id=candidate.candidate_id,
+                    scores=selected.scores,
+                    weighted_score=selected.weighted_score,
+                    sample_scores=sample_scores,
+                    selected_sample_index=selected.sample_index,
+                )
+            )
+        return scored
+
+    async def _generate_candidate_dataset(
+        self,
+        *,
+        dataset: Dataset,
+        system_prompt: str,
+        base_run_id: str,
+        generation_options: GenerationOptions | None,
+        prompt_language: str,
+    ) -> tuple[Dataset, list[dict[str, Any]]]:
+        test_cases: list[TestCase] = []
+        responses: list[dict[str, Any]] = []
+        for test_case in dataset.test_cases:
+            prompt = _build_generation_prompt(
+                system_prompt,
+                test_case,
+                language=prompt_language,
+            )
+            try:
+                answer = await self._llm.agenerate_text(
+                    prompt,
+                    options=generation_options,
+                )
+            except Exception as exc:
+                logger.warning("Prompt candidate generation failed: %s", exc)
+                answer = ""
+            test_cases.append(
+                replace(
+                    test_case,
+                    answer=answer,
+                    metadata={
+                        **(test_case.metadata or {}),
+                        "prompt_candidate": True,
+                    },
+                )
+            )
+            responses.append(
+                {
+                    "test_case_id": test_case.id,
+                    "question": test_case.question,
+                    "answer": answer,
+                    "contexts": list(test_case.contexts or []),
+                    "ground_truth": test_case.ground_truth,
+                }
+            )
+        metadata = dict(dataset.metadata)
+        metadata.setdefault("base_run_id", base_run_id)
+        return (
+            Dataset(
+                name=dataset.name,
+                version=dataset.version,
+                test_cases=test_cases,
+                metadata=metadata,
+                source_file=dataset.source_file,
+                thresholds=dict(dataset.thresholds),
+            ),
+            responses,
+        )
+
+
+def _resolve_prompt_language(dataset: Dataset, prompt_language: str | None) -> str:
+    normalized = _normalize_language_hint(prompt_language)
+    if normalized:
+        return normalized
+    metadata = dataset.metadata if isinstance(dataset.metadata, dict) else {}
+    for key in ("language", "lang", "locale"):
+        normalized = _normalize_language_hint(metadata.get(key))
+        if normalized:
+            return normalized
+    languages = metadata.get("languages")
+    if isinstance(languages, list | tuple | set):
+        for entry in languages:
+            normalized = _normalize_language_hint(entry)
+            if normalized:
+                return normalized
+    return "ko"
+
+
+def _normalize_language_hint(value: Any) -> str | None:
+    if value is None:
+        return None
+    text = str(value).strip().lower()
+    if text in {"ko", "kor", "korean", "ko-kr", "kor-hang", "kr"}:
+        return "ko"
+    if text in {"en", "eng", "english", "en-us", "en-gb"}:
+        return "en"
+    return None
+
+
+def _resolve_sample_count(options: GenerationOptions | None) -> int:
+    if options is None or options.n is None:
+        return 1
+    return max(1, options.n)
+
+
+def _normalize_generation_options(
+    options: GenerationOptions | None,
+    sample_index: int,
+) -> GenerationOptions | None:
+    if options is None:
+        return None
+    seed = options.seed + sample_index if options.seed is not None else None
+    return GenerationOptions(
+        temperature=options.temperature,
+        top_p=options.top_p,
+        max_tokens=options.max_tokens,
+        seed=seed,
+    )
+
+
+def _select_sample_score(
+    sample_scores: list[PromptCandidateSampleScore],
+    *,
+    selection_policy: str,
+    selection_index: int | None,
+) -> PromptCandidateSampleScore:
+    if not sample_scores:
+        raise ValueError("No sample scores available")
+    if selection_policy == "best":
+        return max(sample_scores, key=lambda entry: entry.weighted_score)
+    if selection_policy == "index":
+        if selection_index is None:
+            raise ValueError("selection_index is required for index policy")
+        if selection_index < 0 or selection_index >= len(sample_scores):
+            raise ValueError("selection_index out of range")
+        return sample_scores[selection_index]
+    raise ValueError("Unsupported selection_policy")
+
+
+def _resolve_scoring_dataset(dev_dataset: Dataset, holdout_dataset: Dataset) -> Dataset:
+    if holdout_dataset.test_cases:
+        return holdout_dataset
+    if dev_dataset.test_cases:
+        return dev_dataset
+    raise ValueError("No test cases available for scoring")
+
+
+def _resolve_weights(metrics: list[str], weights: dict[str, float]) -> dict[str, float]:
+    if not weights:
+        base = 1.0 / len(metrics)
+        return dict.fromkeys(metrics, base)
+    resolved = {metric: float(weights.get(metric, 0.0)) for metric in metrics}
+    total = sum(resolved.values())
+    if total <= 0:
+        base = 1.0 / len(metrics)
+        return dict.fromkeys(metrics, base)
+    return {metric: value / total for metric, value in resolved.items()}
+
+
+def _extract_scores(metrics: list[str], run: EvaluationRun) -> dict[str, float]:
+    scores: dict[str, float] = {}
+    for metric in metrics:
+        avg = run.get_avg_score(metric)
+        scores[metric] = float(avg) if avg is not None else 0.0
+    return scores
+
+
+def _weighted_score(scores: dict[str, float], weights: dict[str, float]) -> float:
+    return sum(scores.get(metric, 0.0) * weight for metric, weight in weights.items())
+
+
+def _build_generation_prompt(
+    system_prompt: str,
+    test_case: TestCase,
+    *,
+    language: str,
+) -> str:
+    context_block = (
+        "\n".join(f"- {ctx}" for ctx in test_case.contexts) if test_case.contexts else "-"
+    )
+    labels = _PROMPT_LABELS_EN if language == "en" else _PROMPT_LABELS_KO
+    return (
+        f"[{labels['system']}]\n{system_prompt.strip()}\n\n"
+        f"[{labels['context']}]\n{context_block}\n\n"
+        f"[{labels['question']}]\n{test_case.question.strip()}\n\n"
+        f"[{labels['answer']}]\n"
+    )
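
The +286/-0 count matches the prompt_scoring_service.py entry: each candidate prompt regenerates answers over the holdout (or dev) dataset, the answers are scored by the Ragas evaluator, and per-metric averages are collapsed into a single weighted score; with the default "best" policy the highest-scoring sample per candidate is kept. A standalone arithmetic sketch of the weighting step, re-implemented here because _resolve_weights and _weighted_score are module-private; the metric names and numbers are example values.

# Mirrors _resolve_weights/_weighted_score above; values are example numbers.
def resolve_weights(metrics: list[str], weights: dict[str, float]) -> dict[str, float]:
    if not weights:
        return dict.fromkeys(metrics, 1.0 / len(metrics))
    resolved = {m: float(weights.get(m, 0.0)) for m in metrics}
    total = sum(resolved.values())
    if total <= 0:
        return dict.fromkeys(metrics, 1.0 / len(metrics))
    return {m: v / total for m, v in resolved.items()}


metrics = ["faithfulness", "answer_relevancy"]
weights = resolve_weights(metrics, {"faithfulness": 3.0, "answer_relevancy": 1.0})
# -> {"faithfulness": 0.75, "answer_relevancy": 0.25}

scores = {"faithfulness": 0.8, "answer_relevancy": 0.6}
weighted = sum(scores.get(m, 0.0) * w for m, w in weights.items())
# -> 0.8 * 0.75 + 0.6 * 0.25 = 0.75; per candidate, the sample with the highest
#    weighted score is selected under selection_policy="best".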