evalvault 1.62.0__py3-none-any.whl → 1.63.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/prompts.py

@@ -2,17 +2,28 @@
 
 from __future__ import annotations
 
+import asyncio
 import difflib
 from pathlib import Path
+from typing import Any
 
 import typer
 from rich.console import Console
 from rich.table import Table
 
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
-from evalvault.
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.entities import Dataset, EvaluationRun, PromptSetBundle, TestCase
+from evalvault.domain.services.evaluator import RagasEvaluator
+from evalvault.ports.outbound.llm_port import GenerationOptions
 
+from ..utils.analysis_io import resolve_artifact_dir, resolve_output_paths
+from ..utils.console import print_cli_error, print_cli_warning, progress_spinner
 from ..utils.options import db_option
+from ..utils.validators import parse_csv_option
+from .run_helpers import _is_oss_open_model
 
 
 def _bundle_to_role_map(bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
@@ -31,6 +42,182 @@ def _bundle_to_role_map(bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
     return roles
 
 
+def _require_db_path(console: Console, db_path: Path | None) -> Path:
+    if db_path is None:
+        print_cli_error(
+            console,
+            "DB 경로가 필요합니다.",
+            fixes=["--db 옵션으로 SQLite DB 경로를 지정하세요."],
+        )
+        raise typer.Exit(1)
+    return db_path
+
+
+def _default_role(bundle: PromptSetBundle) -> str | None:
+    for item in bundle.items:
+        if item.role == "system":
+            return item.role
+    return bundle.items[0].role if bundle.items else None
+
+
+def _build_dataset_from_run(run: EvaluationRun, console: Console) -> Dataset:
+    test_cases: list[TestCase] = []
+    skipped = 0
+    for result in run.results:
+        if not result.question or result.answer is None or result.contexts is None:
+            skipped += 1
+            continue
+        test_cases.append(
+            TestCase(
+                id=result.test_case_id,
+                question=result.question,
+                answer=result.answer,
+                contexts=result.contexts,
+                ground_truth=result.ground_truth,
+            )
+        )
+    if skipped:
+        print_cli_warning(
+            console,
+            f"{skipped}개 테스트 케이스에 질문/답변/컨텍스트가 없어 제외했습니다.",
+        )
+    return Dataset(
+        name=run.dataset_name,
+        version=run.dataset_version,
+        test_cases=test_cases,
+        thresholds=dict(run.thresholds),
+    )
+
+
+def _parse_weights(
+    console: Console,
+    weights_raw: str | None,
+    metrics: list[str],
+) -> dict[str, float]:
+    if not metrics:
+        return {}
+    if not weights_raw:
+        base = 1.0 / len(metrics)
+        return dict.fromkeys(metrics, base)
+    entries = parse_csv_option(weights_raw)
+    weights: dict[str, float] = {}
+    for entry in entries:
+        if "=" not in entry:
+            print_cli_error(
+                console,
+                "--weights 형식이 올바르지 않습니다.",
+                fixes=["예: --weights faithfulness=0.5,answer_relevancy=0.5"],
+                details=entry,
+            )
+            raise typer.Exit(1)
+        key, raw_value = entry.split("=", 1)
+        key = key.strip()
+        raw_value = raw_value.strip()
+        try:
+            value = float(raw_value)
+        except ValueError:
+            print_cli_error(
+                console,
+                "--weights 값은 숫자여야 합니다.",
+                details=entry,
+            )
+            raise typer.Exit(1)
+        if value < 0:
+            print_cli_error(
+                console,
+                "--weights 값은 0 이상이어야 합니다.",
+                details=entry,
+            )
+            raise typer.Exit(1)
+        weights[key] = value
+
+    missing = [metric for metric in metrics if metric not in weights]
+    if missing:
+        print_cli_error(
+            console,
+            "--weights에 모든 메트릭을 포함해야 합니다.",
+            fixes=["누락된 메트릭을 추가하거나 --weights 옵션을 제거하세요."],
+            details=", ".join(missing),
+        )
+        raise typer.Exit(1)
+
+    total = sum(weights.values())
+    if total <= 0:
+        print_cli_error(
+            console,
+            "--weights 합계는 0보다 커야 합니다.",
+        )
+        raise typer.Exit(1)
+    return {metric: weights[metric] / total for metric in metrics}
+
+
+def _rank_candidates(scores: list[Any]) -> list[str]:
+    return [
+        score.candidate_id
+        for score in sorted(scores, key=lambda entry: entry.weighted_score, reverse=True)
+    ]
+
+
+def _resolve_llm_config(
+    *,
+    settings: Settings,
+    run_model: str,
+    model_override: str | None,
+    provider_override: str | None,
+    console: Console,
+) -> tuple[Settings, str, str]:
+    resolved_model = model_override or run_model
+    if not resolved_model:
+        print_cli_error(
+            console,
+            "LLM 모델을 결정할 수 없습니다.",
+            fixes=["--model 옵션을 지정하세요."],
+        )
+        raise typer.Exit(1)
+
+    provider = provider_override
+    if "/" in resolved_model:
+        provider, resolved_model = resolved_model.split("/", 1)
+    elif not provider:
+        if "/" in run_model:
+            provider, run_model = run_model.split("/", 1)
+            resolved_model = model_override or run_model
+        else:
+            provider = settings.llm_provider
+
+    if provider is None:
+        provider = settings.llm_provider
+
+    provider = provider.strip().lower()
+    if _is_oss_open_model(resolved_model) and provider != "vllm":
+        provider = "ollama"
+
+    settings.llm_provider = provider
+    if provider == "ollama":
+        settings.ollama_model = resolved_model
+    elif provider == "vllm":
+        settings.vllm_model = resolved_model
+    elif provider == "azure":
+        settings.azure_deployment = resolved_model
+    elif provider == "anthropic":
+        settings.anthropic_model = resolved_model
+    else:
+        settings.openai_model = resolved_model
+
+    if settings.llm_provider == "openai" and not settings.openai_api_key:
+        print_cli_error(
+            console,
+            "OPENAI_API_KEY가 설정되지 않았습니다.",
+            fixes=[
+                ".env 파일 또는 환경 변수에 OPENAI_API_KEY=... 값을 추가하세요.",
+                "--provider ollama 같이 로컬 모델을 사용하세요.",
+            ],
+        )
+        raise typer.Exit(1)
+
+    return settings, provider, resolved_model
+
+
 def create_prompts_app(console: Console) -> typer.Typer:
     """Create the `prompts` sub-application."""
 
@@ -39,10 +226,11 @@ def create_prompts_app(console: Console) -> typer.Typer:
     @app.command("show")
     def show_prompt_set(
         run_id: str = typer.Argument(..., help="Run ID to inspect."),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Show prompt snapshots attached to a run."""
-
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
         bundle = storage.get_prompt_set_for_run(run_id)
         if not bundle:
             console.print("[yellow]No prompt set found for this run.[/yellow]")
@@ -72,7 +260,7 @@ def create_prompts_app(console: Console) -> typer.Typer:
     def diff_prompt_sets(
         run_id_a: str = typer.Argument(..., help="Base run ID."),
        run_id_b: str = typer.Argument(..., help="Target run ID."),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
         max_lines: int = typer.Option(
             40,
             "--max-lines",
@@ -85,7 +273,8 @@ def create_prompts_app(console: Console) -> typer.Typer:
         ),
     ) -> None:
         """Compare prompt snapshots between two runs."""
-
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
         bundle_a = storage.get_prompt_set_for_run(run_id_a)
         bundle_b = storage.get_prompt_set_for_run(run_id_b)
 
@@ -107,7 +296,11 @@ def create_prompts_app(console: Console) -> typer.Typer:
         for role in all_roles:
             a = roles_a.get(role)
             b = roles_b.get(role)
+            if not a and not b:
+                continue
             if not a:
+                if not b:
+                    continue
                 table.add_row(role, "-", b["checksum"][:12], "[yellow]missing[/yellow]")
                 continue
             if not b:
@@ -151,6 +344,421 @@ def create_prompts_app(console: Console) -> typer.Typer:
                 console.print("[dim]... diff truncated ...[/dim]")
         console.print()
 
+    @app.command("suggest")
+    def suggest_prompt_candidates(
+        run_id: str = typer.Argument(..., help="Run ID to analyze."),
+        role: str | None = typer.Option(
+            None,
+            "--role",
+            help="Prompt role to improve (system or metric name).",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metrics",
+            "-m",
+            help="Comma-separated list of metrics to score (default: run metrics).",
+        ),
+        model: str | None = typer.Option(
+            None,
+            "--model",
+            help="Override LLM model for regeneration/scoring.",
+        ),
+        provider: str | None = typer.Option(
+            None,
+            "--provider",
+            help="Override LLM provider (openai|ollama|vllm|azure|anthropic).",
+        ),
+        temperature: float | None = typer.Option(
+            None,
+            "--temperature",
+            help="Sampling temperature for regeneration.",
+        ),
+        top_p: float | None = typer.Option(
+            None,
+            "--top-p",
+            help="Nucleus sampling top-p for regeneration.",
+        ),
+        max_tokens: int | None = typer.Option(
+            None,
+            "--max-tokens",
+            help="Max completion tokens for regeneration.",
+        ),
+        generation_n: int | None = typer.Option(
+            None,
+            "--generation-n",
+            help="Number of samples per regeneration.",
+        ),
+        generation_seed: int | None = typer.Option(
+            None,
+            "--generation-seed",
+            help="Seed for regeneration sampling.",
+        ),
+        selection_policy: str = typer.Option(
+            "best",
+            "--selection-policy",
+            help="Sample selection policy (best|index).",
+        ),
+        selection_index: int | None = typer.Option(
+            None,
+            "--selection-index",
+            help="Sample index when using selection-policy=index.",
+        ),
+        weights: str | None = typer.Option(
+            None,
+            "--weights",
+            help="Comma-separated metric weights (e.g. faithfulness=0.5,answer_relevancy=0.5).",
+        ),
+        candidates: int = typer.Option(
+            5,
+            "--candidates",
+            help="Number of auto-generated candidates (default: 5).",
+        ),
+        manual_prompts: list[str] = typer.Option(
+            [],
+            "--prompt",
+            help="Manual prompt candidate (repeatable).",
+            show_default=False,
+        ),
+        manual_prompt_files: list[Path] = typer.Option(
+            [],
+            "--prompt-file",
+            help="Manual prompt candidate file (repeatable).",
+            exists=True,
+            readable=True,
+            show_default=False,
+        ),
+        auto: bool = typer.Option(
+            True,
+            "--auto/--no-auto",
+            help="Enable auto candidate generation.",
+        ),
+        holdout_ratio: float = typer.Option(
+            0.2,
+            "--holdout-ratio",
+            help="Holdout ratio for scoring (default: 0.2).",
+        ),
+        seed: int | None = typer.Option(
+            None,
+            "--seed",
+            help="Random seed for holdout split.",
+        ),
+        output_path: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output JSON path.",
+        ),
+        report_path: Path | None = typer.Option(
+            None,
+            "--report",
+            help="Markdown report path.",
+        ),
+        analysis_dir: Path | None = typer.Option(
+            None,
+            "--analysis-dir",
+            help="Base directory for analysis outputs.",
+        ),
+        db_path: Path | None = db_option(help_text="Path to database file."),
+    ) -> None:
+        """Suggest prompt improvements by scoring candidate prompts."""
+
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
+
+        try:
+            run = storage.get_run(run_id)
+        except KeyError as exc:
+            print_cli_error(
+                console,
+                "Run을 찾지 못했습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(1)
+
+        bundle = storage.get_prompt_set_for_run(run_id)
+        if not bundle:
+            print_cli_error(
+                console,
+                "이 run에 연결된 프롬프트 스냅샷이 없습니다.",
+                fixes=["`evalvault run` 실행 시 --db 옵션을 지정했는지 확인하세요."],
+            )
+            raise typer.Exit(1)
+
+        roles = _bundle_to_role_map(bundle)
+        resolved_role = role or _default_role(bundle)
+        if not resolved_role:
+            print_cli_error(
+                console,
+                "프롬프트 role을 결정할 수 없습니다.",
+            )
+            raise typer.Exit(1)
+        if resolved_role not in roles:
+            print_cli_error(
+                console,
+                "지정한 role의 프롬프트를 찾을 수 없습니다.",
+                details=resolved_role,
+                fixes=[f"사용 가능한 role: {', '.join(sorted(roles))}"],
+            )
+            raise typer.Exit(1)
+
+        metric_list = parse_csv_option(metrics) or list(run.metrics_evaluated)
+        if not metric_list:
+            print_cli_error(
+                console,
+                "평가 메트릭이 없습니다.",
+                fixes=["--metrics 옵션을 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        if candidates <= 0:
+            print_cli_error(
+                console,
+                "--candidates 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            print_cli_error(
+                console,
+                "--holdout-ratio 값은 0과 1 사이여야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        if not auto and not manual_prompts and not manual_prompt_files:
+            print_cli_error(
+                console,
+                "자동 후보를 끌 경우 수동 후보가 필요합니다.",
+                fixes=["--prompt 또는 --prompt-file을 추가하세요."],
+            )
+            raise typer.Exit(1)
+
+        if temperature is not None and temperature < 0:
+            print_cli_error(
+                console,
+                "--temperature 값은 0 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if top_p is not None and (top_p <= 0 or top_p > 1):
+            print_cli_error(
+                console,
+                "--top-p 값은 0보다 크고 1 이하여야 합니다.",
+            )
+            raise typer.Exit(1)
+        if max_tokens is not None and max_tokens <= 0:
+            print_cli_error(
+                console,
+                "--max-tokens 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if generation_n is not None and generation_n <= 0:
+            print_cli_error(
+                console,
+                "--generation-n 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if generation_seed is not None and generation_seed < 0:
+            print_cli_error(
+                console,
+                "--generation-seed 값은 0 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        selection_policy = selection_policy.strip().lower()
+        if selection_policy not in {"best", "index"}:
+            print_cli_error(
+                console,
+                "--selection-policy 값이 올바르지 않습니다.",
+                fixes=["best 또는 index로 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        sample_count = generation_n or 1
+        if selection_policy == "index":
+            if selection_index is None:
+                print_cli_error(
+                    console,
+                    "--selection-index 값이 필요합니다.",
+                )
+                raise typer.Exit(1)
+            if selection_index < 0 or selection_index >= sample_count:
+                print_cli_error(
+                    console,
+                    "--selection-index 값이 범위를 벗어났습니다.",
+                    fixes=[f"0부터 {sample_count - 1} 사이로 지정하세요."],
+                )
+                raise typer.Exit(1)
+        elif selection_index is not None:
+            print_cli_error(
+                console,
+                "--selection-index는 selection-policy=index에서만 사용됩니다.",
+            )
+            raise typer.Exit(1)
+
+        weights_map = _parse_weights(console, weights, metric_list)
+        base_prompt = roles[resolved_role]["content"]
+
+        dataset = _build_dataset_from_run(run, console)
+        if not dataset.test_cases:
+            print_cli_error(
+                console,
+                "평가 데이터셋이 비어 있어 추천을 생성할 수 없습니다.",
+            )
+            raise typer.Exit(1)
+
+        try:
+            from evalvault.domain.entities.prompt_suggestion import PromptSuggestionResult
+            from evalvault.domain.services.holdout_splitter import split_dataset_holdout
+            from evalvault.domain.services.prompt_candidate_service import PromptCandidateService
+            from evalvault.domain.services.prompt_scoring_service import PromptScoringService
+            from evalvault.domain.services.prompt_suggestion_reporter import (
+                PromptSuggestionReporter,
+            )
+        except ModuleNotFoundError as exc:
+            print_cli_error(
+                console,
+                "프롬프트 추천 모듈을 찾을 수 없습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(1)
+
+        settings = Settings()
+        if settings.evalvault_profile:
+            settings = apply_profile(settings, settings.evalvault_profile)
+
+        settings, resolved_provider, resolved_model = _resolve_llm_config(
+            settings=settings,
+            run_model=run.model_name,
+            model_override=model,
+            provider_override=provider,
+            console=console,
+        )
+
+        llm_adapter = get_llm_adapter(settings)
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
+        generation_options = GenerationOptions(
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            n=generation_n,
+            seed=generation_seed,
+        )
+
+        dev_dataset, holdout_dataset = split_dataset_holdout(
+            dataset=dataset,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+        )
+
+        prefix = f"prompt_suggestions_{run_id}"
+        output_path, report_path = resolve_output_paths(
+            base_dir=analysis_dir,
+            output_path=output_path,
+            report_path=report_path,
+            prefix=prefix,
+        )
+        artifacts_dir = resolve_artifact_dir(
+            base_dir=analysis_dir,
+            output_path=output_path,
+            report_path=report_path,
+            prefix=prefix,
+        )
+
+        candidate_service = PromptCandidateService()
+        scoring_service = PromptScoringService(evaluator=evaluator, llm=llm_adapter)
+        reporter = PromptSuggestionReporter()
+
+        with progress_spinner(console, "후보 생성 중...") as update:
+            candidates_list = candidate_service.build_candidates(
+                base_prompt=base_prompt,
+                role=resolved_role,
+                metrics=metric_list,
+                manual_prompts=list(manual_prompts),
+                manual_prompt_files=list(manual_prompt_files),
+                auto=auto,
+                auto_count=candidates,
+                metadata={"run_id": run_id},
+            )
+            if not candidates_list:
+                print_cli_error(
+                    console,
+                    "후보 프롬프트가 생성되지 않았습니다.",
+                )
+                raise typer.Exit(1)
+
+            update("후보 평가 중...")
+            scores = asyncio.run(
+                scoring_service.score_candidates(
+                    base_run=run,
+                    dev_dataset=dev_dataset,
+                    holdout_dataset=holdout_dataset,
+                    candidates=candidates_list,
+                    metrics=metric_list,
+                    weights=weights_map,
+                    generation_options=generation_options,
+                    selection_policy=selection_policy,
+                    selection_index=selection_index,
+                )
+            )
+
+            ranking = _rank_candidates(scores)
+            result = PromptSuggestionResult(
+                run_id=run_id,
+                role=resolved_role,
+                metrics=metric_list,
+                weights=weights_map,
+                candidates=candidates_list,
+                scores=scores,
+                ranking=ranking,
+                holdout_ratio=holdout_ratio,
+                metadata={
+                    "seed": seed,
+                    "model": resolved_model,
+                    "provider": resolved_provider,
+                    "temperature": temperature,
+                    "top_p": top_p,
+                    "max_tokens": max_tokens,
+                    "generation_n": generation_n,
+                    "generation_seed": generation_seed,
+                    "selection_policy": selection_policy,
+                    "selection_index": selection_index,
+                },
+            )
+
+            update("결과 저장 중...")
+            reporter.write_outputs(
+                result=result,
+                output_path=output_path,
+                report_path=report_path,
+                artifacts_dir=artifacts_dir,
+                storage=storage,
+            )
+
+        score_map = {score.candidate_id: score for score in scores}
+        candidate_map = {candidate.candidate_id: candidate for candidate in candidates_list}
+        console.print("\n[bold]추천 결과[/bold]")
+        table = Table(show_header=True, header_style="bold cyan")
+        table.add_column("Rank", justify="right")
+        table.add_column("Candidate", style="dim")
+        table.add_column("Source")
+        table.add_column("Score", justify="right")
+        for idx, candidate_id in enumerate(ranking[:5], start=1):
+            candidate = candidate_map.get(candidate_id)
+            score = score_map.get(candidate_id)
+            if not candidate or not score:
+                continue
+            preview = candidate.content.replace("\n", " ")[:60]
+            table.add_row(
+                str(idx),
+                preview,
+                candidate.source,
+                f"{score.weighted_score:.4f}",
+            )
+        console.print(table)
+        console.print(f"\n[green]JSON[/green] {output_path}")
+        console.print(f"[green]Report[/green] {report_path}\n")
+
     return app
 
 
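For context, the new `suggest` command normalizes per-metric weights from the `--weights` option and ranks candidate prompts by their weighted score, as implemented in `_parse_weights` and `_rank_candidates` above. An invocation would presumably look like `evalvault prompts suggest <run-id> --db eval.db --weights faithfulness=0.5,answer_relevancy=0.5` (the `evalvault` entry-point name and `prompts` mount point are assumptions here). The snippet below is a minimal standalone sketch of that weighting and ranking logic; it does not import evalvault, and the `CandidateScore` class is a hypothetical stand-in for whatever score objects the package actually produces.

```python
from dataclasses import dataclass


@dataclass
class CandidateScore:
    # Hypothetical stand-in: the diff only shows that score entries expose
    # `candidate_id` and `weighted_score`.
    candidate_id: str
    weighted_score: float


def parse_weights(weights_raw: str, metrics: list[str]) -> dict[str, float]:
    """Parse 'metric=value' pairs and normalize them so they sum to 1.0."""
    weights: dict[str, float] = {}
    for entry in weights_raw.split(","):
        key, _, raw_value = entry.partition("=")
        weights[key.strip()] = float(raw_value)
    missing = [m for m in metrics if m not in weights]
    if missing:
        raise ValueError(f"missing weights for: {', '.join(missing)}")
    total = sum(weights.values())
    if total <= 0:
        raise ValueError("weights must sum to a positive value")
    return {m: weights[m] / total for m in metrics}


def rank_candidates(scores: list[CandidateScore]) -> list[str]:
    """Order candidate IDs by weighted score, best first."""
    return [
        s.candidate_id
        for s in sorted(scores, key=lambda s: s.weighted_score, reverse=True)
    ]


if __name__ == "__main__":
    weights = parse_weights(
        "faithfulness=0.5,answer_relevancy=0.5",
        ["faithfulness", "answer_relevancy"],
    )
    scores = [CandidateScore("cand-1", 0.71), CandidateScore("cand-2", 0.84)]
    print(weights)                  # {'faithfulness': 0.5, 'answer_relevancy': 0.5}
    print(rank_candidates(scores))  # ['cand-2', 'cand-1']
```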