evalvault-1.62.1-py3-none-any.whl → evalvault-1.63.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +88 -5
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
  10. evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
  11. evalvault/adapters/outbound/llm/__init__.py +5 -43
  12. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  13. evalvault/adapters/outbound/llm/factory.py +103 -0
  14. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  15. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  16. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  17. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  18. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  19. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  20. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  21. evalvault/adapters/outbound/storage/base_sql.py +528 -21
  22. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  23. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  24. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  25. evalvault/debug_ragas.py +7 -1
  26. evalvault/debug_ragas_real.py +5 -1
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/prompt_suggestion.py +50 -0
  29. evalvault/domain/services/__init__.py +6 -0
  30. evalvault/domain/services/evaluator.py +191 -103
  31. evalvault/domain/services/holdout_splitter.py +67 -0
  32. evalvault/domain/services/intent_classifier.py +73 -0
  33. evalvault/domain/services/pipeline_template_registry.py +3 -0
  34. evalvault/domain/services/prompt_candidate_service.py +117 -0
  35. evalvault/domain/services/prompt_registry.py +40 -2
  36. evalvault/domain/services/prompt_scoring_service.py +286 -0
  37. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  38. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  39. evalvault/ports/inbound/learning_hook_port.py +4 -1
  40. evalvault/ports/outbound/__init__.py +2 -0
  41. evalvault/ports/outbound/llm_factory_port.py +13 -0
  42. evalvault/ports/outbound/llm_port.py +34 -2
  43. evalvault/ports/outbound/storage_port.py +38 -0
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
  47. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
  48. {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/prompts.py

@@ -2,17 +2,28 @@
 
 from __future__ import annotations
 
+import asyncio
 import difflib
 from pathlib import Path
+from typing import Any
 
 import typer
 from rich.console import Console
 from rich.table import Table
 
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
-from evalvault.domain.entities import PromptSetBundle
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.entities import Dataset, EvaluationRun, PromptSetBundle, TestCase
+from evalvault.domain.services.evaluator import RagasEvaluator
+from evalvault.ports.outbound.llm_port import GenerationOptions
 
+from ..utils.analysis_io import resolve_artifact_dir, resolve_output_paths
+from ..utils.console import print_cli_error, print_cli_warning, progress_spinner
 from ..utils.options import db_option
+from ..utils.validators import parse_csv_option
+from .run_helpers import _is_oss_open_model
 
 
 def _bundle_to_role_map(bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
@@ -31,6 +42,182 @@ def _bundle_to_role_map(bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
     return roles
 
 
+def _require_db_path(console: Console, db_path: Path | None) -> Path:
+    if db_path is None:
+        print_cli_error(
+            console,
+            "DB 경로가 필요합니다.",
+            fixes=["--db 옵션으로 SQLite DB 경로를 지정하세요."],
+        )
+        raise typer.Exit(1)
+    return db_path
+
+
+def _default_role(bundle: PromptSetBundle) -> str | None:
+    for item in bundle.items:
+        if item.role == "system":
+            return item.role
+    return bundle.items[0].role if bundle.items else None
+
+
+def _build_dataset_from_run(run: EvaluationRun, console: Console) -> Dataset:
+    test_cases: list[TestCase] = []
+    skipped = 0
+    for result in run.results:
+        if not result.question or result.answer is None or result.contexts is None:
+            skipped += 1
+            continue
+        test_cases.append(
+            TestCase(
+                id=result.test_case_id,
+                question=result.question,
+                answer=result.answer,
+                contexts=result.contexts,
+                ground_truth=result.ground_truth,
+            )
+        )
+    if skipped:
+        print_cli_warning(
+            console,
+            f"{skipped}개 테스트 케이스에 질문/답변/컨텍스트가 없어 제외했습니다.",
+        )
+    return Dataset(
+        name=run.dataset_name,
+        version=run.dataset_version,
+        test_cases=test_cases,
+        thresholds=dict(run.thresholds),
+    )
+
+
+def _parse_weights(
+    console: Console,
+    weights_raw: str | None,
+    metrics: list[str],
+) -> dict[str, float]:
+    if not metrics:
+        return {}
+    if not weights_raw:
+        base = 1.0 / len(metrics)
+        return dict.fromkeys(metrics, base)
+    entries = parse_csv_option(weights_raw)
+    weights: dict[str, float] = {}
+    for entry in entries:
+        if "=" not in entry:
+            print_cli_error(
+                console,
+                "--weights 형식이 올바르지 않습니다.",
+                fixes=["예: --weights faithfulness=0.5,answer_relevancy=0.5"],
+                details=entry,
+            )
+            raise typer.Exit(1)
+        key, raw_value = entry.split("=", 1)
+        key = key.strip()
+        raw_value = raw_value.strip()
+        try:
+            value = float(raw_value)
+        except ValueError:
+            print_cli_error(
+                console,
+                "--weights 값은 숫자여야 합니다.",
+                details=entry,
+            )
+            raise typer.Exit(1)
+        if value < 0:
+            print_cli_error(
+                console,
+                "--weights 값은 0 이상이어야 합니다.",
+                details=entry,
+            )
+            raise typer.Exit(1)
+        weights[key] = value
+
+    missing = [metric for metric in metrics if metric not in weights]
+    if missing:
+        print_cli_error(
+            console,
+            "--weights에 모든 메트릭을 포함해야 합니다.",
+            fixes=["누락된 메트릭을 추가하거나 --weights 옵션을 제거하세요."],
+            details=", ".join(missing),
+        )
+        raise typer.Exit(1)
+
+    total = sum(weights.values())
+    if total <= 0:
+        print_cli_error(
+            console,
+            "--weights 합계는 0보다 커야 합니다.",
+        )
+        raise typer.Exit(1)
+    return {metric: weights[metric] / total for metric in metrics}
+
+
+def _rank_candidates(scores: list[Any]) -> list[str]:
+    return [
+        score.candidate_id
+        for score in sorted(scores, key=lambda entry: entry.weighted_score, reverse=True)
+    ]
+
+
+def _resolve_llm_config(
+    *,
+    settings: Settings,
+    run_model: str,
+    model_override: str | None,
+    provider_override: str | None,
+    console: Console,
+) -> tuple[Settings, str, str]:
+    resolved_model = model_override or run_model
+    if not resolved_model:
+        print_cli_error(
+            console,
+            "LLM 모델을 결정할 수 없습니다.",
+            fixes=["--model 옵션을 지정하세요."],
+        )
+        raise typer.Exit(1)
+
+    provider = provider_override
+    if "/" in resolved_model:
+        provider, resolved_model = resolved_model.split("/", 1)
+    elif not provider:
+        if "/" in run_model:
+            provider, run_model = run_model.split("/", 1)
+            resolved_model = model_override or run_model
+        else:
+            provider = settings.llm_provider
+
+    if provider is None:
+        provider = settings.llm_provider
+
+    provider = provider.strip().lower()
+    if _is_oss_open_model(resolved_model) and provider != "vllm":
+        provider = "ollama"
+
+    settings.llm_provider = provider
+    if provider == "ollama":
+        settings.ollama_model = resolved_model
+    elif provider == "vllm":
+        settings.vllm_model = resolved_model
+    elif provider == "azure":
+        settings.azure_deployment = resolved_model
+    elif provider == "anthropic":
+        settings.anthropic_model = resolved_model
+    else:
+        settings.openai_model = resolved_model
+
+    if settings.llm_provider == "openai" and not settings.openai_api_key:
+        print_cli_error(
+            console,
+            "OPENAI_API_KEY가 설정되지 않았습니다.",
+            fixes=[
+                ".env 파일 또는 환경 변수에 OPENAI_API_KEY=... 값을 추가하세요.",
+                "--provider ollama 같이 로컬 모델을 사용하세요.",
+            ],
+        )
+        raise typer.Exit(1)
+
+    return settings, provider, resolved_model
+
+
 def create_prompts_app(console: Console) -> typer.Typer:
     """Create the `prompts` sub-application."""
 
@@ -39,10 +226,11 @@ def create_prompts_app(console: Console) -> typer.Typer:
     @app.command("show")
     def show_prompt_set(
         run_id: str = typer.Argument(..., help="Run ID to inspect."),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Show prompt snapshots attached to a run."""
-        storage = SQLiteStorageAdapter(db_path=db_path)
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
         bundle = storage.get_prompt_set_for_run(run_id)
         if not bundle:
             console.print("[yellow]No prompt set found for this run.[/yellow]")
@@ -72,7 +260,7 @@ def create_prompts_app(console: Console) -> typer.Typer:
     def diff_prompt_sets(
         run_id_a: str = typer.Argument(..., help="Base run ID."),
         run_id_b: str = typer.Argument(..., help="Target run ID."),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
         max_lines: int = typer.Option(
             40,
             "--max-lines",
@@ -85,7 +273,8 @@ def create_prompts_app(console: Console) -> typer.Typer:
         ),
     ) -> None:
         """Compare prompt snapshots between two runs."""
-        storage = SQLiteStorageAdapter(db_path=db_path)
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
         bundle_a = storage.get_prompt_set_for_run(run_id_a)
         bundle_b = storage.get_prompt_set_for_run(run_id_b)
 
@@ -107,7 +296,11 @@ def create_prompts_app(console: Console) -> typer.Typer:
         for role in all_roles:
             a = roles_a.get(role)
             b = roles_b.get(role)
+            if not a and not b:
+                continue
             if not a:
+                if not b:
+                    continue
                 table.add_row(role, "-", b["checksum"][:12], "[yellow]missing[/yellow]")
                 continue
             if not b:
@@ -151,6 +344,421 @@ def create_prompts_app(console: Console) -> typer.Typer:
                 console.print("[dim]... diff truncated ...[/dim]")
             console.print()
 
+    @app.command("suggest")
+    def suggest_prompt_candidates(
+        run_id: str = typer.Argument(..., help="Run ID to analyze."),
+        role: str | None = typer.Option(
+            None,
+            "--role",
+            help="Prompt role to improve (system or metric name).",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metrics",
+            "-m",
+            help="Comma-separated list of metrics to score (default: run metrics).",
+        ),
+        model: str | None = typer.Option(
+            None,
+            "--model",
+            help="Override LLM model for regeneration/scoring.",
+        ),
+        provider: str | None = typer.Option(
+            None,
+            "--provider",
+            help="Override LLM provider (openai|ollama|vllm|azure|anthropic).",
+        ),
+        temperature: float | None = typer.Option(
+            None,
+            "--temperature",
+            help="Sampling temperature for regeneration.",
+        ),
+        top_p: float | None = typer.Option(
+            None,
+            "--top-p",
+            help="Nucleus sampling top-p for regeneration.",
+        ),
+        max_tokens: int | None = typer.Option(
+            None,
+            "--max-tokens",
+            help="Max completion tokens for regeneration.",
+        ),
+        generation_n: int | None = typer.Option(
+            None,
+            "--generation-n",
+            help="Number of samples per regeneration.",
+        ),
+        generation_seed: int | None = typer.Option(
+            None,
+            "--generation-seed",
+            help="Seed for regeneration sampling.",
+        ),
+        selection_policy: str = typer.Option(
+            "best",
+            "--selection-policy",
+            help="Sample selection policy (best|index).",
+        ),
+        selection_index: int | None = typer.Option(
+            None,
+            "--selection-index",
+            help="Sample index when using selection-policy=index.",
+        ),
+        weights: str | None = typer.Option(
+            None,
+            "--weights",
+            help="Comma-separated metric weights (e.g. faithfulness=0.5,answer_relevancy=0.5).",
+        ),
+        candidates: int = typer.Option(
+            5,
+            "--candidates",
+            help="Number of auto-generated candidates (default: 5).",
+        ),
+        manual_prompts: list[str] = typer.Option(
+            [],
+            "--prompt",
+            help="Manual prompt candidate (repeatable).",
+            show_default=False,
+        ),
+        manual_prompt_files: list[Path] = typer.Option(
+            [],
+            "--prompt-file",
+            help="Manual prompt candidate file (repeatable).",
+            exists=True,
+            readable=True,
+            show_default=False,
+        ),
+        auto: bool = typer.Option(
+            True,
+            "--auto/--no-auto",
+            help="Enable auto candidate generation.",
+        ),
+        holdout_ratio: float = typer.Option(
+            0.2,
+            "--holdout-ratio",
+            help="Holdout ratio for scoring (default: 0.2).",
+        ),
+        seed: int | None = typer.Option(
+            None,
+            "--seed",
+            help="Random seed for holdout split.",
+        ),
+        output_path: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output JSON path.",
+        ),
+        report_path: Path | None = typer.Option(
+            None,
+            "--report",
+            help="Markdown report path.",
+        ),
+        analysis_dir: Path | None = typer.Option(
+            None,
+            "--analysis-dir",
+            help="Base directory for analysis outputs.",
+        ),
+        db_path: Path | None = db_option(help_text="Path to database file."),
+    ) -> None:
+        """Suggest prompt improvements by scoring candidate prompts."""
+
+        resolved_db = _require_db_path(console, db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db)
+
+        try:
+            run = storage.get_run(run_id)
+        except KeyError as exc:
+            print_cli_error(
+                console,
+                "Run을 찾지 못했습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(1)
+
+        bundle = storage.get_prompt_set_for_run(run_id)
+        if not bundle:
+            print_cli_error(
+                console,
+                "이 run에 연결된 프롬프트 스냅샷이 없습니다.",
+                fixes=["`evalvault run` 실행 시 --db 옵션을 지정했는지 확인하세요."],
+            )
+            raise typer.Exit(1)
+
+        roles = _bundle_to_role_map(bundle)
+        resolved_role = role or _default_role(bundle)
+        if not resolved_role:
+            print_cli_error(
+                console,
+                "프롬프트 role을 결정할 수 없습니다.",
+            )
+            raise typer.Exit(1)
+        if resolved_role not in roles:
+            print_cli_error(
+                console,
+                "지정한 role의 프롬프트를 찾을 수 없습니다.",
+                details=resolved_role,
+                fixes=[f"사용 가능한 role: {', '.join(sorted(roles))}"],
+            )
+            raise typer.Exit(1)
+
+        metric_list = parse_csv_option(metrics) or list(run.metrics_evaluated)
+        if not metric_list:
+            print_cli_error(
+                console,
+                "평가 메트릭이 없습니다.",
+                fixes=["--metrics 옵션을 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        if candidates <= 0:
+            print_cli_error(
+                console,
+                "--candidates 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            print_cli_error(
+                console,
+                "--holdout-ratio 값은 0과 1 사이여야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        if not auto and not manual_prompts and not manual_prompt_files:
+            print_cli_error(
+                console,
+                "자동 후보를 끌 경우 수동 후보가 필요합니다.",
+                fixes=["--prompt 또는 --prompt-file을 추가하세요."],
+            )
+            raise typer.Exit(1)
+
+        if temperature is not None and temperature < 0:
+            print_cli_error(
+                console,
+                "--temperature 값은 0 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if top_p is not None and (top_p <= 0 or top_p > 1):
+            print_cli_error(
+                console,
+                "--top-p 값은 0보다 크고 1 이하여야 합니다.",
+            )
+            raise typer.Exit(1)
+        if max_tokens is not None and max_tokens <= 0:
+            print_cli_error(
+                console,
+                "--max-tokens 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if generation_n is not None and generation_n <= 0:
+            print_cli_error(
+                console,
+                "--generation-n 값은 1 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+        if generation_seed is not None and generation_seed < 0:
+            print_cli_error(
+                console,
+                "--generation-seed 값은 0 이상이어야 합니다.",
+            )
+            raise typer.Exit(1)
+
+        selection_policy = selection_policy.strip().lower()
+        if selection_policy not in {"best", "index"}:
+            print_cli_error(
+                console,
+                "--selection-policy 값이 올바르지 않습니다.",
+                fixes=["best 또는 index로 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        sample_count = generation_n or 1
+        if selection_policy == "index":
+            if selection_index is None:
+                print_cli_error(
+                    console,
+                    "--selection-index 값이 필요합니다.",
+                )
+                raise typer.Exit(1)
+            if selection_index < 0 or selection_index >= sample_count:
+                print_cli_error(
+                    console,
+                    "--selection-index 값이 범위를 벗어났습니다.",
+                    fixes=[f"0부터 {sample_count - 1} 사이로 지정하세요."],
+                )
+                raise typer.Exit(1)
+        elif selection_index is not None:
+            print_cli_error(
+                console,
+                "--selection-index는 selection-policy=index에서만 사용됩니다.",
+            )
+            raise typer.Exit(1)
+
+        weights_map = _parse_weights(console, weights, metric_list)
+        base_prompt = roles[resolved_role]["content"]
+
+        dataset = _build_dataset_from_run(run, console)
+        if not dataset.test_cases:
+            print_cli_error(
+                console,
+                "평가 데이터셋이 비어 있어 추천을 생성할 수 없습니다.",
+            )
+            raise typer.Exit(1)
+
+        try:
+            from evalvault.domain.entities.prompt_suggestion import PromptSuggestionResult
+            from evalvault.domain.services.holdout_splitter import split_dataset_holdout
+            from evalvault.domain.services.prompt_candidate_service import PromptCandidateService
+            from evalvault.domain.services.prompt_scoring_service import PromptScoringService
+            from evalvault.domain.services.prompt_suggestion_reporter import (
+                PromptSuggestionReporter,
+            )
+        except ModuleNotFoundError as exc:
+            print_cli_error(
+                console,
+                "프롬프트 추천 모듈을 찾을 수 없습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(1)
+
+        settings = Settings()
+        if settings.evalvault_profile:
+            settings = apply_profile(settings, settings.evalvault_profile)
+
+        settings, resolved_provider, resolved_model = _resolve_llm_config(
+            settings=settings,
+            run_model=run.model_name,
+            model_override=model,
+            provider_override=provider,
+            console=console,
+        )
+
+        llm_adapter = get_llm_adapter(settings)
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
+        generation_options = GenerationOptions(
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            n=generation_n,
+            seed=generation_seed,
+        )
+
+        dev_dataset, holdout_dataset = split_dataset_holdout(
+            dataset=dataset,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+        )
+
+        prefix = f"prompt_suggestions_{run_id}"
+        output_path, report_path = resolve_output_paths(
+            base_dir=analysis_dir,
+            output_path=output_path,
+            report_path=report_path,
+            prefix=prefix,
+        )
+        artifacts_dir = resolve_artifact_dir(
+            base_dir=analysis_dir,
+            output_path=output_path,
+            report_path=report_path,
+            prefix=prefix,
+        )
+
+        candidate_service = PromptCandidateService()
+        scoring_service = PromptScoringService(evaluator=evaluator, llm=llm_adapter)
+        reporter = PromptSuggestionReporter()
+
+        with progress_spinner(console, "후보 생성 중...") as update:
+            candidates_list = candidate_service.build_candidates(
+                base_prompt=base_prompt,
+                role=resolved_role,
+                metrics=metric_list,
+                manual_prompts=list(manual_prompts),
+                manual_prompt_files=list(manual_prompt_files),
+                auto=auto,
+                auto_count=candidates,
+                metadata={"run_id": run_id},
+            )
+            if not candidates_list:
+                print_cli_error(
+                    console,
+                    "후보 프롬프트가 생성되지 않았습니다.",
+                )
+                raise typer.Exit(1)
+
+            update("후보 평가 중...")
+            scores = asyncio.run(
+                scoring_service.score_candidates(
+                    base_run=run,
+                    dev_dataset=dev_dataset,
+                    holdout_dataset=holdout_dataset,
+                    candidates=candidates_list,
+                    metrics=metric_list,
+                    weights=weights_map,
+                    generation_options=generation_options,
+                    selection_policy=selection_policy,
+                    selection_index=selection_index,
+                )
+            )
+
+            ranking = _rank_candidates(scores)
+            result = PromptSuggestionResult(
+                run_id=run_id,
+                role=resolved_role,
+                metrics=metric_list,
+                weights=weights_map,
+                candidates=candidates_list,
+                scores=scores,
+                ranking=ranking,
+                holdout_ratio=holdout_ratio,
+                metadata={
+                    "seed": seed,
+                    "model": resolved_model,
+                    "provider": resolved_provider,
+                    "temperature": temperature,
+                    "top_p": top_p,
+                    "max_tokens": max_tokens,
+                    "generation_n": generation_n,
+                    "generation_seed": generation_seed,
+                    "selection_policy": selection_policy,
+                    "selection_index": selection_index,
+                },
+            )
+
+            update("결과 저장 중...")
+            reporter.write_outputs(
+                result=result,
+                output_path=output_path,
+                report_path=report_path,
+                artifacts_dir=artifacts_dir,
+                storage=storage,
+            )
+
+        score_map = {score.candidate_id: score for score in scores}
+        candidate_map = {candidate.candidate_id: candidate for candidate in candidates_list}
+        console.print("\n[bold]추천 결과[/bold]")
+        table = Table(show_header=True, header_style="bold cyan")
+        table.add_column("Rank", justify="right")
+        table.add_column("Candidate", style="dim")
+        table.add_column("Source")
+        table.add_column("Score", justify="right")
+        for idx, candidate_id in enumerate(ranking[:5], start=1):
+            candidate = candidate_map.get(candidate_id)
+            score = score_map.get(candidate_id)
+            if not candidate or not score:
+                continue
+            preview = candidate.content.replace("\n", " ")[:60]
+            table.add_row(
+                str(idx),
+                preview,
+                candidate.source,
+                f"{score.weighted_score:.4f}",
+            )
+        console.print(table)
+        console.print(f"\n[green]JSON[/green] {output_path}")
+        console.print(f"[green]Report[/green] {report_path}\n")
+
     return app
 
 
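
Reading note: the hunks above add a new `prompts suggest` command next to the existing `show` and `diff` commands. As an illustration only (not part of the diff), the sketch below shows one way the command could be exercised through typer's test runner. It assumes evalvault 1.63.1 is installed and an LLM provider is configured; the run ID, database path, metric names, and weight values are hypothetical placeholders chosen to match the option names visible in the diff.

# Minimal sketch of invoking the new `prompts suggest` command (hypothetical inputs).
from rich.console import Console
from typer.testing import CliRunner

from evalvault.adapters.inbound.cli.commands.prompts import create_prompts_app

app = create_prompts_app(Console())
runner = CliRunner()

result = runner.invoke(
    app,
    [
        "suggest",
        "run-123",                 # hypothetical run ID stored in the DB
        "--db", "./evalvault.db",  # hypothetical SQLite path with a prompt snapshot
        "--metrics", "faithfulness,answer_relevancy",
        # per _parse_weights in the diff, weights are normalized to sum to 1
        "--weights", "faithfulness=0.7,answer_relevancy=0.3",
        "--candidates", "3",
        "--holdout-ratio", "0.2",
    ],
)
print(result.output)

On the command line this would correspond to something like `evalvault prompts suggest run-123 --db ./evalvault.db ...`, assuming the prompts sub-application is mounted on the main evalvault CLI as in earlier releases.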