evalvault 1.62.0__py3-none-any.whl → 1.63.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +43 -2
- evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +527 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.0.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -20,7 +20,8 @@ from evalvault.adapters.outbound.documents.versioned_loader import (
     load_versioned_chunks_from_pdf_dir,
 )
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
-from evalvault.adapters.outbound.llm import get_llm_adapter
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.adapters.outbound.phoenix.sync_service import (
     PhoenixDatasetInfo,
     PhoenixSyncError,
@@ -39,6 +40,7 @@ from evalvault.domain.services.memory_based_analysis import MemoryBasedAnalysis
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
+    build_prompt_inputs_from_snapshots,
     build_prompt_summary,
 )
 from evalvault.domain.services.ragas_prompt_overrides import (
@@ -808,6 +810,9 @@ def register_run_commands(
     if profile_name:
         settings = apply_profile(settings, profile_name)
 
+    if db_path is None:
+        db_path = Path(settings.evalvault_db_path)
+
     # Override model if specified
     if model:
         if _is_oss_open_model(model) and settings.llm_provider != "vllm":
@@ -1436,7 +1441,9 @@
     if should_enable_phoenix:
         ensure_phoenix_instrumentation(settings, console=console, force=True)
 
-
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
     llm_adapter = None
     try:
         llm_adapter = get_llm_adapter(settings)
@@ -1707,6 +1714,40 @@
     result.retrieval_metadata = merged_retriever_metadata
 
     result.tracker_metadata.setdefault("run_mode", preset.name)
+    tracker_meta = result.tracker_metadata or {}
+    result.tracker_metadata = tracker_meta
+    ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
+    ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
+        ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+    )
+    override_status: dict[str, str] = {}
+    raw_override = tracker_meta.get("ragas_prompt_overrides")
+    if isinstance(raw_override, dict):
+        override_status = cast(dict[str, str], raw_override)
+    if override_status:
+        prompt_inputs = [
+            entry
+            for entry in prompt_inputs
+            if not (
+                entry.kind == "ragas"
+                and override_status.get(entry.role) is not None
+                and override_status.get(entry.role) != "applied"
+            )
+        ]
+
+    if ragas_snapshot_inputs:
+        existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "ragas"}
+        for entry in ragas_snapshot_inputs:
+            if entry.role in existing_roles and override_status.get(entry.role) == "applied":
+                continue
+            prompt_inputs.append(entry)
+    if prompt_inputs and not db_path:
+        print_cli_warning(
+            console,
+            "Prompt snapshot은 --db 저장 시에만 DB에 기록됩니다.",
+            tips=["--db data/db/evalvault.db 옵션을 추가하세요."],
+        )
+
     if prompt_inputs:
         prompt_bundle = build_prompt_bundle(
             run_id=result.run_id,
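The block added above merges ragas prompt snapshots into the prompt inputs: ragas entries whose override status exists but is not "applied" are dropped, and snapshot-derived entries are appended unless an applied override already covers that role. A minimal sketch of that merge rule, using a simplified stand-in for the real `PromptInput` records (only `kind` and `role` fields are assumed here):

```python
# Minimal sketch (not the library's API): illustrates the merge rule added above,
# assuming a PromptInput-like record with only `kind` and `role` fields.
from dataclasses import dataclass


@dataclass
class PromptEntry:
    kind: str
    role: str


def merge_prompt_inputs(
    prompt_inputs: list[PromptEntry],
    snapshot_inputs: list[PromptEntry],
    override_status: dict[str, str],
) -> list[PromptEntry]:
    # Drop ragas entries whose override was attempted but not applied.
    if override_status:
        prompt_inputs = [
            entry
            for entry in prompt_inputs
            if not (
                entry.kind == "ragas"
                and override_status.get(entry.role) is not None
                and override_status.get(entry.role) != "applied"
            )
        ]
    # Append snapshot entries unless an applied override already covers the role.
    existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "ragas"}
    for entry in snapshot_inputs:
        if entry.role in existing_roles and override_status.get(entry.role) == "applied":
            continue
        prompt_inputs.append(entry)
    return prompt_inputs


if __name__ == "__main__":
    merged = merge_prompt_inputs(
        [PromptEntry("ragas", "faithfulness"), PromptEntry("ragas", "answer_relevancy")],
        [PromptEntry("ragas", "faithfulness")],
        {"faithfulness": "applied", "answer_relevancy": "skipped"},
    )
    print([(e.kind, e.role) for e in merged])
    # [('ragas', 'faithfulness')] – answer_relevancy dropped, faithfulness not duplicated
```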
@@ -443,6 +443,16 @@ def _save_to_db(
         result.run_id,
         prompt_bundle.prompt_set.prompt_set_id,
     )
+    excel_path = db_path.parent / f"evalvault_run_{result.run_id}.xlsx"
+    try:
+        storage.export_run_to_excel(result.run_id, excel_path)
+        console.print(f"[green]Excel export saved: {excel_path}[/green]")
+    except Exception as exc:
+        print_cli_warning(
+            console,
+            "엑셀 내보내기에 실패했습니다.",
+            tips=[str(exc)],
+        )
     console.print(f"[green]Results saved to database: {db_path}[/green]")
     console.print(f"[dim]Run ID: {result.run_id}[/dim]")
     if prompt_bundle:
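Saving results to the database now also writes an Excel workbook next to the DB file, via the new `export_run_to_excel` storage method (backed by the `base_sql.py` and `storage_port.py` changes listed above). A hedged usage sketch, taking the constructor and method signature as they appear in this diff; the run id is a placeholder:

```python
# Sketch only: exporting a stored run to Excel outside the CLI save path,
# assuming SQLiteStorageAdapter(db_path=...) and export_run_to_excel(run_id, path)
# as they appear in this diff.
from pathlib import Path

from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

db_path = Path("data/db/evalvault.db")
storage = SQLiteStorageAdapter(db_path=db_path)

run_id = "<existing run id>"  # hypothetical placeholder
excel_path = db_path.parent / f"evalvault_run_{run_id}.xlsx"

try:
    storage.export_run_to_excel(run_id, excel_path)
    print(f"Excel export saved: {excel_path}")
except Exception as exc:  # export is best-effort, mirroring the CLI warning path above
    print(f"Excel export failed: {exc}")
```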
@@ -18,7 +18,8 @@ from evalvault.adapters.inbound.cli.utils.analysis_io import (
 )
 from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
 from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
-from evalvault.adapters.outbound.llm import get_llm_adapter
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
@@ -175,7 +176,9 @@ def run_evaluation(payload: dict[str, Any] | RunEvaluationRequest) -> RunEvaluat
     )
 
     storage = SQLiteStorageAdapter(db_path=db_path)
-
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
     adapter = WebUIAdapter(
         storage=storage,
         evaluator=evaluator,
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Any
+from typing import Any, cast
 
 from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
 from evalvault.adapters.outbound.analysis.pipeline_helpers import (
@@ -12,6 +12,9 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
     safe_mean,
     truncate_text,
 )
+from evalvault.adapters.outbound.llm import SettingsLLMFactory
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+from evalvault.config.settings import Settings
 from evalvault.domain.entities import Dataset, EvaluationRun, TestCase
 from evalvault.domain.services.evaluator import RagasEvaluator
 from evalvault.ports.outbound.llm_port import LLMPort
@@ -30,7 +33,10 @@ class RagasEvaluatorModule(BaseAnalysisModule):
 
     def __init__(self, llm_adapter: LLMPort | None = None) -> None:
         self._llm_adapter = llm_adapter
-
+        settings = Settings()
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        self._evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
 
     def execute(
         self,
@@ -143,14 +149,12 @@
         *,
         recomputed: bool,
     ) -> dict[str, Any]:
-        if metrics and isinstance(
-
-
-
-            default=0,
-        ) # type: ignore[arg-type]
+        if metrics and all(isinstance(value, list) for value in metrics.values()):
+            metrics_lists = cast(dict[str, list[float]], metrics)
+            avg_scores = average_scores(metrics_lists)
+            sample_count = max((len(values) for values in metrics_lists.values()), default=0)
         else:
-            avg_scores =
+            avg_scores = cast(dict[str, float], metrics)
             sample_count = len(per_case)
 
         overall = safe_mean(avg_scores.values()) if avg_scores else 0.0
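The rewritten summary branch distinguishes the two shapes `metrics` can arrive in: per-metric score lists, which are averaged and counted by the longest list, versus already-aggregated floats, which pass through with `sample_count` taken from `per_case`. A small standalone sketch of that behaviour; `average_scores` here is a stand-in for the `pipeline_helpers` helper the module actually uses:

```python
# Sketch of the summary logic above; `average_scores` here is a stand-in for the
# pipeline_helpers implementation assumed by the diff.
from typing import Any


def average_scores(metrics: dict[str, list[float]]) -> dict[str, float]:
    return {name: sum(values) / len(values) if values else 0.0 for name, values in metrics.items()}


def summarize(metrics: dict[str, Any], per_case: list[dict[str, Any]]) -> tuple[dict[str, float], int]:
    if metrics and all(isinstance(value, list) for value in metrics.values()):
        # Per-case score lists: average them, count samples from the longest list.
        avg_scores = average_scores(metrics)
        sample_count = max((len(values) for values in metrics.values()), default=0)
    else:
        # Already-aggregated floats: pass through, count samples from per_case.
        avg_scores = metrics
        sample_count = len(per_case)
    return avg_scores, sample_count


print(summarize({"faithfulness": [0.8, 0.9], "answer_relevancy": [0.7]}, per_case=[{}, {}]))
# ({'faithfulness': 0.85, 'answer_relevancy': 0.7}, 2)
```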
@@ -8,6 +8,10 @@ from evalvault.adapters.outbound.llm.base import (
     LLMConfigurationError,
     create_openai_embeddings_with_legacy,
 )
+from evalvault.adapters.outbound.llm.factory import (
+    SettingsLLMFactory,
+    create_llm_adapter_for_model,
+)
 from evalvault.adapters.outbound.llm.llm_relation_augmenter import LLMRelationAugmenter
 from evalvault.config.settings import Settings
 from evalvault.ports.outbound.llm_port import LLMPort
@@ -70,49 +74,6 @@ def get_llm_adapter(settings: Settings) -> LLMPort:
     )
 
 
-def create_llm_adapter_for_model(
-    provider: str,
-    model_name: str,
-    base_settings: Settings,
-) -> LLMPort:
-    provider = provider.lower()
-
-    if provider == "openai":
-        base_settings.llm_provider = "openai"
-        base_settings.openai_model = model_name
-        from evalvault.adapters.outbound.llm.openai_adapter import OpenAIAdapter
-
-        return OpenAIAdapter(base_settings)
-    if provider == "ollama":
-        base_settings.llm_provider = "ollama"
-        base_settings.ollama_model = model_name
-        from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
-
-        return OllamaAdapter(base_settings)
-    if provider == "vllm":
-        base_settings.llm_provider = "vllm"
-        base_settings.vllm_model = model_name
-        from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
-
-        return VLLMAdapter(base_settings)
-    if provider == "azure":
-        base_settings.llm_provider = "azure"
-        base_settings.azure_deployment = model_name
-        from evalvault.adapters.outbound.llm.azure_adapter import AzureOpenAIAdapter
-
-        return AzureOpenAIAdapter(base_settings)
-    if provider == "anthropic":
-        base_settings.llm_provider = "anthropic"
-        base_settings.anthropic_model = model_name
-        from evalvault.adapters.outbound.llm.anthropic_adapter import AnthropicAdapter
-
-        return AnthropicAdapter(base_settings)
-
-    raise ValueError(
-        f"Unsupported LLM provider: '{provider}'. Supported: openai, ollama, vllm, azure, anthropic"
-    )
-
-
 __all__ = [
     "BaseLLMAdapter",
     "LLMConfigurationError",
@@ -123,6 +84,7 @@ __all__ = [
     "LLMRelationAugmenter",
     "OllamaAdapter",
     "VLLMAdapter",
+    "SettingsLLMFactory",
     "get_llm_adapter",
     "create_llm_adapter_for_model",
 ]
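`create_llm_adapter_for_model` moves from the package `__init__` into the new `factory` module, and the `__init__` re-exports it alongside `SettingsLLMFactory`, so both import paths should resolve to the same objects:

```python
# Both import paths should resolve after this change: the package re-export (as before)
# and the new factory module directly.
from evalvault.adapters.outbound.llm import SettingsLLMFactory, create_llm_adapter_for_model
from evalvault.adapters.outbound.llm.factory import (
    SettingsLLMFactory as FactoryFromModule,
    create_llm_adapter_for_model as create_from_module,
)

# The re-exported names are the same objects as the factory-module definitions.
assert SettingsLLMFactory is FactoryFromModule
assert create_llm_adapter_for_model is create_from_module
```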
@@ -14,7 +14,7 @@ from evalvault.adapters.outbound.llm.base import (
 from evalvault.adapters.outbound.llm.instructor_factory import create_instructor_llm
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
-from evalvault.ports.outbound.llm_port import ThinkingConfig
+from evalvault.ports.outbound.llm_port import GenerationOptions, ThinkingConfig
 
 try: # Optional dependency
     from anthropic import AsyncAnthropic
@@ -147,7 +147,12 @@ class AnthropicAdapter(BaseLLMAdapter):
         """Get the extended thinking token budget."""
         return self._thinking_budget
 
-    async def agenerate_text(
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).
 
         Uses the Anthropic messages API for simple text generation.
@@ -158,10 +163,17 @@ class AnthropicAdapter(BaseLLMAdapter):
         Returns:
             Generated text string
         """
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else 8192
+        api_kwargs: dict[str, Any] = {}
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
         response = await self._anthropic_client.messages.create(
             model=self._model_name,
-            max_tokens=
+            max_tokens=max_tokens,
             messages=[{"role": "user", "content": prompt}],
+            **api_kwargs,
         )
         # Extract text from response content blocks
         text_parts = []
@@ -170,7 +182,13 @@ class AnthropicAdapter(BaseLLMAdapter):
                 text_parts.append(block.text)
         return "".join(text_parts)
 
-    def generate_text(
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).
 
         Args:
@@ -192,12 +210,14 @@ class AnthropicAdapter(BaseLLMAdapter):
                 import nest_asyncio
 
                 nest_asyncio.apply()
-                return loop.run_until_complete(self.agenerate_text(prompt))
+                return loop.run_until_complete(self.agenerate_text(prompt, options=options))
             except ImportError:
                 import concurrent.futures
 
                 with concurrent.futures.ThreadPoolExecutor() as executor:
-                    future = executor.submit(
+                    future = executor.submit(
+                        asyncio.run, self.agenerate_text(prompt, options=options)
+                    )
                     return future.result()
         else:
-            return asyncio.run(self.agenerate_text(prompt))
+            return asyncio.run(self.agenerate_text(prompt, options=options))
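Across the adapter changes above, text generation gains an optional `GenerationOptions` argument (defined in `llm_port.py`, which grows in this release); the fields referenced in the hunks are `max_tokens`, `temperature`, `top_p`, `n`, and `seed`, and only the fields that are set get forwarded to the provider call. A hedged usage sketch, assuming `GenerationOptions` accepts those fields as keyword arguments:

```python
# Sketch only: passing sampling options through generate_text, assuming
# GenerationOptions(max_tokens=..., temperature=..., top_p=..., n=..., seed=...)
# matches the fields referenced in this diff.
from evalvault.adapters.outbound.llm.anthropic_adapter import AnthropicAdapter
from evalvault.config.settings import Settings
from evalvault.ports.outbound.llm_port import GenerationOptions

adapter = AnthropicAdapter(Settings())
options = GenerationOptions(max_tokens=1024, temperature=0.2, top_p=0.9)

# Unset fields (n, seed here) are simply not sent, so provider defaults apply.
text = adapter.generate_text("Summarize the policy exclusions.", options=options)
print(text)
```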
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_factory_port import LLMFactoryPort
+from evalvault.ports.outbound.llm_port import LLMPort
+
+
+class SettingsLLMFactory(LLMFactoryPort):
+    def __init__(self, settings: Settings) -> None:
+        self._settings = settings
+
+    def create_faithfulness_fallback(
+        self,
+        active_provider: str | None,
+        active_model: str | None,
+    ) -> LLMPort | None:
+        provider, model = _resolve_faithfulness_fallback_config(
+            settings=self._settings,
+            active_provider=active_provider,
+            active_model=active_model,
+        )
+        if not provider or not model:
+            return None
+        return create_llm_adapter_for_model(provider, model, self._settings)
+
+
+def create_llm_adapter_for_model(
+    provider: str,
+    model_name: str,
+    base_settings: Settings,
+) -> LLMPort:
+    provider = provider.lower()
+
+    if provider == "openai":
+        base_settings.llm_provider = "openai"
+        base_settings.openai_model = model_name
+        from evalvault.adapters.outbound.llm.openai_adapter import OpenAIAdapter
+
+        return OpenAIAdapter(base_settings)
+    if provider == "ollama":
+        base_settings.llm_provider = "ollama"
+        base_settings.ollama_model = model_name
+        from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+
+        return OllamaAdapter(base_settings)
+    if provider == "vllm":
+        base_settings.llm_provider = "vllm"
+        base_settings.vllm_model = model_name
+        from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
+
+        return VLLMAdapter(base_settings)
+    if provider == "azure":
+        base_settings.llm_provider = "azure"
+        base_settings.azure_deployment = model_name
+        from evalvault.adapters.outbound.llm.azure_adapter import AzureOpenAIAdapter
+
+        return AzureOpenAIAdapter(base_settings)
+    if provider == "anthropic":
+        base_settings.llm_provider = "anthropic"
+        base_settings.anthropic_model = model_name
+        from evalvault.adapters.outbound.llm.anthropic_adapter import AnthropicAdapter
+
+        return AnthropicAdapter(base_settings)
+
+    raise ValueError(
+        f"Unsupported LLM provider: '{provider}'. Supported: openai, ollama, vllm, azure, anthropic"
+    )
+
+
+def _resolve_faithfulness_fallback_config(
+    *,
+    settings: Settings,
+    active_provider: str | None,
+    active_model: str | None,
+) -> tuple[str | None, str | None]:
+    provider = (
+        settings.faithfulness_fallback_provider.strip().lower()
+        if settings.faithfulness_fallback_provider
+        else None
+    )
+    model = settings.faithfulness_fallback_model
+    normalized_active = active_provider.strip().lower() if active_provider else None
+    default_provider = normalized_active or settings.llm_provider.lower()
+
+    if not provider and model:
+        provider = default_provider
+    if provider and not model:
+        model = _default_faithfulness_fallback_model(provider)
+    if not provider and not model:
+        provider = default_provider
+        model = _default_faithfulness_fallback_model(default_provider)
+
+    if not provider or not model:
+        return None, None
+    return provider, model
+
+
+def _default_faithfulness_fallback_model(provider: str) -> str | None:
+    if provider == "ollama":
+        return "gpt-oss-safeguard:20b"
+    if provider == "vllm":
+        return "gpt-oss-120b"
+    return None
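The new `factory.py` resolves a faithfulness-fallback adapter from settings: an explicit `faithfulness_fallback_provider`/`faithfulness_fallback_model` pair wins, a missing half is filled from the active provider or a per-provider default (`gpt-oss-safeguard:20b` for Ollama, `gpt-oss-120b` for vLLM), and `None` is returned when nothing can be resolved. A hedged usage sketch:

```python
# Sketch only: resolving the faithfulness fallback adapter via the new factory.
# Assumes faithfulness_fallback_provider / faithfulness_fallback_model are configured
# on Settings (e.g. via environment), as read by _resolve_faithfulness_fallback_config.
from evalvault.adapters.outbound.llm.factory import SettingsLLMFactory
from evalvault.config.settings import Settings

settings = Settings()
factory = SettingsLLMFactory(settings)

# With a provider but no model, the per-provider default applies
# ("gpt-oss-safeguard:20b" for ollama, "gpt-oss-120b" for vllm); when nothing can be
# resolved, None is returned and callers skip the fallback entirely.
fallback = factory.create_faithfulness_fallback(
    active_provider=settings.llm_provider,
    active_model=None,
)
print(type(fallback).__name__ if fallback else "no faithfulness fallback configured")
```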
@@ -9,6 +9,29 @@ from evalvault.domain.services.entity_extractor import Entity, Relation
 from evalvault.ports.outbound.llm_port import LLMPort
 from evalvault.ports.outbound.relation_augmenter_port import RelationAugmenterPort
 
+_RELATION_SYSTEM_PROMPT_KO = (
+    "당신은 한국어 보험 문서의 지식 그래프 감사자입니다. "
+    "제공된 문서 스니펫을 보고 관계를 확인하거나 수정하세요."
+)
+_RELATION_SYSTEM_PROMPT_EN = (
+    "You are a knowledge graph auditor for Korean insurance documents. "
+    "Review the provided document snippet and confirm or fix the relations."
+)
+_RELATION_PROMPT_TEMPLATE_KO = (
+    "{system_prompt}\n"
+    "source, target, relation_type, confidence, justification 키를 포함한 JSON 배열만 반환하세요.\n\n"
+    "문서:\n{document_text}\n\n"
+    "엔티티:\n{entity_lines}\n\n"
+    "낮은 신뢰도 관계:\n{relation_lines}"
+)
+_RELATION_PROMPT_TEMPLATE_EN = (
+    "{system_prompt}\n"
+    "Return a JSON array of objects with keys source, target, relation_type, confidence, justification.\n\n"
+    "Document:\n{document_text}\n\n"
+    "Entities:\n{entity_lines}\n\n"
+    "Low-confidence relations:\n{relation_lines}"
+)
+
 
 class LLMRelationAugmenter(RelationAugmenterPort):
     """LLM을 사용해 저신뢰 관계를 검증/보강."""
@@ -18,13 +41,17 @@ class LLMRelationAugmenter(RelationAugmenterPort):
         llm_port: LLMPort,
         max_relations: int = 5,
         system_prompt: str | None = None,
+        language: str = "ko",
     ):
         self._llm_port = llm_port
         self._max_relations = max_relations
-        self.
-
-
-
+        self._language = language
+        if system_prompt:
+            self._system_prompt = system_prompt
+        else:
+            self._system_prompt = (
+                _RELATION_SYSTEM_PROMPT_EN if language == "en" else _RELATION_SYSTEM_PROMPT_KO
+            )
 
     def augment_relations(
         self,
@@ -75,16 +102,14 @@ class LLMRelationAugmenter(RelationAugmenterPort):
             f"- {rel.source} -> {rel.target} [{rel.relation_type}] conf={rel.confidence:.2f}"
             for rel in relations
         ]
-
-
-
-
-
-
-
-
-            "Low-confidence relations:\n"
-            f"{chr(10).join(relation_lines)}"
+        template = (
+            _RELATION_PROMPT_TEMPLATE_EN if self._language == "en" else _RELATION_PROMPT_TEMPLATE_KO
+        )
+        return template.format(
+            system_prompt=self._system_prompt,
+            document_text=document_text,
+            entity_lines=chr(10).join(entity_lines),
+            relation_lines=chr(10).join(relation_lines),
         )
 
     @staticmethod
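`LLMRelationAugmenter` now takes a `language` argument (default `"ko"`) and picks the Korean or English system prompt and prompt template accordingly, while an explicit `system_prompt` still takes precedence. A hedged construction sketch; the `LLMPort` instance is whatever adapter `get_llm_adapter` returns:

```python
# Sketch only: choosing the English relation-audit prompts added in this release.
from evalvault.adapters.outbound.llm import get_llm_adapter
from evalvault.adapters.outbound.llm.llm_relation_augmenter import LLMRelationAugmenter
from evalvault.config.settings import Settings

llm = get_llm_adapter(Settings())

# language="en" selects _RELATION_SYSTEM_PROMPT_EN / _RELATION_PROMPT_TEMPLATE_EN;
# the default ("ko") keeps the Korean prompts. An explicit system_prompt overrides both.
augmenter = LLMRelationAugmenter(llm_port=llm, max_relations=5, language="en")
```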
@@ -19,7 +19,7 @@ from evalvault.adapters.outbound.llm.base import BaseLLMAdapter
 from evalvault.adapters.outbound.llm.instructor_factory import create_instructor_llm
 from evalvault.adapters.outbound.llm.token_aware_chat import ThinkingTokenTrackingAsyncOpenAI
 from evalvault.config.settings import Settings
-from evalvault.ports.outbound.llm_port import ThinkingConfig
+from evalvault.ports.outbound.llm_port import GenerationOptions, ThinkingConfig
 
 
 class OllamaAdapter(BaseLLMAdapter):
@@ -240,7 +240,12 @@ class OllamaAdapter(BaseLLMAdapter):
         else:
             return asyncio.run(self.embed(texts, model, dimension))
 
-    async def agenerate_text(
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).
 
         Uses the Ollama OpenAI-compatible API for simple text generation.
@@ -251,13 +256,30 @@ class OllamaAdapter(BaseLLMAdapter):
         Returns:
             Generated text string
         """
-
-            model
-            messages
-
+        api_kwargs: dict[str, Any] = {
+            "model": self._ollama_model,
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        if options and options.max_tokens is not None:
+            api_kwargs["max_completion_tokens"] = options.max_tokens
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
+        response = await self._embedding_client.chat.completions.create(**api_kwargs)
         return response.choices[0].message.content or ""
 
-    def generate_text(
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).
 
         Args:
@@ -279,12 +301,14 @@ class OllamaAdapter(BaseLLMAdapter):
                 import nest_asyncio
 
                 nest_asyncio.apply()
-                return loop.run_until_complete(self.agenerate_text(prompt))
+                return loop.run_until_complete(self.agenerate_text(prompt, options=options))
             except ImportError:
                 import concurrent.futures
 
                 with concurrent.futures.ThreadPoolExecutor() as executor:
-                    future = executor.submit(
+                    future = executor.submit(
+                        asyncio.run, self.agenerate_text(prompt, options=options)
+                    )
                     return future.result()
         else:
-            return asyncio.run(self.agenerate_text(prompt))
+            return asyncio.run(self.agenerate_text(prompt, options=options))
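The Ollama adapter maps the same `GenerationOptions` fields onto its OpenAI-compatible chat call, sending `max_completion_tokens`, `temperature`, `top_p`, `n`, and `seed` only when they are set. A compact sketch of that mapping as a standalone helper (illustrative only; the adapters inline it per call):

```python
# Sketch of the kwargs mapping used above; the GenerationOptions field names are taken
# from this diff, but this stand-in class and helper are illustrative, not part of the package.
from dataclasses import dataclass
from typing import Any


@dataclass
class GenerationOptions:  # stand-in with the fields referenced in the diff
    max_tokens: int | None = None
    temperature: float | None = None
    top_p: float | None = None
    n: int | None = None
    seed: int | None = None


def build_chat_kwargs(model: str, prompt: str, options: GenerationOptions | None) -> dict[str, Any]:
    kwargs: dict[str, Any] = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }
    # Only forward fields that were explicitly set; unset fields fall back to provider defaults.
    if options and options.max_tokens is not None:
        kwargs["max_completion_tokens"] = options.max_tokens
    if options and options.temperature is not None:
        kwargs["temperature"] = options.temperature
    if options and options.top_p is not None:
        kwargs["top_p"] = options.top_p
    if options and options.n is not None:
        kwargs["n"] = options.n
    if options and options.seed is not None:
        kwargs["seed"] = options.seed
    return kwargs


print(build_chat_kwargs("gpt-oss-safeguard:20b", "ping", GenerationOptions(temperature=0.0, seed=42)))
```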
@@ -10,6 +10,7 @@ from evalvault.adapters.outbound.llm.instructor_factory import create_instructor
 from evalvault.adapters.outbound.llm.token_aware_chat import TokenTrackingAsyncOpenAI
 from evalvault.config.phoenix_support import instrumentation_span, set_span_attributes
 from evalvault.config.settings import Settings
+from evalvault.ports.outbound.llm_port import GenerationOptions
 
 _DEFAULT_MAX_COMPLETION_TOKENS = 8192
 _GPT5_MAX_COMPLETION_TOKENS = 16384
@@ -73,7 +74,12 @@ class OpenAIAdapter(BaseLLMAdapter):
         """
         return self._embedding_model_name
 
-    async def agenerate_text(
+    async def agenerate_text(
+        self,
+        prompt: str,
+        *,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (async).
 
         Uses the OpenAI chat completions API directly for simple text generation.
@@ -89,18 +95,35 @@ class OpenAIAdapter(BaseLLMAdapter):
             "llm.model": self._model_name,
             "llm.mode": "async",
         }
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
+        api_kwargs = {
+            "model": self._model_name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
+        }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
         with instrumentation_span("llm.generate_text", attrs) as span:
-            response = await self._client.chat.completions.create(
-                model=self._model_name,
-                messages=[{"role": "user", "content": prompt}],
-                max_completion_tokens=_max_completion_tokens_for_model(self._model_name),
-            )
+            response = await self._client.chat.completions.create(**api_kwargs)
             content = response.choices[0].message.content or ""
             if span:
                 set_span_attributes(span, {"llm.response.length": len(content)})
             return content
 
-    def generate_text(
+    def generate_text(
+        self,
+        prompt: str,
+        *,
+        json_mode: bool = False,
+        options: GenerationOptions | None = None,
+    ) -> str:
         """Generate text from a prompt (sync).
 
         Uses sync OpenAI client directly.
@@ -124,11 +147,21 @@ class OpenAIAdapter(BaseLLMAdapter):
         sync_client = OpenAI(**client_kwargs)
 
         # API 호출 파라미터
+        max_tokens = options.max_tokens if options and options.max_tokens is not None else None
        api_kwargs: dict = {
             "model": self._model_name,
             "messages": [{"role": "user", "content": prompt}],
-            "max_completion_tokens":
+            "max_completion_tokens": max_tokens
+            or _max_completion_tokens_for_model(self._model_name),
         }
+        if options and options.temperature is not None:
+            api_kwargs["temperature"] = options.temperature
+        if options and options.top_p is not None:
+            api_kwargs["top_p"] = options.top_p
+        if options and options.n is not None:
+            api_kwargs["n"] = options.n
+        if options and options.seed is not None:
+            api_kwargs["seed"] = options.seed
 
         # JSON 모드 설정
         if json_mode: