evalvault 1.74.0__py3-none-any.whl → 1.76.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +127 -80
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +303 -17
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +3 -4
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +188 -59
- evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/settings.py +71 -11
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.76.0.dist-info/METADATA +221 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
- evalvault-1.74.0.dist-info/METADATA +0 -585
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/config/settings.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any
|
|
5
5
|
|
|
6
|
-
from pydantic import Field, PrivateAttr
|
|
6
|
+
from pydantic import AliasChoices, Field, PrivateAttr
|
|
7
7
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
8
8
|
|
|
9
9
|
from evalvault.config.secret_manager import (
|
|
@@ -55,6 +55,33 @@ def _parse_cors_origins(cors_origins: str | None) -> list[str]:
|
|
|
55
55
|
return [origin.strip() for origin in cors_origins.split(",") if origin.strip()]
|
|
56
56
|
|
|
57
57
|
|
|
58
|
+
def resolve_tracker_providers(provider: str | None) -> list[str]:
|
|
59
|
+
if not provider:
|
|
60
|
+
return []
|
|
61
|
+
normalized = provider.strip().lower()
|
|
62
|
+
if normalized in {"none", "off", "disabled"}:
|
|
63
|
+
return ["none"]
|
|
64
|
+
aliases = {
|
|
65
|
+
"all": ["mlflow", "phoenix"],
|
|
66
|
+
"default": ["mlflow", "phoenix"],
|
|
67
|
+
}
|
|
68
|
+
if normalized in aliases:
|
|
69
|
+
return aliases[normalized]
|
|
70
|
+
separators = [",", "+", "/", "|"]
|
|
71
|
+
for sep in separators:
|
|
72
|
+
normalized = normalized.replace(sep, ",")
|
|
73
|
+
providers = [p.strip() for p in normalized.split(",") if p.strip()]
|
|
74
|
+
if not providers:
|
|
75
|
+
return []
|
|
76
|
+
if "none" in providers and len(providers) > 1:
|
|
77
|
+
raise ValueError("tracker_provider cannot combine 'none' with other providers")
|
|
78
|
+
deduped: list[str] = []
|
|
79
|
+
for entry in providers:
|
|
80
|
+
if entry not in deduped:
|
|
81
|
+
deduped.append(entry)
|
|
82
|
+
return deduped
|
|
83
|
+
|
|
84
|
+
|
|
58
85
|
SECRET_REFERENCE_FIELDS = (
|
|
59
86
|
"api_auth_tokens",
|
|
60
87
|
"knowledge_read_tokens",
|
|
@@ -83,13 +110,14 @@ def _validate_production_settings(settings: "Settings") -> None:
|
|
|
83
110
|
if settings.llm_provider == "openai" and not settings.openai_api_key:
|
|
84
111
|
missing.append("OPENAI_API_KEY")
|
|
85
112
|
|
|
86
|
-
|
|
113
|
+
providers = resolve_tracker_providers(settings.tracker_provider)
|
|
114
|
+
if "langfuse" in providers:
|
|
87
115
|
if not settings.langfuse_public_key:
|
|
88
116
|
missing.append("LANGFUSE_PUBLIC_KEY")
|
|
89
117
|
if not settings.langfuse_secret_key:
|
|
90
118
|
missing.append("LANGFUSE_SECRET_KEY")
|
|
91
119
|
|
|
92
|
-
if
|
|
120
|
+
if "mlflow" in providers and not settings.mlflow_tracking_uri:
|
|
93
121
|
missing.append("MLFLOW_TRACKING_URI")
|
|
94
122
|
|
|
95
123
|
if (
|
|
@@ -179,6 +207,10 @@ class Settings(BaseSettings):
|
|
|
179
207
|
default="data/db/evalvault.db",
|
|
180
208
|
description="SQLite database path for API/CLI storage.",
|
|
181
209
|
)
|
|
210
|
+
db_backend: str = Field(
|
|
211
|
+
default="postgres",
|
|
212
|
+
description="Storage backend: 'postgres' or 'sqlite'.",
|
|
213
|
+
)
|
|
182
214
|
evalvault_memory_db_path: str = Field(
|
|
183
215
|
default="data/db/evalvault_memory.db",
|
|
184
216
|
description="SQLite database path for Domain Memory storage.",
|
|
@@ -242,7 +274,7 @@ class Settings(BaseSettings):
|
|
|
242
274
|
description="Ollama server URL",
|
|
243
275
|
)
|
|
244
276
|
ollama_model: str = Field(
|
|
245
|
-
default="
|
|
277
|
+
default="qwen3:14b",
|
|
246
278
|
description="Ollama model name for evaluation",
|
|
247
279
|
)
|
|
248
280
|
ollama_embedding_model: str = Field(
|
|
@@ -351,6 +383,14 @@ class Settings(BaseSettings):
|
|
|
351
383
|
default="http://localhost:6006/v1/traces",
|
|
352
384
|
description="Phoenix OTLP endpoint for traces",
|
|
353
385
|
)
|
|
386
|
+
phoenix_project_name: str = Field(
|
|
387
|
+
default="evalvault",
|
|
388
|
+
description="Phoenix project name for grouping traces",
|
|
389
|
+
)
|
|
390
|
+
phoenix_annotations_enabled: bool = Field(
|
|
391
|
+
default=True,
|
|
392
|
+
description="Enable automatic Phoenix span annotations",
|
|
393
|
+
)
|
|
354
394
|
phoenix_api_token: str | None = Field(
|
|
355
395
|
default=None,
|
|
356
396
|
description="Phoenix API token for cloud deployments (optional)",
|
|
@@ -368,8 +408,8 @@ class Settings(BaseSettings):
|
|
|
368
408
|
|
|
369
409
|
# Tracker Provider Selection
|
|
370
410
|
tracker_provider: str = Field(
|
|
371
|
-
default="
|
|
372
|
-
description="Tracker provider: 'langfuse', 'mlflow',
|
|
411
|
+
default="mlflow+phoenix",
|
|
412
|
+
description="Tracker provider: 'langfuse', 'mlflow', 'phoenix', 'none', or combinations",
|
|
373
413
|
)
|
|
374
414
|
|
|
375
415
|
# Cluster map configuration
|
|
@@ -395,11 +435,31 @@ class Settings(BaseSettings):
|
|
|
395
435
|
)
|
|
396
436
|
|
|
397
437
|
# PostgreSQL Configuration (optional)
|
|
398
|
-
postgres_host: str | None = Field(
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
438
|
+
postgres_host: str | None = Field(
|
|
439
|
+
default=None,
|
|
440
|
+
validation_alias=AliasChoices("POSTGRES_HOST", "EVALVAULT_DB_HOST"),
|
|
441
|
+
description="PostgreSQL server host",
|
|
442
|
+
)
|
|
443
|
+
postgres_port: int = Field(
|
|
444
|
+
default=5432,
|
|
445
|
+
validation_alias=AliasChoices("POSTGRES_PORT", "EVALVAULT_DB_PORT"),
|
|
446
|
+
description="PostgreSQL server port",
|
|
447
|
+
)
|
|
448
|
+
postgres_database: str = Field(
|
|
449
|
+
default="evalvault",
|
|
450
|
+
validation_alias=AliasChoices("POSTGRES_DATABASE", "EVALVAULT_DB_NAME"),
|
|
451
|
+
description="PostgreSQL database name",
|
|
452
|
+
)
|
|
453
|
+
postgres_user: str | None = Field(
|
|
454
|
+
default=None,
|
|
455
|
+
validation_alias=AliasChoices("POSTGRES_USER", "EVALVAULT_DB_USER"),
|
|
456
|
+
description="PostgreSQL user",
|
|
457
|
+
)
|
|
458
|
+
postgres_password: str | None = Field(
|
|
459
|
+
default=None,
|
|
460
|
+
validation_alias=AliasChoices("POSTGRES_PASSWORD", "EVALVAULT_DB_PASSWORD"),
|
|
461
|
+
description="PostgreSQL password",
|
|
462
|
+
)
|
|
403
463
|
postgres_connection_string: str | None = Field(
|
|
404
464
|
default=None, description="PostgreSQL connection string (overrides other postgres settings)"
|
|
405
465
|
)
|
|
@@ -20,7 +20,8 @@ class DomainLearningHook:
|
|
|
20
20
|
Formation dynamics를 구현합니다.
|
|
21
21
|
|
|
22
22
|
사용 예시:
|
|
23
|
-
|
|
23
|
+
from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
|
|
24
|
+
memory_adapter = build_domain_memory_adapter()
|
|
24
25
|
hook = DomainLearningHook(memory_adapter)
|
|
25
26
|
|
|
26
27
|
# 평가 후 메모리 형성
|
|
@@ -1934,6 +1934,8 @@ class RagasEvaluator:
|
|
|
1934
1934
|
|
|
1935
1935
|
def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
|
|
1936
1936
|
"""Calculate estimated cost in USD based on model pricing."""
|
|
1937
|
+
if "ollama" in model_name:
|
|
1938
|
+
return 0.0
|
|
1937
1939
|
# Find matching model key (exact or substring match)
|
|
1938
1940
|
price_key = "openai/gpt-4o" # Default fallback
|
|
1939
1941
|
for key in self.MODEL_PRICING:
|
|
@@ -18,7 +18,7 @@ class EvalRequest:
|
|
|
18
18
|
|
|
19
19
|
dataset_path: str
|
|
20
20
|
metrics: list[str]
|
|
21
|
-
model_name: str = "ollama/
|
|
21
|
+
model_name: str = "ollama/qwen3:14b"
|
|
22
22
|
evaluation_task: str = "qa"
|
|
23
23
|
langfuse_enabled: bool = False
|
|
24
24
|
thresholds: dict[str, float] = field(default_factory=dict)
|
|
@@ -121,12 +121,14 @@ class WebUIPort(Protocol):
|
|
|
121
121
|
def list_runs(
|
|
122
122
|
self,
|
|
123
123
|
limit: int = 50,
|
|
124
|
+
offset: int = 0,
|
|
124
125
|
filters: RunFilters | None = None,
|
|
125
126
|
) -> list[RunSummary]:
|
|
126
127
|
"""평가 목록 조회.
|
|
127
128
|
|
|
128
129
|
Args:
|
|
129
130
|
limit: 최대 조회 개수
|
|
131
|
+
offset: 조회 시작 위치
|
|
130
132
|
filters: 필터 조건
|
|
131
133
|
|
|
132
134
|
Returns:
|
|
@@ -83,6 +83,7 @@ class StoragePort(Protocol):
|
|
|
83
83
|
def list_runs(
|
|
84
84
|
self,
|
|
85
85
|
limit: int = 100,
|
|
86
|
+
offset: int = 0,
|
|
86
87
|
dataset_name: str | None = None,
|
|
87
88
|
model_name: str | None = None,
|
|
88
89
|
) -> list[EvaluationRun]:
|
|
@@ -90,6 +91,7 @@ class StoragePort(Protocol):
|
|
|
90
91
|
|
|
91
92
|
Args:
|
|
92
93
|
limit: 최대 조회 개수
|
|
94
|
+
offset: 조회 시작 위치 (선택)
|
|
93
95
|
dataset_name: 필터링할 데이터셋 이름 (선택)
|
|
94
96
|
model_name: 필터링할 모델 이름 (선택)
|
|
95
97
|
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evalvault
|
|
3
|
+
Version: 1.76.0
|
|
4
|
+
Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
|
|
5
|
+
Project-URL: Homepage, https://github.com/ntts9990/EvalVault
|
|
6
|
+
Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/ntts9990/EvalVault.git
|
|
8
|
+
Project-URL: Issues, https://github.com/ntts9990/EvalVault/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/ntts9990/EvalVault/releases
|
|
10
|
+
Author: EvalVault Contributors
|
|
11
|
+
Maintainer: EvalVault Contributors
|
|
12
|
+
License: Apache-2.0
|
|
13
|
+
License-File: LICENSE.md
|
|
14
|
+
Keywords: ai,evaluation,langfuse,llm,machine-learning,nlp,observability,opentelemetry,phoenix,rag,ragas,retrieval-augmented-generation,testing
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
25
|
+
Classifier: Topic :: Software Development :: Testing
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.12
|
|
28
|
+
Requires-Dist: chainlit>=2.9.5
|
|
29
|
+
Requires-Dist: chardet
|
|
30
|
+
Requires-Dist: fastapi>=0.128.0
|
|
31
|
+
Requires-Dist: instructor
|
|
32
|
+
Requires-Dist: langchain-openai
|
|
33
|
+
Requires-Dist: langfuse
|
|
34
|
+
Requires-Dist: matplotlib<3.9.0,>=3.8.0
|
|
35
|
+
Requires-Dist: networkx
|
|
36
|
+
Requires-Dist: openai
|
|
37
|
+
Requires-Dist: openpyxl
|
|
38
|
+
Requires-Dist: pandas
|
|
39
|
+
Requires-Dist: pydantic
|
|
40
|
+
Requires-Dist: pydantic-settings
|
|
41
|
+
Requires-Dist: pypdf>=4.3.0
|
|
42
|
+
Requires-Dist: python-multipart
|
|
43
|
+
Requires-Dist: ragas==0.4.2
|
|
44
|
+
Requires-Dist: rich
|
|
45
|
+
Requires-Dist: truststore>=0.10.4
|
|
46
|
+
Requires-Dist: typer
|
|
47
|
+
Requires-Dist: uvicorn>=0.40.0
|
|
48
|
+
Requires-Dist: xlrd
|
|
49
|
+
Provides-Extra: analysis
|
|
50
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'analysis'
|
|
51
|
+
Requires-Dist: xgboost>=2.0.0; extra == 'analysis'
|
|
52
|
+
Provides-Extra: anthropic
|
|
53
|
+
Requires-Dist: anthropic; extra == 'anthropic'
|
|
54
|
+
Requires-Dist: langchain-anthropic; extra == 'anthropic'
|
|
55
|
+
Provides-Extra: benchmark
|
|
56
|
+
Requires-Dist: datasets>=2.0.0; extra == 'benchmark'
|
|
57
|
+
Requires-Dist: lm-eval[api]>=0.4.0; extra == 'benchmark'
|
|
58
|
+
Provides-Extra: dashboard
|
|
59
|
+
Requires-Dist: matplotlib<3.9.0,>=3.8.0; extra == 'dashboard'
|
|
60
|
+
Provides-Extra: dev
|
|
61
|
+
Requires-Dist: anthropic; extra == 'dev'
|
|
62
|
+
Requires-Dist: arize-phoenix>=8.0.0; extra == 'dev'
|
|
63
|
+
Requires-Dist: datasets>=2.0.0; extra == 'dev'
|
|
64
|
+
Requires-Dist: faiss-cpu>=1.8.0; extra == 'dev'
|
|
65
|
+
Requires-Dist: ijson>=3.3.0; extra == 'dev'
|
|
66
|
+
Requires-Dist: kiwipiepy>=0.18.0; extra == 'dev'
|
|
67
|
+
Requires-Dist: langchain-anthropic; extra == 'dev'
|
|
68
|
+
Requires-Dist: lm-eval[api]>=0.4.0; extra == 'dev'
|
|
69
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == 'dev'
|
|
70
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'dev'
|
|
71
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'dev'
|
|
72
|
+
Requires-Dist: mlflow>=2.0.0; extra == 'dev'
|
|
73
|
+
Requires-Dist: openinference-instrumentation-langchain>=0.1.0; extra == 'dev'
|
|
74
|
+
Requires-Dist: opentelemetry-api>=1.20.0; extra == 'dev'
|
|
75
|
+
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == 'dev'
|
|
76
|
+
Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'dev'
|
|
77
|
+
Requires-Dist: pgvector>=0.2.5; extra == 'dev'
|
|
78
|
+
Requires-Dist: psycopg[binary]>=3.0.0; extra == 'dev'
|
|
79
|
+
Requires-Dist: pydeps>=3.0.1; extra == 'dev'
|
|
80
|
+
Requires-Dist: pymdown-extensions>=10.7.0; extra == 'dev'
|
|
81
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
82
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
83
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
84
|
+
Requires-Dist: pytest-html; extra == 'dev'
|
|
85
|
+
Requires-Dist: pytest-mock; extra == 'dev'
|
|
86
|
+
Requires-Dist: pytest-rerunfailures; extra == 'dev'
|
|
87
|
+
Requires-Dist: pytest-xdist; extra == 'dev'
|
|
88
|
+
Requires-Dist: python-multipart; extra == 'dev'
|
|
89
|
+
Requires-Dist: rank-bm25>=0.2.2; extra == 'dev'
|
|
90
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
91
|
+
Requires-Dist: scikit-learn<1.4.0,>=1.3.0; extra == 'dev'
|
|
92
|
+
Requires-Dist: sentence-transformers>=5.2.0; extra == 'dev'
|
|
93
|
+
Requires-Dist: xgboost>=2.0.0; extra == 'dev'
|
|
94
|
+
Provides-Extra: docs
|
|
95
|
+
Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
96
|
+
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
97
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
|
|
98
|
+
Requires-Dist: pymdown-extensions>=10.7.0; extra == 'docs'
|
|
99
|
+
Provides-Extra: korean
|
|
100
|
+
Requires-Dist: kiwipiepy>=0.18.0; extra == 'korean'
|
|
101
|
+
Requires-Dist: rank-bm25>=0.2.2; extra == 'korean'
|
|
102
|
+
Requires-Dist: sentence-transformers>=5.2.0; extra == 'korean'
|
|
103
|
+
Provides-Extra: mlflow
|
|
104
|
+
Requires-Dist: mlflow>=2.0.0; extra == 'mlflow'
|
|
105
|
+
Provides-Extra: perf
|
|
106
|
+
Requires-Dist: faiss-cpu>=1.8.0; extra == 'perf'
|
|
107
|
+
Requires-Dist: ijson>=3.3.0; extra == 'perf'
|
|
108
|
+
Provides-Extra: phoenix
|
|
109
|
+
Requires-Dist: arize-phoenix>=8.0.0; extra == 'phoenix'
|
|
110
|
+
Requires-Dist: openinference-instrumentation-langchain>=0.1.0; extra == 'phoenix'
|
|
111
|
+
Requires-Dist: opentelemetry-api>=1.20.0; extra == 'phoenix'
|
|
112
|
+
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == 'phoenix'
|
|
113
|
+
Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'phoenix'
|
|
114
|
+
Provides-Extra: postgres
|
|
115
|
+
Requires-Dist: pgvector>=0.2.5; extra == 'postgres'
|
|
116
|
+
Requires-Dist: psycopg[binary]>=3.0.0; extra == 'postgres'
|
|
117
|
+
Provides-Extra: secrets
|
|
118
|
+
Requires-Dist: boto3; extra == 'secrets'
|
|
119
|
+
Requires-Dist: google-cloud-secret-manager; extra == 'secrets'
|
|
120
|
+
Requires-Dist: hvac; extra == 'secrets'
|
|
121
|
+
Provides-Extra: timeseries
|
|
122
|
+
Requires-Dist: aeon>=1.3.0; extra == 'timeseries'
|
|
123
|
+
Requires-Dist: numba>=0.55.0; extra == 'timeseries'
|
|
124
|
+
Provides-Extra: web
|
|
125
|
+
Description-Content-Type: text/markdown
|
|
126
|
+
|
|
127
|
+
# EvalVault
|
|
128
|
+
|
|
129
|
+
RAG(Retrieval-Augmented Generation) 시스템을 대상으로 **평가(Eval) → 분석(Analysis) → 추적(Tracing) → 개선 루프**를 하나의 워크플로로 묶는 CLI + Web UI 플랫폼입니다.
|
|
130
|
+
|
|
131
|
+
[PyPI](https://pypi.org/project/evalvault/)
|
|
132
|
+
[Python](https://www.python.org/downloads/)
|
|
133
|
+
[CI](https://github.com/ntts9990/EvalVault/actions/workflows/ci.yml)
|
|
134
|
+
[License: Apache-2.0](LICENSE.md)
|
|
135
|
+
|
|
136
|
+
English version? See `README.en.md`.
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Quickstart (CLI)
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
uv sync --extra dev
|
|
144
|
+
cp .env.example .env
|
|
145
|
+
|
|
146
|
+
uv run evalvault run --mode simple tests/fixtures/e2e/insurance_qa_korean.json \
|
|
147
|
+
--metrics faithfulness,answer_relevancy \
|
|
148
|
+
--profile dev \
|
|
149
|
+
--auto-analyze
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Tip: 기본 저장소는 Postgres+pgvector입니다. SQLite를 쓰려면 `--db` 또는 `DB_BACKEND=sqlite` + `EVALVAULT_DB_PATH`를 지정하세요.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## 핵심 기능
|
|
157
|
+
|
|
158
|
+
- **End-to-End 평가 루프**: Eval → Analysis → Tracing → Improvement를 한 흐름으로 실행
|
|
159
|
+
- **Dataset 중심 운영**: 합격 기준(threshold)을 데이터셋에 유지
|
|
160
|
+
- **Artifacts-first**: 보고서뿐 아니라 모듈별 원본 결과를 구조화 저장
|
|
161
|
+
- **옵션형 Observability**: Phoenix/Langfuse/MLflow는 필요할 때만 활성화
|
|
162
|
+
- **CLI + Web UI**: 동일 run_id 기반으로 히스토리/비교/리포트 통합
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## 문서 허브
|
|
167
|
+
|
|
168
|
+
- 문서 인덱스: `docs/INDEX.md`
|
|
169
|
+
- 핸드북(교과서형): `docs/handbook/INDEX.md`
|
|
170
|
+
- 외부 요약본: `docs/handbook/EXTERNAL.md`
|
|
171
|
+
- 운영 가이드(로컬/도커/관측/런북): `docs/handbook/CHAPTERS/04_operations.md`
|
|
172
|
+
- 워크플로(실행/분석/비교/회귀): `docs/handbook/CHAPTERS/03_workflows.md`
|
|
173
|
+
- 품질/테스트/CI: `docs/handbook/CHAPTERS/06_quality_and_testing.md`
|
|
174
|
+
- 아키텍처: `docs/handbook/CHAPTERS/01_architecture.md`
|
|
175
|
+
- 오프라인/폐쇄망(Docker/모델 캐시): `docs/guides/OFFLINE_DOCKER.md`, `docs/guides/OFFLINE_MODELS.md`
|
|
176
|
+
|
|
177
|
+
참고(호환성): `docs/guides/USER_GUIDE.md`, `docs/guides/DEV_GUIDE.md` 등 일부 문서는 과거 링크 호환을 위한 deprecated 스텁이며, 최신 내용은 handbook을 따릅니다.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Web UI
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# API
|
|
185
|
+
uv run evalvault serve-api --reload
|
|
186
|
+
|
|
187
|
+
# Frontend
|
|
188
|
+
cd frontend
|
|
189
|
+
npm install
|
|
190
|
+
npm run dev
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
브라우저에서 `http://localhost:5173` 접속 후, Evaluation Studio에서 실행/히스토리/리포트를 확인합니다.
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## 오프라인/폐쇄망
|
|
198
|
+
|
|
199
|
+
- Docker 이미지 번들: `docs/guides/OFFLINE_DOCKER.md`
|
|
200
|
+
- NLP 모델 캐시 번들: `docs/guides/OFFLINE_MODELS.md`
|
|
201
|
+
|
|
202
|
+
LLM 모델은 폐쇄망 내부 인프라가 관리하며, EvalVault는 **분석용 NLP 모델 캐시**만 번들에 포함합니다.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## 기여
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
uv run ruff check src/ tests/
|
|
210
|
+
uv run ruff format src/ tests/
|
|
211
|
+
uv run pytest tests -v
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
- 기여 가이드: `CONTRIBUTING.md`
|
|
215
|
+
- 개발/테스트 루틴: `AGENTS.md`, `docs/handbook/CHAPTERS/06_quality_and_testing.md`
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
EvalVault is licensed under the [Apache 2.0](LICENSE.md) license.
|