evalvault-1.74.0-py3-none-any.whl → evalvault-1.76.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. evalvault/adapters/inbound/api/adapter.py +127 -80
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +303 -17
  4. evalvault/adapters/inbound/api/routers/config.py +3 -1
  5. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  6. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  7. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  8. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  9. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  10. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  13. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  14. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  15. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  16. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  17. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  18. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  19. evalvault/adapters/inbound/cli/commands/method.py +3 -4
  20. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  22. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  23. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  24. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  25. evalvault/adapters/inbound/cli/commands/run.py +188 -59
  26. evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
  27. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  28. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  29. evalvault/adapters/inbound/mcp/tools.py +11 -8
  30. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  31. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  32. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  33. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  35. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  36. evalvault/adapters/outbound/llm/factory.py +1 -1
  37. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  38. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  39. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  40. evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
  41. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  42. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  43. evalvault/adapters/outbound/storage/factory.py +53 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
  45. evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
  46. evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
  47. evalvault/config/instrumentation.py +8 -6
  48. evalvault/config/phoenix_support.py +5 -0
  49. evalvault/config/settings.py +71 -11
  50. evalvault/domain/services/domain_learning_hook.py +2 -1
  51. evalvault/domain/services/evaluator.py +2 -0
  52. evalvault/ports/inbound/web_port.py +3 -1
  53. evalvault/ports/outbound/storage_port.py +2 -0
  54. evalvault-1.76.0.dist-info/METADATA +221 -0
  55. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
  56. evalvault-1.74.0.dist-info/METADATA +0 -585
  57. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
  58. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
  59. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -3,7 +3,7 @@
3
3
  from pathlib import Path
4
4
  from typing import Any
5
5
 
6
- from pydantic import Field, PrivateAttr
6
+ from pydantic import AliasChoices, Field, PrivateAttr
7
7
  from pydantic_settings import BaseSettings, SettingsConfigDict
8
8
 
9
9
  from evalvault.config.secret_manager import (
@@ -55,6 +55,33 @@ def _parse_cors_origins(cors_origins: str | None) -> list[str]:
55
55
  return [origin.strip() for origin in cors_origins.split(",") if origin.strip()]
56
56
 
57
57
 
58
+ def resolve_tracker_providers(provider: str | None) -> list[str]:
59
+ if not provider:
60
+ return []
61
+ normalized = provider.strip().lower()
62
+ if normalized in {"none", "off", "disabled"}:
63
+ return ["none"]
64
+ aliases = {
65
+ "all": ["mlflow", "phoenix"],
66
+ "default": ["mlflow", "phoenix"],
67
+ }
68
+ if normalized in aliases:
69
+ return aliases[normalized]
70
+ separators = [",", "+", "/", "|"]
71
+ for sep in separators:
72
+ normalized = normalized.replace(sep, ",")
73
+ providers = [p.strip() for p in normalized.split(",") if p.strip()]
74
+ if not providers:
75
+ return []
76
+ if "none" in providers and len(providers) > 1:
77
+ raise ValueError("tracker_provider cannot combine 'none' with other providers")
78
+ deduped: list[str] = []
79
+ for entry in providers:
80
+ if entry not in deduped:
81
+ deduped.append(entry)
82
+ return deduped
83
+
84
+
58
85
  SECRET_REFERENCE_FIELDS = (
59
86
  "api_auth_tokens",
60
87
  "knowledge_read_tokens",
@@ -83,13 +110,14 @@ def _validate_production_settings(settings: "Settings") -> None:
83
110
  if settings.llm_provider == "openai" and not settings.openai_api_key:
84
111
  missing.append("OPENAI_API_KEY")
85
112
 
86
- if settings.tracker_provider == "langfuse":
113
+ providers = resolve_tracker_providers(settings.tracker_provider)
114
+ if "langfuse" in providers:
87
115
  if not settings.langfuse_public_key:
88
116
  missing.append("LANGFUSE_PUBLIC_KEY")
89
117
  if not settings.langfuse_secret_key:
90
118
  missing.append("LANGFUSE_SECRET_KEY")
91
119
 
92
- if settings.tracker_provider == "mlflow" and not settings.mlflow_tracking_uri:
120
+ if "mlflow" in providers and not settings.mlflow_tracking_uri:
93
121
  missing.append("MLFLOW_TRACKING_URI")
94
122
 
95
123
  if (
@@ -179,6 +207,10 @@ class Settings(BaseSettings):
179
207
  default="data/db/evalvault.db",
180
208
  description="SQLite database path for API/CLI storage.",
181
209
  )
210
+ db_backend: str = Field(
211
+ default="postgres",
212
+ description="Storage backend: 'postgres' or 'sqlite'.",
213
+ )
182
214
  evalvault_memory_db_path: str = Field(
183
215
  default="data/db/evalvault_memory.db",
184
216
  description="SQLite database path for Domain Memory storage.",
@@ -242,7 +274,7 @@ class Settings(BaseSettings):
242
274
  description="Ollama server URL",
243
275
  )
244
276
  ollama_model: str = Field(
245
- default="gpt-oss-safeguard:20b",
277
+ default="qwen3:14b",
246
278
  description="Ollama model name for evaluation",
247
279
  )
248
280
  ollama_embedding_model: str = Field(
@@ -351,6 +383,14 @@ class Settings(BaseSettings):
351
383
  default="http://localhost:6006/v1/traces",
352
384
  description="Phoenix OTLP endpoint for traces",
353
385
  )
386
+ phoenix_project_name: str = Field(
387
+ default="evalvault",
388
+ description="Phoenix project name for grouping traces",
389
+ )
390
+ phoenix_annotations_enabled: bool = Field(
391
+ default=True,
392
+ description="Enable automatic Phoenix span annotations",
393
+ )
354
394
  phoenix_api_token: str | None = Field(
355
395
  default=None,
356
396
  description="Phoenix API token for cloud deployments (optional)",
@@ -368,8 +408,8 @@ class Settings(BaseSettings):
368
408
 
369
409
  # Tracker Provider Selection
370
410
  tracker_provider: str = Field(
371
- default="langfuse",
372
- description="Tracker provider: 'langfuse', 'mlflow', or 'phoenix'",
411
+ default="mlflow+phoenix",
412
+ description="Tracker provider: 'langfuse', 'mlflow', 'phoenix', 'none', or combinations",
373
413
  )
374
414
 
375
415
  # Cluster map configuration
@@ -395,11 +435,31 @@ class Settings(BaseSettings):
395
435
  )
396
436
 
397
437
  # PostgreSQL Configuration (optional)
398
- postgres_host: str | None = Field(default=None, description="PostgreSQL server host")
399
- postgres_port: int = Field(default=5432, description="PostgreSQL server port")
400
- postgres_database: str = Field(default="evalvault", description="PostgreSQL database name")
401
- postgres_user: str | None = Field(default=None, description="PostgreSQL user")
402
- postgres_password: str | None = Field(default=None, description="PostgreSQL password")
438
+ postgres_host: str | None = Field(
439
+ default=None,
440
+ validation_alias=AliasChoices("POSTGRES_HOST", "EVALVAULT_DB_HOST"),
441
+ description="PostgreSQL server host",
442
+ )
443
+ postgres_port: int = Field(
444
+ default=5432,
445
+ validation_alias=AliasChoices("POSTGRES_PORT", "EVALVAULT_DB_PORT"),
446
+ description="PostgreSQL server port",
447
+ )
448
+ postgres_database: str = Field(
449
+ default="evalvault",
450
+ validation_alias=AliasChoices("POSTGRES_DATABASE", "EVALVAULT_DB_NAME"),
451
+ description="PostgreSQL database name",
452
+ )
453
+ postgres_user: str | None = Field(
454
+ default=None,
455
+ validation_alias=AliasChoices("POSTGRES_USER", "EVALVAULT_DB_USER"),
456
+ description="PostgreSQL user",
457
+ )
458
+ postgres_password: str | None = Field(
459
+ default=None,
460
+ validation_alias=AliasChoices("POSTGRES_PASSWORD", "EVALVAULT_DB_PASSWORD"),
461
+ description="PostgreSQL password",
462
+ )
403
463
  postgres_connection_string: str | None = Field(
404
464
  default=None, description="PostgreSQL connection string (overrides other postgres settings)"
405
465
  )
@@ -20,7 +20,8 @@ class DomainLearningHook:
20
20
  Formation dynamics를 구현합니다.
21
21
 
22
22
  사용 예시:
23
- memory_adapter = SQLiteDomainMemoryAdapter()
23
+ from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
24
+ memory_adapter = build_domain_memory_adapter()
24
25
  hook = DomainLearningHook(memory_adapter)
25
26
 
26
27
  # 평가 후 메모리 형성
@@ -1934,6 +1934,8 @@ class RagasEvaluator:
1934
1934
 
1935
1935
  def _calculate_cost(self, model_name: str, prompt_tokens: int, completion_tokens: int) -> float:
1936
1936
  """Calculate estimated cost in USD based on model pricing."""
1937
+ if "ollama" in model_name:
1938
+ return 0.0
1937
1939
  # Find matching model key (exact or substring match)
1938
1940
  price_key = "openai/gpt-4o" # Default fallback
1939
1941
  for key in self.MODEL_PRICING:
@@ -18,7 +18,7 @@ class EvalRequest:
18
18
 
19
19
  dataset_path: str
20
20
  metrics: list[str]
21
- model_name: str = "ollama/gpt-oss-safeguard:20b"
21
+ model_name: str = "ollama/qwen3:14b"
22
22
  evaluation_task: str = "qa"
23
23
  langfuse_enabled: bool = False
24
24
  thresholds: dict[str, float] = field(default_factory=dict)
@@ -121,12 +121,14 @@ class WebUIPort(Protocol):
121
121
  def list_runs(
122
122
  self,
123
123
  limit: int = 50,
124
+ offset: int = 0,
124
125
  filters: RunFilters | None = None,
125
126
  ) -> list[RunSummary]:
126
127
  """평가 목록 조회.
127
128
 
128
129
  Args:
129
130
  limit: 최대 조회 개수
131
+ offset: 조회 시작 위치
130
132
  filters: 필터 조건
131
133
 
132
134
  Returns:
@@ -83,6 +83,7 @@ class StoragePort(Protocol):
83
83
  def list_runs(
84
84
  self,
85
85
  limit: int = 100,
86
+ offset: int = 0,
86
87
  dataset_name: str | None = None,
87
88
  model_name: str | None = None,
88
89
  ) -> list[EvaluationRun]:
@@ -90,6 +91,7 @@ class StoragePort(Protocol):
90
91
 
91
92
  Args:
92
93
  limit: 최대 조회 개수
94
+ offset: 조회 시작 위치 (선택)
93
95
  dataset_name: 필터링할 데이터셋 이름 (선택)
94
96
  model_name: 필터링할 모델 이름 (선택)
95
97
 
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalvault
3
+ Version: 1.76.0
4
+ Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
+ Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
+ Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
7
+ Project-URL: Repository, https://github.com/ntts9990/EvalVault.git
8
+ Project-URL: Issues, https://github.com/ntts9990/EvalVault/issues
9
+ Project-URL: Changelog, https://github.com/ntts9990/EvalVault/releases
10
+ Author: EvalVault Contributors
11
+ Maintainer: EvalVault Contributors
12
+ License: Apache-2.0
13
+ License-File: LICENSE.md
14
+ Keywords: ai,evaluation,langfuse,llm,machine-learning,nlp,observability,opentelemetry,phoenix,rag,ragas,retrieval-augmented-generation,testing
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Quality Assurance
25
+ Classifier: Topic :: Software Development :: Testing
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.12
28
+ Requires-Dist: chainlit>=2.9.5
29
+ Requires-Dist: chardet
30
+ Requires-Dist: fastapi>=0.128.0
31
+ Requires-Dist: instructor
32
+ Requires-Dist: langchain-openai
33
+ Requires-Dist: langfuse
34
+ Requires-Dist: matplotlib<3.9.0,>=3.8.0
35
+ Requires-Dist: networkx
36
+ Requires-Dist: openai
37
+ Requires-Dist: openpyxl
38
+ Requires-Dist: pandas
39
+ Requires-Dist: pydantic
40
+ Requires-Dist: pydantic-settings
41
+ Requires-Dist: pypdf>=4.3.0
42
+ Requires-Dist: python-multipart
43
+ Requires-Dist: ragas==0.4.2
44
+ Requires-Dist: rich
45
+ Requires-Dist: truststore>=0.10.4
46
+ Requires-Dist: typer
47
+ Requires-Dist: uvicorn>=0.40.0
48
+ Requires-Dist: xlrd
49
+ Provides-Extra: analysis
50
+ Requires-Dist: scikit-learn>=1.3.0; extra == 'analysis'
51
+ Requires-Dist: xgboost>=2.0.0; extra == 'analysis'
52
+ Provides-Extra: anthropic
53
+ Requires-Dist: anthropic; extra == 'anthropic'
54
+ Requires-Dist: langchain-anthropic; extra == 'anthropic'
55
+ Provides-Extra: benchmark
56
+ Requires-Dist: datasets>=2.0.0; extra == 'benchmark'
57
+ Requires-Dist: lm-eval[api]>=0.4.0; extra == 'benchmark'
58
+ Provides-Extra: dashboard
59
+ Requires-Dist: matplotlib<3.9.0,>=3.8.0; extra == 'dashboard'
60
+ Provides-Extra: dev
61
+ Requires-Dist: anthropic; extra == 'dev'
62
+ Requires-Dist: arize-phoenix>=8.0.0; extra == 'dev'
63
+ Requires-Dist: datasets>=2.0.0; extra == 'dev'
64
+ Requires-Dist: faiss-cpu>=1.8.0; extra == 'dev'
65
+ Requires-Dist: ijson>=3.3.0; extra == 'dev'
66
+ Requires-Dist: kiwipiepy>=0.18.0; extra == 'dev'
67
+ Requires-Dist: langchain-anthropic; extra == 'dev'
68
+ Requires-Dist: lm-eval[api]>=0.4.0; extra == 'dev'
69
+ Requires-Dist: mkdocs-material>=9.5.0; extra == 'dev'
70
+ Requires-Dist: mkdocs>=1.5.0; extra == 'dev'
71
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'dev'
72
+ Requires-Dist: mlflow>=2.0.0; extra == 'dev'
73
+ Requires-Dist: openinference-instrumentation-langchain>=0.1.0; extra == 'dev'
74
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == 'dev'
75
+ Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == 'dev'
76
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'dev'
77
+ Requires-Dist: pgvector>=0.2.5; extra == 'dev'
78
+ Requires-Dist: psycopg[binary]>=3.0.0; extra == 'dev'
79
+ Requires-Dist: pydeps>=3.0.1; extra == 'dev'
80
+ Requires-Dist: pymdown-extensions>=10.7.0; extra == 'dev'
81
+ Requires-Dist: pytest; extra == 'dev'
82
+ Requires-Dist: pytest-asyncio; extra == 'dev'
83
+ Requires-Dist: pytest-cov; extra == 'dev'
84
+ Requires-Dist: pytest-html; extra == 'dev'
85
+ Requires-Dist: pytest-mock; extra == 'dev'
86
+ Requires-Dist: pytest-rerunfailures; extra == 'dev'
87
+ Requires-Dist: pytest-xdist; extra == 'dev'
88
+ Requires-Dist: python-multipart; extra == 'dev'
89
+ Requires-Dist: rank-bm25>=0.2.2; extra == 'dev'
90
+ Requires-Dist: ruff; extra == 'dev'
91
+ Requires-Dist: scikit-learn<1.4.0,>=1.3.0; extra == 'dev'
92
+ Requires-Dist: sentence-transformers>=5.2.0; extra == 'dev'
93
+ Requires-Dist: xgboost>=2.0.0; extra == 'dev'
94
+ Provides-Extra: docs
95
+ Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
96
+ Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
97
+ Requires-Dist: mkdocstrings[python]>=0.24.0; extra == 'docs'
98
+ Requires-Dist: pymdown-extensions>=10.7.0; extra == 'docs'
99
+ Provides-Extra: korean
100
+ Requires-Dist: kiwipiepy>=0.18.0; extra == 'korean'
101
+ Requires-Dist: rank-bm25>=0.2.2; extra == 'korean'
102
+ Requires-Dist: sentence-transformers>=5.2.0; extra == 'korean'
103
+ Provides-Extra: mlflow
104
+ Requires-Dist: mlflow>=2.0.0; extra == 'mlflow'
105
+ Provides-Extra: perf
106
+ Requires-Dist: faiss-cpu>=1.8.0; extra == 'perf'
107
+ Requires-Dist: ijson>=3.3.0; extra == 'perf'
108
+ Provides-Extra: phoenix
109
+ Requires-Dist: arize-phoenix>=8.0.0; extra == 'phoenix'
110
+ Requires-Dist: openinference-instrumentation-langchain>=0.1.0; extra == 'phoenix'
111
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == 'phoenix'
112
+ Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == 'phoenix'
113
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'phoenix'
114
+ Provides-Extra: postgres
115
+ Requires-Dist: pgvector>=0.2.5; extra == 'postgres'
116
+ Requires-Dist: psycopg[binary]>=3.0.0; extra == 'postgres'
117
+ Provides-Extra: secrets
118
+ Requires-Dist: boto3; extra == 'secrets'
119
+ Requires-Dist: google-cloud-secret-manager; extra == 'secrets'
120
+ Requires-Dist: hvac; extra == 'secrets'
121
+ Provides-Extra: timeseries
122
+ Requires-Dist: aeon>=1.3.0; extra == 'timeseries'
123
+ Requires-Dist: numba>=0.55.0; extra == 'timeseries'
124
+ Provides-Extra: web
125
+ Description-Content-Type: text/markdown
126
+
127
+ # EvalVault
128
+
129
+ RAG(Retrieval-Augmented Generation) 시스템을 대상으로 **평가(Eval) → 분석(Analysis) → 추적(Tracing) → 개선 루프**를 하나의 워크플로로 묶는 CLI + Web UI 플랫폼입니다.
130
+
131
+ [![PyPI](https://img.shields.io/pypi/v/evalvault.svg)](https://pypi.org/project/evalvault/)
132
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
133
+ [![CI](https://github.com/ntts9990/EvalVault/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/ntts9990/EvalVault/actions/workflows/ci.yml)
134
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE.md)
135
+
136
+ English version? See `README.en.md`.
137
+
138
+ ---
139
+
140
+ ## Quickstart (CLI)
141
+
142
+ ```bash
143
+ uv sync --extra dev
144
+ cp .env.example .env
145
+
146
+ uv run evalvault run --mode simple tests/fixtures/e2e/insurance_qa_korean.json \
147
+ --metrics faithfulness,answer_relevancy \
148
+ --profile dev \
149
+ --auto-analyze
150
+ ```
151
+
152
+ Tip: 기본 저장소는 Postgres+pgvector입니다. SQLite를 쓰려면 `--db` 또는 `DB_BACKEND=sqlite` + `EVALVAULT_DB_PATH`를 지정하세요.
153
+
154
+ ---
155
+
156
+ ## 핵심 기능
157
+
158
+ - **End-to-End 평가 루프**: Eval → Analysis → Tracing → Improvement를 한 흐름으로 실행
159
+ - **Dataset 중심 운영**: 합격 기준(threshold)을 데이터셋에 유지
160
+ - **Artifacts-first**: 보고서뿐 아니라 모듈별 원본 결과를 구조화 저장
161
+ - **옵션형 Observability**: Phoenix/Langfuse/MLflow는 필요할 때만 활성화
162
+ - **CLI + Web UI**: 동일 run_id 기반으로 히스토리/비교/리포트 통합
163
+
164
+ ---
165
+
166
+ ## 문서 허브
167
+
168
+ - 문서 인덱스: `docs/INDEX.md`
169
+ - 핸드북(교과서형): `docs/handbook/INDEX.md`
170
+ - 외부 요약본: `docs/handbook/EXTERNAL.md`
171
+ - 운영 가이드(로컬/도커/관측/런북): `docs/handbook/CHAPTERS/04_operations.md`
172
+ - 워크플로(실행/분석/비교/회귀): `docs/handbook/CHAPTERS/03_workflows.md`
173
+ - 품질/테스트/CI: `docs/handbook/CHAPTERS/06_quality_and_testing.md`
174
+ - 아키텍처: `docs/handbook/CHAPTERS/01_architecture.md`
175
+ - 오프라인/폐쇄망(Docker/모델 캐시): `docs/guides/OFFLINE_DOCKER.md`, `docs/guides/OFFLINE_MODELS.md`
176
+
177
+ 참고(호환성): `docs/guides/USER_GUIDE.md`, `docs/guides/DEV_GUIDE.md` 등 일부 문서는 과거 링크 호환을 위한 deprecated 스텁이며, 최신 내용은 handbook을 따릅니다.
178
+
179
+ ---
180
+
181
+ ## Web UI
182
+
183
+ ```bash
184
+ # API
185
+ uv run evalvault serve-api --reload
186
+
187
+ # Frontend
188
+ cd frontend
189
+ npm install
190
+ npm run dev
191
+ ```
192
+
193
+ 브라우저에서 `http://localhost:5173` 접속 후, Evaluation Studio에서 실행/히스토리/리포트를 확인합니다.
194
+
195
+ ---
196
+
197
+ ## 오프라인/폐쇄망
198
+
199
+ - Docker 이미지 번들: `docs/guides/OFFLINE_DOCKER.md`
200
+ - NLP 모델 캐시 번들: `docs/guides/OFFLINE_MODELS.md`
201
+
202
+ LLM 모델은 폐쇄망 내부 인프라가 관리하며, EvalVault는 **분석용 NLP 모델 캐시**만 번들에 포함합니다.
203
+
204
+ ---
205
+
206
+ ## 기여
207
+
208
+ ```bash
209
+ uv run ruff check src/ tests/
210
+ uv run ruff format src/ tests/
211
+ uv run pytest tests -v
212
+ ```
213
+
214
+ - 기여 가이드: `CONTRIBUTING.md`
215
+ - 개발/테스트 루틴: `AGENTS.md`, `docs/handbook/CHAPTERS/06_quality_and_testing.md`
216
+
217
+ ---
218
+
219
+ ## License
220
+
221
+ EvalVault is licensed under the [Apache 2.0](LICENSE.md) license.